Inference on Visual Commonsense Reasoning (VCR)

March 9, 2023 · View on GitHub

Dataset Preparation for VCR

Download the VCR dataset, including both the Annotations and the Images, from the official VCR site, then unzip the downloaded files.

Inference with VCR pipeline

Here are two examples of using the fine-tuned VLEForVCR Q2A and QA2R models to run inference on a single VCR sample. The sample's image is placed in vcr_sample_images/ as shown below, and the annotation meta_data is taken from the VCR validation set.

vcr_sample_images
└──lsmdc_1054_Harry_Potter_and_the_prisoner_of_azkaban
    ├──1054_Harry_Potter_and_the_prisoner_of_azkaban_00.01.46.736-00.01.50.168@0.jpg
    └──1054_Harry_Potter_and_the_prisoner_of_azkaban_00.01.46.736-00.01.50.168@0.json

VCR Q2A

from models.VLE import VLEForVCRQ2A, VLEProcessor, VLEForVCRQ2APipeline

# Load the fine-tuned Q2A checkpoint and its processor, then wrap both in the
# Q2A pipeline (running on CPU here).
model_name = 'hfl/vle-large-for-vcr-q2a'
model = VLEForVCRQ2A.from_pretrained(model_name)
vle_processor = VLEProcessor.from_pretrained(model_name)
vcr_q2a_pipeline = VLEForVCRQ2APipeline(
    model=model,
    device='cpu',
    vle_processor=vle_processor,
)

# Directory that the relative "img_fn" / "metadata_fn" paths below refer to
# (presumably resolved against this root by the pipeline -- confirm).
vcr_image_root = 'examples/VCR/vcr_sample_images'

# One annotation record for the sample image, laid out one key per line.
meta_data = {
    "movie": "1054_Harry_Potter_and_the_prisoner_of_azkaban",
    "objects": ["person", "person", "person", "car", "cellphone", "clock"],
    "interesting_scores": [-1, 0],
    "answer_likelihood": "possible",
    "img_fn": "lsmdc_1054_Harry_Potter_and_the_prisoner_of_azkaban/1054_Harry_Potter_and_the_prisoner_of_azkaban_00.01.46.736-00.01.50.168@0.jpg",
    "metadata_fn": "lsmdc_1054_Harry_Potter_and_the_prisoner_of_azkaban/1054_Harry_Potter_and_the_prisoner_of_azkaban_00.01.46.736-00.01.50.168@0.json",
    "answer_orig": "No, 1 is a visitor.",
    "question_orig": "Does 1 live in this house?",
    "rationale_orig": "1 is wearing outerwear, holding an umbrella, and there is a car outside.",
    "question": ["Does", [0], "live", "in", "this", "house", "?"],
    "answer_match_iter": [2, 3, 0, 1],
    "answer_sources": [10104, 5332, 1, 16646],
    "answer_choices": [
        ["No", ",", [0], "lives", "nowhere", "close", "."],
        ["Yes", ",", [0], "works", "there", "."],
        ["No", ",", [0], "is", "a", "visitor", "."],
        ["No", [1], "does", "not", "belong", "here", "."],
    ],
    "answer_label": 2,
    "rationale_choices": [
        [[0], "is", "nicely", "dressed", "with", "a", "tie", ".", "people", "dress", "up", "when", "they", "visit", "someone", "else", "."],
        [[2], "sits", "comfortably", "in", "a", "chair", ",", "reading", "papers", ",", "while", "it", "seems", [0], "has", "just", "arrived", "and", "is", "settling", "in", "."],
        [[1], "is", "wearing", "a", "coat", "and", "muff", "and", "is", "sitting", "as", "if", "a", "visitor", "."],
        [[0], "is", "wearing", "outerwear", ",", "holding", "an", "umbrella", ",", "and", "there", "is", "a", "car", "outside", "."],
    ],
    "rationale_sources": [26162, 12999, 6661, 1],
    "rationale_match_iter": [1, 3, 2, 0],
    "rationale_label": 3,
    "img_id": "val-0",
    "question_number": 1,
    "annot_id": "val-1",
    "match_fold": "val-0",
    "match_index": 1,
}

# Run the pipeline on the single sample and take the "pred" field of the
# first output entry.
vcr_outputs = vcr_q2a_pipeline(vcr_image_root=vcr_image_root, meta_inputs=meta_data)
pred = vcr_outputs[0]["pred"]

# Show the question, the numbered answer candidates (A1..A4), and then the
# gold label next to the prediction. Both indices are shifted by one so they
# line up with the 1-based candidate numbering.
print(f'Q: {meta_data["question"]}')
for number, choice in enumerate(meta_data["answer_choices"], start=1):
    print(f'A{number}: {choice}')
print(f'Label: {meta_data["answer_label"] + 1}')
print(f'predict: {pred[0] + 1}')

VCR QA2R

from models.VLE import VLEForVCRQA2R, VLEProcessor, VLEForVCRQA2RPipeline

# Load the fine-tuned QA2R checkpoint and its processor, then wrap both in the
# QA2R pipeline (running on CPU here).
model_name = 'hfl/vle-large-for-vcr-qa2r'
model = VLEForVCRQA2R.from_pretrained(model_name)
vle_processor = VLEProcessor.from_pretrained(model_name)
vcr_qa2r_pipeline = VLEForVCRQA2RPipeline(
    model=model,
    device='cpu',
    vle_processor=vle_processor,
)

# Directory that the relative "img_fn" / "metadata_fn" paths below refer to
# (presumably resolved against this root by the pipeline -- confirm).
vcr_image_root = 'examples/VCR/vcr_sample_images'

# One annotation record for the sample image, laid out one key per line.
meta_data = {
    "movie": "1054_Harry_Potter_and_the_prisoner_of_azkaban",
    "objects": ["person", "person", "person", "car", "cellphone", "clock"],
    "interesting_scores": [-1, 0],
    "answer_likelihood": "possible",
    "img_fn": "lsmdc_1054_Harry_Potter_and_the_prisoner_of_azkaban/1054_Harry_Potter_and_the_prisoner_of_azkaban_00.01.46.736-00.01.50.168@0.jpg",
    "metadata_fn": "lsmdc_1054_Harry_Potter_and_the_prisoner_of_azkaban/1054_Harry_Potter_and_the_prisoner_of_azkaban_00.01.46.736-00.01.50.168@0.json",
    "answer_orig": "No, 1 is a visitor.",
    "question_orig": "Does 1 live in this house?",
    "rationale_orig": "1 is wearing outerwear, holding an umbrella, and there is a car outside.",
    "question": ["Does", [0], "live", "in", "this", "house", "?"],
    "answer_match_iter": [2, 3, 0, 1],
    "answer_sources": [10104, 5332, 1, 16646],
    "answer_choices": [
        ["No", ",", [0], "lives", "nowhere", "close", "."],
        ["Yes", ",", [0], "works", "there", "."],
        ["No", ",", [0], "is", "a", "visitor", "."],
        ["No", [1], "does", "not", "belong", "here", "."],
    ],
    "answer_label": 2,
    "rationale_choices": [
        [[0], "is", "nicely", "dressed", "with", "a", "tie", ".", "people", "dress", "up", "when", "they", "visit", "someone", "else", "."],
        [[2], "sits", "comfortably", "in", "a", "chair", ",", "reading", "papers", ",", "while", "it", "seems", [0], "has", "just", "arrived", "and", "is", "settling", "in", "."],
        [[1], "is", "wearing", "a", "coat", "and", "muff", "and", "is", "sitting", "as", "if", "a", "visitor", "."],
        [[0], "is", "wearing", "outerwear", ",", "holding", "an", "umbrella", ",", "and", "there", "is", "a", "car", "outside", "."],
    ],
    "rationale_sources": [26162, 12999, 6661, 1],
    "rationale_match_iter": [1, 3, 2, 0],
    "rationale_label": 3,
    "img_id": "val-0",
    "question_number": 1,
    "annot_id": "val-1",
    "match_fold": "val-0",
    "match_index": 1,
}

# Run the pipeline on the single sample and take the "pred" field of the
# first output entry.
vcr_outputs = vcr_qa2r_pipeline(vcr_image_root=vcr_image_root, meta_inputs=meta_data)
pred = vcr_outputs[0]["pred"]

# Show the question, the answer selected by "answer_label", the numbered
# rationale candidates (R1..R4), and then the gold rationale label next to the
# prediction. Both indices are shifted by one so they line up with the
# 1-based candidate numbering.
print(f'Q: {meta_data["question"]}')
print(f'A: {meta_data["answer_choices"][meta_data["answer_label"]]}')
for number, rationale in enumerate(meta_data["rationale_choices"], start=1):
    print(f'R{number}: {rationale}')
print(f'Label: {meta_data["rationale_label"] + 1}')
print(f'predict: {pred[0] + 1}')