Use LxmertForPreTraining instead of LxmertModel:
###Colab commands
#pip install transformers
#!git clone https://github.com/huggingface/transformers
#cd transformers
#cd examples/research_projects/lxmert
#pip install wget
from IPython.display import clear_output, Image, display
import PIL.Image
import io
import json
import torch
import numpy as np
from processing_image import Preprocess
from visualizing_image import SingleImageViz
from modeling_frcnn import GeneralizedRCNN
from utils import Config
import utils
import wget
import pickle
import os
import cv2
from copy import deepcopy
# Query CUDA availability; the result is not stored anywhere.
torch.cuda.is_available()

# Image that will be fed through the detector.
URL = "https://github.com/jacobgil/vit-explain/raw/main/examples/both.png"

# Load the FRCNN configuration, the detector itself and the matching
# image preprocessor, all from the same pretrained checkpoint.
frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
frcnn = GeneralizedRCNN.from_pretrained(
    "unc-nlp/frcnn-vg-finetuned",
    config=frcnn_cfg,
)
image_preprocess = Preprocess(frcnn_cfg)

# Preprocess the image, then run the detector on it.
images, sizes, scales_yx = image_preprocess(URL)
output_dict = frcnn(
    images,
    sizes,
    scales_yx=scales_yx,
    padding="max_detections",
    max_detections=frcnn_cfg.max_detections,
    return_tensors="pt",
)

# Pull out the per-region features and their boxes.
# It is very important to use the *normalized* boxes here.
features = output_dict.get("roi_features")
normalized_boxes = output_dict.get("normalized_boxes")
from transformers import LxmertTokenizer, LxmertForPreTraining
import torch
# Load the LXMERT tokenizer and the pre-training model from the same checkpoint.
tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
model = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased")

# Sentence with a single masked position, placed via the tokenizer's own mask token.
text_sentence = "".join(
    (
        "dog and cat are in the room and ",
        tokenizer.mask_token,
        " is laying on the ground",
    )
)
inputs = tokenizer(
    text_sentence,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors="pt",
)

# Visual inputs come from the detector outputs computed earlier in the script.
visual_feats = features
visual_pos = normalized_boxes
# All-ones mask whose shape is the feature tensor's shape without its last axis.
visual_attention_mask = torch.ones(visual_feats.shape[:-1], dtype=torch.long)

# Add the visual inputs to the tokenized text inputs.
inputs["visual_feats"] = visual_feats
inputs["visual_pos"] = visual_pos
inputs["visual_attention_mask"] = visual_attention_mask

# Forward pass, asking the model to also return its attention maps.
model_outputs = model(**inputs, output_attentions=True)
model_outputs.keys()
Output:
odict_keys(['prediction_logits', 'cross_relationship_score', 'question_answering_score', 'language_attentions', 'vision_attentions', 'cross_encoder_attentions'])
P.S.: You can control the pretraining task heads via the configuration fields task_matched, task_mask_lm, task_obj_predict, and task_qa.
I assume you are only interested in mask_lm, following your comment. That means you should initialize your model as follows:
from transformers import LxmertConfig, LxmertForPreTraining
# Start from the checkpoint's stock configuration.
config = LxmertConfig.from_pretrained("unc-nlp/lxmert-base-uncased")

# Switch off the matching, object-prediction and QA task flags;
# task_mask_lm is left at whatever the checkpoint configuration specifies.
for _task_flag in ("task_matched", "task_obj_predict", "task_qa"):
    setattr(config, _task_flag, False)

# Build the pre-training model using the adjusted configuration.
model = LxmertForPreTraining.from_pretrained(
    "unc-nlp/lxmert-base-uncased",
    config=config,
)