Background / what I want to achieve
I am using VisualBERT for my graduation research. Even when I add the following `.to(device)` calls for `frcnn` ahead of the model, nothing actually moves to the GPU.
Apparently the configuration file has to be changed instead, but I don't understand how, and I'm stuck.
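To confirm that the model and inputs really stay on the CPU, device placement can be checked with standard PyTorch calls (the variable names here are the ones from the script below):

```python
import torch

# Does this environment even see a GPU? If False, .to("cuda:0") can't work.
print(torch.cuda.is_available())

# Where do the FRCNN weights currently live? "cpu" means the .to() call
# never took effect (or was undone somewhere).
print(next(frcnn.parameters()).device)

# Where do the preprocessed inputs live?
print(images.device, sizes.device, scales_yx.device)
```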
What I changed
```python
device = "cuda:0"
frcnn = frcnn.to(device)
images = images.to(device)
sizes = sizes.to(device)
frcnn = frcnn.to(device)

output_dict = frcnn(
    images,
    sizes,
    scales_yx=scales_yx,
    padding="max_detections",
    max_detections=frcnn_cfg.max_detections,
    return_tensors="pt",
)
```
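A sketch of what "changing the configuration file" might mean here: the demo's `GeneralizedRCNN` and `Preprocess` appear to take their device from the config object rather than from `.to()`, which would explain why the calls above have no effect. This is an assumption: I'm guessing the demo's `Config` exposes the detectron-style `MODEL.DEVICE` key in lowercase, the same way `frcnn_cfg.max_detections` is accessed elsewhere in the script.

```python
from utils import Config
from modeling_frcnn import GeneralizedRCNN
from processing_image import Preprocess

frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")

# Assumption: the config carries the device the model and preprocessor use,
# so setting it *before* instantiation places everything on the GPU.
frcnn_cfg.model.device = "cuda:0"

frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)
image_preprocess = Preprocess(frcnn_cfg)  # inputs should then come back as CUDA tensors
```

If that attribute path doesn't exist on the `Config` object, inspecting it (e.g. `vars(frcnn_cfg)`) should reveal where the device key actually lives.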
The script I run
```python
from IPython.display import Image, display
import PIL.Image
import io
import torch
import numpy as np
from processing_image import Preprocess
from visualizing_image import SingleImageViz
from modeling_frcnn import GeneralizedRCNN
from utils import Config
import utils
from transformers import VisualBertForQuestionAnswering, BertTokenizerFast


img = 'dataset/training/image/abstract_v002_train2015_000000000000.png'


OBJ_URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/objects_vocab.txt"
ATTR_URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/attributes_vocab.txt"
VQA_URL = "https://dl.fbaipublicfiles.com/pythia/data/answers_vqa.txt"


# for visualizing output
def showarray(a, fmt="png"):
    a = np.uint8(np.clip(a, 0, 255))
    f = io.BytesIO()
    PIL.Image.fromarray(a).save(f, fmt)
    display(Image(data=f.getvalue()))


# load object, attribute, and answer labels
objids = utils.get_data(OBJ_URL)
attrids = utils.get_data(ATTR_URL)
vqa_answers = utils.get_data(VQA_URL)

# load models and model components
frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")

frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)

image_preprocess = Preprocess(frcnn_cfg)

bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
visualbert_vqa = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")

# visualbert_vqa = visualbert_vqa.to("cuda:0")

'''
from tqdm import tqdm
for i in tqdm(range(60000), total=60000):
'''
# image viz
frcnn_visualizer = SingleImageViz(img, id2obj=objids, id2attr=attrids)
# run frcnn
images, sizes, scales_yx = image_preprocess(img)

'''
device = "cuda:0"
frcnn = frcnn.to(device)
images = images.to(device)
sizes = sizes.to(device)
frcnn = frcnn.to(device)
'''
output_dict = frcnn(
    images,
    sizes,
    scales_yx=scales_yx,
    padding="max_detections",
    max_detections=frcnn_cfg.max_detections,
    return_tensors="pt",
)

# add boxes and labels to the image
frcnn_visualizer.draw_boxes(
    output_dict.get("boxes"),
    output_dict.pop("obj_ids"),
    output_dict.pop("obj_probs"),
    output_dict.pop("attr_ids"),
    output_dict.pop("attr_probs"),
)
showarray(frcnn_visualizer._get_buffer())

test_questions_for_url2 = [
    "Who looks happier?",
    "What is near the disk?",
    "What is the color of the table?",
    "What is the color of the cat?",
    "What is the shape of the monitor?",
]

# Very important that the boxes are normalized
# normalized_boxes = output_dict.get("normalized_boxes")
features = output_dict.get("roi_features")

for test_question in test_questions_for_url2:
    test_question = [test_question]

    inputs = bert_tokenizer(
        test_question,
        padding="max_length",
        max_length=20,
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

    output_vqa = visualbert_vqa(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        visual_embeds=features,
        visual_attention_mask=torch.ones(features.shape[:-1]),
        token_type_ids=inputs.token_type_ids,
        output_attentions=False,
    )
    # get prediction
    pred_vqa = output_vqa["logits"].argmax(-1)
    print("Question:", test_question)
    print("prediction from VisualBert VQA:", vqa_answers[pred_vqa])
```