ONNXモデルに変換したYOLOv5でカメラを入力して推論させたい

前提・実現したいこと

Python
1OpenCVのcv2.VideoCapture(0)

を用いてONNXモデルに変換したYOLOv5にカメラ映像を入力して推論させたいです．
画像ファイルを指定するとその画像の物体が何か，座標はどこかというのは出力されますが，カメラ入力はできません．
(文字数制限で投稿できないので一部の関数は削除しています．)

発生している問題・エラーメッセージ

terminal
1python3 onnx_yolo5.py
2Traceback (most recent call last):
3 onnx_yolo5.py", line 294, in main
4    image_data = preprocess(img)
5 line 48, in preprocess
6    boxed_image = letterbox_image(img, tuple(reversed(model_image_size)))
7, line 35, in letterbox_image
8    iw, ih = image.size
9TypeError: cannot unpack non-iterable int object

該当のソースコード

Python
1import PIL
2import numpy as np
3import time
4
5from PIL import Image, ImageDraw, ImageFont
6import cv2
7
8import sys
9
10import onnxruntime
11
12
13models_path = './models/yolov5s.onnx'
14image_file_path = './images/neko.jpg'
15
16
17class_index = []
18classes_path = './models/coco_classes.txt'
19with open(classes_path) as f:
20    class_index = f.readlines()
21
22
23def letterbox_image(image, size):
24    iw, ih = image.size
25    w, h = size
26    scale = min(w / iw, h / ih)
27    nw = int(iw * scale)
28    nh = int(ih * scale)
29    image = image.resize((nw, nh), Image.BICUBIC)
30    new_image = Image.new('RGB', size, (128,128,128))
31    new_image.paste(image, ((w - nw) // 2, (h - nh) // 2))
32    return new_image
33
34def preprocess(img):
35    model_image_size = (640, 640)
36    boxed_image = letterbox_image(img, tuple(reversed(model_image_size)))
37    image_data = np.array(boxed_image, dtype='float32')
38    image_data /= 255.
39    image_data = np.transpose(image_data, [2, 0, 1])
40    image_data = np.expand_dims(image_data, 0)
41    return image_data
42
43def bbox_iou(box1, box2):
44    xs1 = max(box1[0], box2[0])
45    ys1 = max(box1[1], box2[1])
46    xs2 = min(box1[0] + box1[2], box2[0] + box2[2])
47    ys2 = min(box1[1] + box1[3], box2[1] + box2[3])
48
49    intersections = max(ys2 - ys1, 0) * max(xs2 - xs1, 0)
50    unions = (box1[2] * box1[3]) + (box2[2] * box2[3]) - intersections
51    return intersections / unions
52
53def box_iou(box1, box2):
54    
55    def box_area(box):
56        return (box[2] - box[0]) * (box[3] - box[1])
57
58    area1 = box_area(box1.T)
59    area2 = box_area(box2.T)
60
61
62    inter = (np.min(box1[:, None, 2:], box2[:, 2:]) - np.max(box1[:, None, :2], box2[:, :2])).clip(0).prod(2)
63    return inter / (area1[:, None] + area2 - inter)  # iou = inter / (area1 + area2 - inter)
64
65def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False,
66                        labels=(), max_det=300):
67    
68    nc = prediction.shape[2] - 5  # number of classes
69    xc = prediction[..., 4] > conf_thres  # candidates
70
71    
72    min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
73    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
74    time_limit = 10.0  # seconds to quit after
75    redundant = True  # require redundant detections
76    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
77    merge = False  # use merge-NMS
78
79    t = time.time()
80    output = [np.zeros((0, 6))] * prediction.shape[0]
81    for xi, x in enumerate(prediction):  # image index, image inference
82        x = x[xc[xi]]  # confidence
83
84
85        if labels and len(labels[xi]):
86            l = labels[xi]
87            v = np.zeros(len(l), nc + 5)
88            v[:, :4] = l[:, 1:5]  # box
89            v[:, 4] = 1.0  # conf
90            v[range(len(l)), l[:, 0].long() + 5] = 1.0  # cls
91            x = np.concatenate((x, v), 0)
92
93        if not x.shape[0]:
94            continue
95
96       
97        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
98        
99        box = xywh2xyxy(x[:, :4])
100
101
102        if multi_label:
103            i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
104            x = np.concatenate((box[i], x[i, j + 5, None], j[:, None].float()), 1)
105        else:  # best class only
106            conf = np.max(x[:, 5:], 1, keepdims=True)
107            j = np.argmax(x[:, 5:], axis=1)
108            j = j.reshape((j.shape[0],1))
109            x = np.concatenate((box, conf, j), 1)[conf.reshape(-1) > conf_thres]
110
111        if classes is not None:
112            x = x[(x[:, 5:6] == np.array(classes)).any(1)]
113
114        n = x.shape[0]  # number of boxes
115        if not n:  # no boxes
116            continue
117        elif n > max_nms:  # excess boxes
118            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence
119
120        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
121        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
122        i = nms(boxes, scores, iou_thres)  # NMS　### ここが問題, indexを返すようにしたい
123        if i.shape[0] > max_det:  # limit detections
124            i = i[:max_det]
125        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
126            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
127            weights = iou * scores[None]  # box weights
128            x[i, :4] = np.dot(weights, x[:, :4]).float() / weights.sum(1, keepdims=True)  # merged boxes
129            if redundant:
130                i = i[iou.sum(1) > 1]  # require redundancy
131
132        output[xi] = x[i]
133        if (time.time() - t) > time_limit:
134            print(f'WARNING: NMS time limit {time_limit}s exceeded')
135            break
136
137    return output
138
139def draw_bbox(det, img):
140
141    font = ImageFont.truetype(font='font/FiraMono-Medium.otf',
142                size=np.floor(3e-2 * img.size[1] + 0.5).astype('int32'))
143    thickness = (img.size[0] + img.size[1]) // 300
144
145    print(type(img))
146
147    for *xyxy, conf, cls in reversed(det):
148        pre_class , score, box = class_index[int(cls)] , conf, get_xywh(xyxy, np.array(img)[:, :, ::-1])
149
150        label = '{} {:.2f}'.format(pre_class, score)
151        draw = ImageDraw.Draw(img)
152        label_size = draw.textsize(label, font)
153
154        x, y, w, h = box
155        print(label, (x, y), (w, h))
156
157        if y - label_size[1] >= 0:
158            text_origin = np.array([x, y - label_size[1]])
159        else:
160            text_origin = np.array([x, y + 1])
161
162        for i in range(thickness):
163            draw.rectangle([x + i, y + i, (x+w) - i, (y+h) - i],outline=(255,0,0))
164        draw.rectangle([tuple(text_origin), tuple(text_origin + label_size)], fill=(0,255,0))
165        draw.text(text_origin, label, fill=(0, 0, 0), font=font)
166        del draw
167
168
169def main():
170    cap = cv2.VideoCapture(0)
171
172    while True:
173        ret, img = cap.read()
174
175        image_data = preprocess(img)
176
177        session = onnxruntime.InferenceSession(models_path)
178
179        for session_input in session.get_inputs():
180            input_name = session_input.name
181
182        pred = np.array(session.run([session.get_outputs()[0].name], {input_name:image_data}))
183        pred = non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45, max_det=1000)
184
185        # Process predictions
186        for i, det in enumerate(pred):  # detections per image
187            im0 = np.copy(img)[:, :, ::-1] # cv2型に変換
188            if len(det):
189                # Rescale boxes from img_size to im0 size
190                det[:, :4] = scale_coords(image_data.shape[2:], det[:, :4], im0.shape).round()
191                draw_bbox(det, img)
192
193        cv2.imshow('inference',img)
194        if cv2.waitKey(1) & 0xFF == ord('q'):
195            break
196
197    cap.release()
198    cv2.destroyAllWindows()
199
200if __name__ == '__main__':
201    main()