Prerequisites / what I want to achieve
I want to run inference with a YOLOv5 model converted to ONNX by feeding it camera frames captured in Python with OpenCV's cv2.VideoCapture(0).
When I specify an image file instead, the script correctly outputs what each object in the image is and where its coordinates are, but camera input does not work.
(Some functions have been removed because the full script exceeds the posting character limit.)
Problem / error message
terminal
python3 onnx_yolo5.py
Traceback (most recent call last):
  File "onnx_yolo5.py", line 294, in main
    image_data = preprocess(img)
  File "onnx_yolo5.py", line 48, in preprocess
    boxed_image = letterbox_image(img, tuple(reversed(model_image_size)))
  File "onnx_yolo5.py", line 35, in letterbox_image
    iw, ih = image.size
TypeError: cannot unpack non-iterable int object
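The traceback appears consistent with preprocess() receiving the raw OpenCV frame: cap.read() returns a numpy.ndarray, whose .size attribute is a single int (the total number of elements), whereas PIL's Image.size is a (width, height) tuple, so iw, ih = image.size cannot unpack it. A minimal sketch illustrating the difference:
Python
import numpy as np
from PIL import Image

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # shape of a frame from cap.read()
print(frame.size)                    # 921600 -- a single int, so "iw, ih = frame.size" fails
print(Image.fromarray(frame).size)   # (640, 480) -- a (width, height) tuple, which unpacks fine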
Relevant source code
Python
import time
import sys

import cv2
import numpy as np
import onnxruntime
import PIL
from PIL import Image, ImageDraw, ImageFont

models_path = './models/yolov5s.onnx'
image_file_path = './images/neko.jpg'

class_index = []
classes_path = './models/coco_classes.txt'
with open(classes_path) as f:
    class_index = f.readlines()


def letterbox_image(image, size):
    """Resize a PIL image with unchanged aspect ratio, padding with gray."""
    iw, ih = image.size
    w, h = size
    scale = min(w / iw, h / ih)
    nw = int(iw * scale)
    nh = int(ih * scale)
    image = image.resize((nw, nh), Image.BICUBIC)
    new_image = Image.new('RGB', size, (128, 128, 128))
    new_image.paste(image, ((w - nw) // 2, (h - nh) // 2))
    return new_image


def preprocess(img):
    model_image_size = (640, 640)
    boxed_image = letterbox_image(img, tuple(reversed(model_image_size)))
    image_data = np.array(boxed_image, dtype='float32')
    image_data /= 255.
    image_data = np.transpose(image_data, [2, 0, 1])  # HWC -> CHW
    image_data = np.expand_dims(image_data, 0)        # add batch dimension
    return image_data


def bbox_iou(box1, box2):
    xs1 = max(box1[0], box2[0])
    ys1 = max(box1[1], box2[1])
    xs2 = min(box1[0] + box1[2], box2[0] + box2[2])
    ys2 = min(box1[1] + box1[3], box2[1] + box2[3])
    intersections = max(ys2 - ys1, 0) * max(xs2 - xs1, 0)
    unions = (box1[2] * box1[3]) + (box2[2] * box2[3]) - intersections
    return intersections / unions


def box_iou(box1, box2):
    def box_area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    area1 = box_area(box1.T)
    area2 = box_area(box2.T)
    # element-wise min/max needs np.minimum/np.maximum, not np.min/np.max
    inter = (np.minimum(box1[:, None, 2:], box2[:, 2:]) -
             np.maximum(box1[:, None, :2], box2[:, :2])).clip(0).prod(2)
    return inter / (area1[:, None] + area2 - inter)  # iou = inter / (area1 + area2 - inter)


def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None,
                        agnostic=False, multi_label=False, labels=(), max_det=300):
    nc = prediction.shape[2] - 5  # number of classes
    xc = prediction[..., 4] > conf_thres  # candidates

    min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 10.0  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    t = time.time()
    output = [np.zeros((0, 6))] * prediction.shape[0]
    for xi, x in enumerate(prediction):  # image index, image inference
        x = x[xc[xi]]  # confidence

        if labels and len(labels[xi]):
            l = labels[xi]
            v = np.zeros((len(l), nc + 5))
            v[:, :4] = l[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[range(len(l)), l[:, 0].astype(int) + 5] = 1.0  # cls
            x = np.concatenate((x, v), 0)

        if not x.shape[0]:
            continue

        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
        box = xywh2xyxy(x[:, :4])

        if multi_label:
            i, j = (x[:, 5:] > conf_thres).nonzero()
            x = np.concatenate((box[i], x[i, j + 5, None], j[:, None].astype(np.float32)), 1)
        else:  # best class only
            conf = np.max(x[:, 5:], 1, keepdims=True)
            j = np.argmax(x[:, 5:], axis=1)
            j = j.reshape((j.shape[0], 1))
            x = np.concatenate((box, conf, j), 1)[conf.reshape(-1) > conf_thres]

        if classes is not None:
            x = x[(x[:, 5:6] == np.array(classes)).any(1)]

        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        elif n > max_nms:  # excess boxes
            x = x[x[:, 4].argsort()[::-1][:max_nms]]  # sort by confidence, descending

        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = nms(boxes, scores, iou_thres)  # NMS  ### This is the problem; I want it to return indices
        if i.shape[0] > max_det:  # limit detections
            i = i[:max_det]
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = np.dot(weights, x[:, :4]) / weights.sum(1, keepdims=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if (time.time() - t) > time_limit:
            print(f'WARNING: NMS time limit {time_limit}s exceeded')
            break

    return output


def draw_bbox(det, img):
    font = ImageFont.truetype(font='font/FiraMono-Medium.otf',
                              size=np.floor(3e-2 * img.size[1] + 0.5).astype('int32'))
    thickness = (img.size[0] + img.size[1]) // 300
    print(type(img))
    for *xyxy, conf, cls in reversed(det):
        pre_class, score, box = class_index[int(cls)], conf, get_xywh(xyxy, np.array(img)[:, :, ::-1])
        label = '{} {:.2f}'.format(pre_class, score)
        draw = ImageDraw.Draw(img)
        label_size = draw.textsize(label, font)
        x, y, w, h = box
        print(label, (x, y), (w, h))

        if y - label_size[1] >= 0:
            text_origin = np.array([x, y - label_size[1]])
        else:
            text_origin = np.array([x, y + 1])

        for i in range(thickness):
            draw.rectangle([x + i, y + i, (x + w) - i, (y + h) - i], outline=(255, 0, 0))
        draw.rectangle([tuple(text_origin), tuple(text_origin + label_size)], fill=(0, 255, 0))
        draw.text(text_origin, label, fill=(0, 0, 0), font=font)
        del draw


def main():
    cap = cv2.VideoCapture(0)
    while True:
        ret, img = cap.read()
        image_data = preprocess(img)
        session = onnxruntime.InferenceSession(models_path)
        for session_input in session.get_inputs():
            input_name = session_input.name
        pred = np.array(session.run([session.get_outputs()[0].name], {input_name: image_data}))
        pred = non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45, max_det=1000)

        # Process predictions
        for i, det in enumerate(pred):  # detections per image
            im0 = np.copy(img)[:, :, ::-1]  # convert to cv2 format
            if len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(image_data.shape[2:], det[:, :4], im0.shape).round()
                draw_bbox(det, img)

        cv2.imshow('inference', img)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()


if __name__ == '__main__':
    main()
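The nms() called above is among the functions removed for the character limit; the inline comment marks it as the problem spot, since it needs to return the indices of the kept boxes. A pure-NumPy greedy NMS returning indices might look roughly like the sketch below (the usual textbook formulation over xyxy boxes, not necessarily the removed implementation):
Python
import numpy as np

def nms(boxes, scores, iou_thres):
    """Greedy NMS over xyxy boxes; returns indices of kept boxes, best score first."""
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]  # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # IoU of the current best box against all remaining boxes
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(xx2 - xx1, 0) * np.maximum(yy2 - yy1, 0)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][iou <= iou_thres]  # drop boxes that overlap too much
    return np.array(keep)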
What I have tried
Since the working version reads the image from a file path, I tried passing the frame obtained from VideoCapture as the input image instead, but that did not work.
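Given that letterbox_image() expects a PIL image, the BGR ndarray from cap.read() would presumably need an explicit conversion before preprocess() is called; a minimal sketch of such a conversion (untested against the full script):
Python
ret, frame = cap.read()
if ret:
    # cap.read() returns a BGR numpy array; letterbox_image() expects a PIL image
    pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    image_data = preprocess(pil_img)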