YOLOv5で指定領域を切り取りたい

前提

現在，YOLOv5で自作のモデルを使って領域の検出をしたのですが，detect.pyを実行して，推論結果を保存するところまではできています．

実現したいこと

YOLOv5において推定された領域の中心座標から512*256の大きさに切り取るっためのコードに書き換えたいです

発生している問題

中心座標の抽出はxyxyの変数の中に格納されているのはわかるのですが，そこから先の切り取り方がわからず低迷中です．

ソースコード

detect.py
1
2from utils.torch_utils import select_device, time_sync
3from utils.plots import Annotator, colors, save_one_box
4from utils.general import (LOGGER, check_file, check_img_size, check_imshow, check_requirements, colorstr,
5                           increment_path, non_max_suppression, print_args, scale_coords, strip_optimizer, xyxy2xywh)
6from utils.datasets import IMG_FORMATS, VID_FORMATS, LoadImages, LoadStreams
7from models.common import DetectMultiBackend
8import argparse
9import os
10import sys
11from pathlib import Path
12import cv2
13import torch
14import torch.backends.cudnn as cudnn
15from PIL import Image
16
17FILE = Path(__file__).resolve()
18ROOT = FILE.parents[0]  # YOLOv5 root directory
19if str(ROOT) not in sys.path:
20    sys.path.append(str(ROOT))  # add ROOT to PATH
21ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative
22
23
24@torch.no_grad()
25def run(weights=ROOT / 'yolov5s.pt',  # model.pt path(s)
26        source=ROOT / 'data/images',  # file/dir/URL/glob, 0 for webcam
27        data=ROOT / 'data/coco128.yaml',  # dataset.yaml path
28        imgsz=(640, 640),  # inference size (height, width)
29        conf_thres=0.25,  # confidence threshold
30        iou_thres=0.45,  # NMS IOU threshold
31        max_det=1000,  # maximum detections per image
32        device='',  # cuda device, i.e. 0 or 0,1,2,3 or cpu
33        view_img=False,  # show results
34        save_txt=False,  # save results to *.txt
35        save_conf=False,  # save confidences in --save-txt labels
36        save_crop=False,  # save cropped prediction boxes
37        ):
38    source = str(source)
39    save_img = not nosave and not source.endswith('.txt')  # save inference images
40    is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)
41    is_url = source.lower().startswith(('rtsp://', 'rtmp://', 'http://', 'https://'))
42    webcam = source.isnumeric() or source.endswith('.txt') or (is_url and not is_file)
43    if is_url and is_file:
44        source = check_file(source)  # download
45
46    # Directories
47    save_dir = increment_path(Path(project) / name, exist_ok=exist_ok)  # increment run
48    (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir
49
50    # Load model
51    device = select_device(device)
52    model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data)
53    stride, names, pt, jit, onnx, engine = model.stride, model.names, model.pt, model.jit, model.onnx, model.engine
54    imgsz = check_img_size(imgsz, s=stride)  # check image size
55
56    # Half
57    half &= (pt or jit or engine) and device.type != 'cpu'  # half precision only supported by PyTorch on CUDA
58    if pt or jit:
59        model.model.half() if half else model.model.float()
60
61    # Dataloader
62    if webcam:
63        view_img = check_imshow()
64        cudnn.benchmark = True  # set True to speed up constant image size inference
65        dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt)
66        bs = len(dataset)  # batch_size
67    else:
68        dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt)
69        bs = 1  # batch_size
70    vid_path, vid_writer = [None] * bs, [None] * bs
71
72    # Run inference
73    model.warmup(imgsz=(1, 3, *imgsz), half=half)  # warmup
74    dt, seen = [0.0, 0.0, 0.0], 0
75    for path, im, im0s, vid_cap, s in dataset:
76        t1 = time_sync()
77        im = torch.from_numpy(im).to(device)
78        im = im.half() if half else im.float()  # uint8 to fp16/32
79        im /= 255  # 0 - 255 to 0.0 - 1.0
80        if len(im.shape) == 3:
81            im = im[None]  # expand for batch dim
82        t2 = time_sync()
83        dt[0] += t2 - t1
84
85        # Inference
86        visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False
87        pred = model(im, augment=augment, visualize=visualize)
88        t3 = time_sync()
89        dt[1] += t3 - t2
90
91        # NMS
92        pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)
93        dt[2] += time_sync() - t3
94
95        # Second-stage classifier (optional)
96        # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s)
97
98        # Process predictions
99        for i, det in enumerate(pred):  # per image
100            seen += 1
101            if webcam:  # batch_size >= 1
102                p, im0, frame = path[i], im0s[i].copy(), dataset.count
103                s += f'{i}: '
104            else:
105                p, im0, frame = path, im0s.copy(), getattr(dataset, 'frame', 0)
106
107            p = Path(p)  # to Path
108            save_path = str(save_dir / p.name)  # im.jpg
109            txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}')  # im.txt
110            s += '%gx%g ' % im.shape[2:]  # print string
111            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
112            imc = im0.copy() if save_crop else im0  # for save_crop
113            annotator = Annotator(im0, line_width=line_thickness, example=str(names))
114            if len(det):
115                # Rescale boxes from img_size to im0 size
116                det[:, :4] = scale_coords(im.shape[2:], det[:, :4], im0.shape).round()
117
118                # Print results
119                for c in det[:, -1].unique():
120                    n = (det[:, -1] == c).sum()  # detections per class
121                    s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string
122
123                # Write results
124                for *xyxy, conf, cls in reversed(det):
125                    if save_txt:  # Write to file
126                        xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
127                        line = (cls, *xywh, conf) if save_conf else (cls, *xywh)  # label format
128                        with open(txt_path + '.txt', 'a') as f:
129                            f.write(('%g ' * len(line)).rstrip() % line + '\n')
130
131                    if save_img or save_crop or view_img:  # Add bbox to image
132                        c = int(cls)  # integer class
133                        label = None if hide_labels else (names[c] if hide_conf else f'{names[c]} {conf:.2f}')
134                        annotator.box_label(xyxy, label, color=colors(c, True))
135                        if save_crop:
136                            # save_one_box(xyxy, imc, file=save_dir / 'crops' / names[c] / f'{p.stem}.jpg', BGR=True)
137
138ここだけ自分で書き換えてるところです----
139                            print(xyxy)
140                            print(
141                                [((xyxy[1] + xyxy[3]) / 2) - 256, ((xyxy[2] + xyxy[4]) / 2) - 128, ((xyxy[1] + xyxy[3]) / 2) + 256, ((xyxy[2] + xyxy[4]) / 2) - 128])
142
143                            pix2pix_crop = imc.crop[((xyxy[1] + xyxy[3]) / 2) - 256: ((xyxy[2] + xyxy[4]) / 2) -
144                                                    128, ((xyxy[1] + xyxy[3]) / 2) + 256: ((xyxy[2] + xyxy[4]) / 2) - 128]
145                            cv2.imwrite("runs/detect/runs/detect/crops/sample.png", pix2pix_crop)
146-----------------
147            # Print time (inference-only)
148            LOGGER.info(f'{s}Done. ({t3 - t2:.3f}s)')
149
150            # Stream results
151            im0 = annotator.result()
152            if view_img:
153                cv2.imshow(str(p), im0)
154                cv2.waitKey(1)  # 1 millisecond
155
156            # Save results (image with detections)
157            if save_img:
158                if dataset.mode == 'image':
159                    cv2.imwrite(save_path, im0)```ここに言語を入力
160
161    # Print results
162    t = tuple(x / seen * 1E3 for x in dt)  # speeds per image
163    LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {(1, 3, *imgsz)}' % t)
164    if save_txt or save_img:
165        s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
166        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}{s}")
167    if update:
168        strip_optimizer(weights)  # update model (to fix SourceChangeWarning)
169
170
171def parse_opt():
172    parser = argparse.ArgumentParser()
173    parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'yolov5s.pt', help='model path(s)')
174    parser.add_argument('--source', type=str, default=ROOT / 'data/images',
175                        help='file/dir/URL/glob, 0 for webcam')
176    parser.add_argument('--data', type=str, default=ROOT /
177                        'data/coco128.yaml', help='(optional) dataset.yaml path')
178    parser.add_argument('--imgsz', '--img', '--img-size', nargs='+',
179                        type=int, default=[640], help='inference size h,w')
180    parser.add_argument('--conf-thres', type=float, default=0.25, help='confidence threshold')
181    parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold')
182    parser.add_argument('--max-det', type=int, default=1000, help='maximum detections per image')
183    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
184    parser.add_argument('--view-img', action='store_true', help='show results')
185    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
186    parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
187    parser.add_argument('--save-crop', action='store_true', help='save cropped prediction boxes')
188
189    opt = parser.parse_args()
190    opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1  # expand
191    print_args(FILE.stem, opt)
192    return opt
193
194
195def main(opt):
196    check_requirements(exclude=('tensorboard', 'thop'))
197    run(**vars(opt))
198
199
200if __name__ == "__main__":
201    opt = parse_opt()
202    main(opt)
203

試したこと

いろいろなサイトを見て引用している型や書き方が違うのはわかるのですが，いまいち理解できておらず中途半端なソースコードになってしまっています．

補足

コードの途中に自分の書き替えている部分があります．

jbpb0

2022/10/16 02:38 編集

質問のコードを実行したら、どうなるのでしょうか？ (要望してるようにならないので質問してるのだろうとは思いますが)

asuuu

2022/10/16 02:42

実行すると作成したモデルで推定領域は切り取りはできますエラーというより書き方がわからないという感じなので，実現したいことの内容の書き方を知りたいです

meg_

2022/10/16 02:52 編集

> 低迷中です．現在のコードの実行結果（保存した画像）はどうなっているのでしょうか？　⇒　質問が前後してしまいました。すみません。 > 実行すると作成したモデルで推定領域は切り取りはできますやりたいことが実現できているなら問題ないと思います。現在の質問の書き方ですと実現できていないように質問を見た人は思うので、コードレビューであればそう書いた方が良いかと思います。

jbpb0

2022/10/17 02:00

下記が気になりました切り出し領域の中心が画像の上下左右端に近い場合は、 > 領域の中心座標から512*256の大きさに切り取るが元画像の外にはみ出るが、その場合は大丈夫なのか？「(xyxy[1] + xyxy[3])」や「(xyxy[2] + xyxy[4])」が奇数の場合は、「((xyxy[1] + xyxy[3]) / 2)」や「((xyxy[2] + xyxy[4]) / 2)」が整数にはならないけど、その場合は大丈夫なのか？

jbpb0

2022/10/17 02:02

あと、足したり引いたりする「256」と「128」をコードの「imc.crop[...」のところに書くより、コードの先頭近くとかの分かりやすいところに > 領域の中心座標から512*256の大きさに切り取るの「512」と「256」を書いて、それから「imc.crop[...」のところで使う数値を計算するようにした方が、後で「512」と「256」を変えたくなった場合に修正が容易になると思います

行動規範の内容に同意します

回答1件

実行すると作成したモデルで推定領域は切り取りはできます
エラーというより書き方がわからないという感じなので，実現したいことの内容の書き方を知りたいです

下記が気になりました
・切り出し領域の中心が画像の上下左右端に近い場合は「領域の中心座標から512*256の大きさに切り取る」が元画像の外にはみ出るが、その場合は大丈夫なのか？
・「(xyxy[1] + xyxy[3])」や「(xyxy[2] + xyxy[4])」が奇数の場合は、「((xyxy[1] + xyxy[3]) / 2)」や「((xyxy[2] + xyxy[4]) / 2)」が整数にはならないけど、その場合は大丈夫なのか？

あと、足したり引いたりする「256」と「128」をコードの「imc.crop[...」のところに書くより、コードの先頭近くとかの分かりやすいところに「領域の中心座標から512*256の大きさに切り取る」の「512」と「256」を書いて、それから「imc.crop[...」のところで使う数値を計算するようにした方が、後で「512」と「256」を変えたくなった場合に修正が容易になると思います

投稿2022/11/03 05:48

jbpb0

総合スコア7658