SSD(Keras/TensorFlow)で映像検出について

https://qiita.com/yampy/items/37c607fdf77a919cda5d

上記の記事を参考にして、SSDによる動画の物体検出を試してみたのですが、実行すると下記のエラーが出てしまいます。
どうすれば解決できるのかわかりません。
動画はmp4です。
よろしくお願いします。

ValueError: "concat" mode can only merge layers with matching output shapes except for the concat axis. Layer shapes: [(None, 38, 38, 512), (None, 19, 19, 1024), (None, 10, 10, 512), (None, 5, 5, 256), (None, 3, 3, 256), (None, 1, 1, 256)]

python
1""" A class for testing a SSD model on a video file or webcam """
2
3import cv2
4import keras
5from keras.applications.imagenet_utils import preprocess_input
6from keras.backend.tensorflow_backend import set_session
7from keras.models import Model
8from keras.preprocessing import image
9import pickle
10import numpy as np
11from random import shuffle
12from scipy.misc import imread, imresize
13from timeit import default_timer as timer
14
15import sys
16sys.path.append("..")
17from ssd_utils import BBoxUtility
18
19
class VideoTest(object):
    """ Class for testing a trained SSD model on a video file and show the
        result in a window. Class is designed so that one VideoTest object
        can be created for a model, and the same object can then be used on
        multiple videos and webcams.

        Arguments:
            class_names: A list of strings, each containing the name of a class.
                         The first name should be that of the background class
                         which is not used.

            model:       An SSD model. It should already be trained for
                         images similar to the video to test on.

            input_shape: The shape that the model expects for its input,
                         as a tuple, for example (300, 300, 3)

        Attributes:
            bbox_util:    A BBoxUtility (from ssd_utils) created by the
                          constructor with len(class_names) classes. It is
                          not a constructor argument.

            class_colors: One BGR colour tuple per class, used for drawing.

    """

    def __init__(self, class_names, model, input_shape):
        self.class_names = class_names
        self.num_classes = len(class_names)
        self.model = model
        self.input_shape = input_shape
        self.bbox_util = BBoxUtility(self.num_classes)

        # Create unique and somewhat visually distinguishable bright
        # colors for the different classes by stepping through the hue
        # channel at fixed saturation and value.
        # NOTE(review): OpenCV documents the 8-bit hue range as 0-179, while
        # this spreads hues over 0-254 -- confirm the colours are distinct
        # enough for the upper classes.
        self.class_colors = []
        for i in range(0, self.num_classes):
            hue = 255*i/self.num_classes
            col = np.zeros((1,1,3)).astype("uint8")
            col[0][0][0] = hue
            col[0][0][1] = 128 # Saturation
            col[0][0][2] = 255 # Value
            cvcol = cv2.cvtColor(col, cv2.COLOR_HSV2BGR)
            col = (int(cvcol[0][0][0]), int(cvcol[0][0][1]), int(cvcol[0][0][2]))
            self.class_colors.append(col)

    @staticmethod
    def _filter_detections(results, conf_thresh):
        """ Returns the first frame's detections whose confidence is at least
            conf_thresh, as an array with one row per detection in the column
            order (label, confidence, xmin, ymin, xmax, ymax). An array of
            shape (0, 6) is returned when there is nothing to report.
        """
        if len(results) == 0 or len(results[0]) == 0:
            return np.empty((0, 6))
        detections = np.asarray(results[0])
        return detections[detections[:, 1] >= conf_thresh]

    def _draw_detections(self, to_draw, detections):
        """ Draws a box and a caption for every row of detections onto
            to_draw (modified in place) and returns the list of captions.
        """
        height, width = to_draw.shape[0], to_draw.shape[1]
        captions = []
        for label, conf, x0, y0, x1, y1 in detections:
            # Box coordinates are scaled by the drawn image's size
            # (presumably they are normalized to [0, 1] -- TODO confirm).
            xmin = int(round(x0 * width))
            ymin = int(round(y0 * height))
            xmax = int(round(x1 * width))
            ymax = int(round(y1 * height))

            class_num = int(label)
            color = self.class_colors[class_num]
            cv2.rectangle(to_draw, (xmin, ymin), (xmax, ymax), color, 2)
            text = self.class_names[class_num] + " " + ('%.2f' % conf)

            # Filled rectangle behind the caption so it stays readable.
            text_top = (xmin, ymin-10)
            text_bot = (xmin + 80, ymin + 5)
            text_pos = (xmin + 5, ymin)
            cv2.rectangle(to_draw, text_top, text_bot, color, -1)
            cv2.putText(to_draw, text, text_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0,0,0), 1)
            captions.append(text)
        return captions

    def run(self, video_path = 0, start_frame = 0, conf_thresh = 0.6):
        """ Runs the test on a video (or webcam)

        Every processed frame is shown in a window named "SSD result" and
        also written to the working directory as frame_NNNN.png. The captions
        of the boxes drawn on a frame are printed; frames without any box
        above the threshold print nothing.

        # Arguments
        video_path: A file path to a video to be tested on. Can also be a number,
                    in which case the webcam with the same number (i.e. 0) is
                    used instead

        start_frame: The number of the first frame of the video to be processed
                     by the network.

        conf_thresh: Threshold of confidence. Any boxes with lower confidence
                     are not visualized.

        # Raises
        IOError: If the video file or webcam could not be opened.

        """

        vid = cv2.VideoCapture(video_path)
        if not vid.isOpened():
            raise IOError(("Couldn't open video file or webcam. If you're "
            "trying to open a webcam, make sure you video_path is an integer!"))

        # Release the capture device however the loop ends, so the same
        # object can be reused on further videos or webcams.
        try:
            # Compute aspect ratio of video. A backend that does not support
            # these properties reports 0; in that case the ratio is taken
            # from the first decoded frame instead of dividing by zero.
            vidw = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
            vidh = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
            vidar = vidw/vidh if vidw > 0 and vidh > 0 else None

            # Skip frames until reaching start_frame. start_frame is a frame
            # index, so the frame-position property is used (not the
            # millisecond position).
            if start_frame > 0:
                vid.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

            accum_time = 0
            curr_fps = 0
            fps = "FPS: ??"
            prev_time = timer()
            num_frame = 0
            while True:
                retval, orig_image = vid.read()
                if not retval:
                    print("Done!")
                    return

                if vidar is None:
                    vidar = orig_image.shape[1] / float(orig_image.shape[0])

                im_size = (self.input_shape[0], self.input_shape[1])
                resized = cv2.resize(orig_image, im_size)
                rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)

                # Reshape to original aspect ratio for later visualization
                # The resized version is used, to visualize what kind of resolution
                # the network has to work with.
                to_draw = cv2.resize(resized, (int(self.input_shape[0]*vidar), self.input_shape[1]))

                # Use model to predict on a batch holding this single frame
                inputs = [image.img_to_array(rgb)]
                x = preprocess_input(np.array(inputs))
                y = self.model.predict(x)

                # This line creates a new TensorFlow device every time. Is there a
                # way to avoid that?
                results = self.bbox_util.detection_out(y)

                # Interpret output, only one frame is used
                detections = self._filter_detections(results, conf_thresh)
                captions = self._draw_detections(to_draw, detections)

                # Calculate FPS
                # This computes FPS for everything, not just the model's execution
                # which may or may not be what you want
                curr_time = timer()
                exec_time = curr_time - prev_time
                prev_time = curr_time
                accum_time = accum_time + exec_time
                curr_fps = curr_fps + 1
                if accum_time > 1:
                    accum_time = accum_time - 1
                    fps = "FPS: " + str(curr_fps)
                    curr_fps = 0

                # Draw FPS in top left corner
                cv2.rectangle(to_draw, (0,0), (50, 17), (255,255,255), -1)
                cv2.putText(to_draw, fps, (3,10), cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0,0,0), 1)

                cv2.imshow("SSD result", to_draw)
                cv2.waitKey(10)

                # Only report captions that belong to this frame; nothing is
                # printed when no box passed the threshold.
                if captions:
                    print(", ".join(captions))
                cv2.imwrite("frame_{0:04d}.png".format(num_frame), to_draw)

                num_frame += 1
        finally:
            vid.release()
187