3回目の認識でカメラがフリーズしてしまう

前提・実現したいこと

ubuntu20.04
python3.8
RTX 3060
基本的には、executor.submit(m.process)が起動され、必要に応じてexecutor.submit(i.text_reading)が起動しています。
（main関数より）
ここで質問ですが、１、2回目ともに必要に応じてexecutor.submit(i.text_reading)が起動して処理をしてくれますが、3回目になるとexecutor.submit(i.text_reading)が起動するとカメラがフリーズする形となってしまいます。

「 nvidia-smi --query-gpu=timestamp,name,utilization.gpu,memory.used --format=csv -l 1」
こちらのコマンドでgpu使用率を確認しても40％ほどしか使っていません。

なぜこのようなことが起きるのかご教示いただけないでしょうか。
よろしくお願い致します。

該当のソースコード

confThreshold = 0.5
nmsThreshold = 0.4
#coco.namesのpath
classesFile = "/home/limlab/program/data/coco.names"

#検出した際に保存され人数検出に使われる画像のpath
count_image_path ='/image/image1.png'
detect_image_path ='/image/image2.png'
encoder_path ='/data/encoder-2-1000.ckpt'
decoder_path ='/decoder-2-1000.ckpt'
vocab_path ='/data/vocab.pkl'
count_image_path ='/image/image1.png'


class Main:
    
    def __init__(self):    
        self.j = 0
        self.y = 0
    def process(self):
        args = self.arg_parse()
        confidence = float(args.confidence)
        nms_thesh = float(args.nms_thresh)
        CUDA = torch.cuda.is_available()
        num_classes = 80
        model = Darknet(args.cfgfile)
        model.load_weights(args.weightsfile)
        print("Network successfully loaded")
        if CUDA:
            model.cuda()
        config = rs.config()
        config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)
        config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
        pipeline = rs.pipeline()
        profile = pipeline.start(config)
        align = rs.align(rs.stream.color)
       
        while True:
            frames = pipeline.wait_for_frames()
            color_frame = frames.get_color_frame()
            #print(type(color_frame))
            depth_frame = frames.get_depth_frame()
            if not depth_frame or not color_frame:
                continue
            frames = align.process(frames)
            profile = frames.get_profile()  
            color_image = np.asanyarray(color_frame.get_data())
            #color_image1 = np.asanyarray(color_frame.get_data())
            #print(type(color_image))
            depth_color_frame = rs.colorizer().colorize(depth_frame)
            depth_image = np.asanyarray(depth_color_frame.get_data())
            

            if frames:
                inp_dim = int(model.net_info["height"])
                model.eval()
                img, orig_im, dim = self.prep_image(color_image, inp_dim)
                im_dim = torch.FloatTensor(dim).repeat(1,2)        
                            
                if CUDA:
                    im_dim = im_dim.cuda()            
                    img = img.cuda()                                       
                with torch.no_grad():               
                    output = model(Variable(img), CUDA)   
                    
                output = write_results(output, confidence, num_classes, nms = True, nms_conf = nms_thesh)
                im_dim = im_dim.repeat(output.size(0), 1)
                scaling_factor = torch.min(inp_dim/im_dim,1)[0].view(-1,1)        
                output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim[:,0].view(-1,1))/2
                output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim[:,1].view(-1,1))/2   
                output[:,1:5] /= scaling_factor
                for i in range(output.shape[0]):
                    output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim[i,0])
                    output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim[i,1])
        
                list(map(lambda x: self.write(x, orig_im,depth_frame,color_image), output))    
        
                cv2.imshow('Video', color_image)
                cv2.moveWindow('Video', 2825, 380)
                
            if cv2.waitKey(1) & 0xFF == ord('q'):
                sys.exit()
  
    def prep_image(self,img, inp_dim): 
        orig_im = img
        dim = orig_im.shape[1], orig_im.shape[0]
        img = (letterbox_image(orig_im, (inp_dim, inp_dim)))
        img_ = img[:,:,::-1].transpose((2,0,1)).copy()
        img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0)
        return img_, orig_im, dim

    def write(self,x, img,depth_frame,color_image):
        classes = load_classes(classesFile)
        color = (255,0,0)
        c1 = tuple(x[1:3].int())
        c2 = tuple(x[3:5].int())
        cls = int(x[-1])
        label = "{0}".format(classes[cls])
        distance = depth_frame.get_distance((c1[0]+c2[0])/2,(c1[1]+c2[1])/2)
        if distance < 20:
            if label == 'person':
                self.j += 1
                if  self.j == 10: 
                    cv2.imwrite(count_image_path,color_image)
                    if self.j  >= 10:
                        self.j = 0
                        if self.j >=0:
                            pass
        return img
            
    def arg_parse(self):
        #文字数制限のため省略

class Image_captioning:   
# Device configuration
    def load_image(self,color_image_path, transform=None):    
        image = Image.open(color_image_path)
        image = image.resize([224, 224], Image.LANCZOS)
        if transform is not None:
            image = transform(image).unsqueeze(0)
        
        return image

    def text_reading(self):
        print("text")
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Model parameters (should be same as paramters in train.py)
        embed_size=256
        hidden_size=512
        num_layers=1
        import glob
        print("glob前")
        files = sorted(glob.glob(count_image_path))
        print(files)
        for i, image_path in enumerate (files):
            
            # Image preprocessing
            transform = transforms.Compose([
                transforms.ToTensor(), 
                transforms.Normalize((0.485, 0.456, 0.406), 
                                    (0.229, 0.224, 0.225))])
            # Load vocabulary wrapper
            with open(vocab_path, 'rb') as f:
                vocab = pickle.load(f)
           
            # Build models
            encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
            decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
            encoder = encoder.to(device)
            decoder = decoder.to(device)
        
            # Load the trained model parameters
            encoder.load_state_dict(torch.load(encoder_path))
            decoder.load_state_dict(torch.load(decoder_path))
       
            # Prepare an image
            image = self.load_image(image_path, transform)
            image_tensor = image.to(device)
          
            # Generate an caption from the image
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
         
            # Convert word_ids to words
            sampled_caption = []
            print("f")
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)
            
            image = Image.open(image_path)
            description = sentence.replace('<start>',' ',1).replace('<end>',' ',1)
            print (description)
            os.remove(count_image_path)                  
def main():
    if os.path.exists(count_image_path):
        print("count_image_pathが存在するので削除します")
        os.remove(count_image_path) 
        
    m = Main()
    i = Image_captioning()
    executor = concurrent.futures.ProcessPoolExecutor()
    executor.submit(m.process)
    r_running = False

    while True:
        if not r_running and os.path.exists(count_image_path)== True:
            executor.submit(i.text_reading)
            r_running = True
            print(r_running)
            
        if not os.path.exists(count_image_path):
            r_running = False
        
if __name__ == '__main__':
    main()