dlibのdetectorでどのような処理が行われているのか知りたいです。

前提・実現したいこと

単眼カメラで取得した映像中の顔を認識して顔の周りに枠をつけるプログラムを作成しています。

発生している問題・エラーメッセージ

一人だけを写しているときは正しく顔枠が表示されますが、複数人を認識させようとすると、一瞬だけ複数の顔枠が表示されたあと、一人にしか顔枠が表示されない状態になります。

エラーメッセージ

該当のソースコード

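// Detect the faces seen by the camera with dlib.
// While a face is detected in consecutive frames, narrow the search area to speed detection up.
// While no face is detected, run detection only once every few frames to speed things up (a face is not picked up immediately anyway).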

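// Marking multiple faces (observed problem: several faces are marked for an instant, then only the first detected face stays marked and the printed coordinates settle around (53, 45)).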


#include <opencv2/opencv.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <dlib/opencv.h>
#include <dlib/image_io.h>
#include <dlib/image_transforms.h>
#include <dlib/image_processing/frontal_face_detector.h>
#include <dlib/image_processing/render_face_detections.h>
#include <dlib/image_processing.h>
#include <dlib/gui_widgets.h>
#include <omp.h>

using namespace dlib;
using namespace std;
using namespace cv;

#define DETECTION_PROCESS (45) // Number of frames between face-detection attempts while no face is detected (original author's note: 90 frames = 1 second)


// Converts a dlib rectangle into the equivalent OpenCV rectangle.
// The far corner is shifted by one pixel because dlib's right()/bottom() name the
// last pixel inside the box, while cv::Rect built from two points derives its
// width/height as the plain difference between them.
static cv::Rect dlibRectangleToOpenCV(dlib::rectangle r)
{
	const cv::Point2i top_left(r.left(), r.top());
	const cv::Point2i past_bottom_right(r.right() + 1, r.bottom() + 1);
	return cv::Rect(top_left, past_bottom_right);
}


int main()
{
    try
    {
			cv::VideoCapture cap(0);
      if (!cap.isOpened())
      {
          cerr << "Unable to connect to camera" << endl;
          return 1;
      }

      cv::Mat frame, detection_frame;
      cv::Rect roi;
			//std::vector<dlib::rectangle> faces;

      int detection_flag = 0; //直前に顔を検出したか(0:してない　1:した)
			int basic_flag = 0;//連続で顔を検知しているかフラグ(0:いいえ(初めての検知)　1:はい(2連続以上の検知))
			int not_found_flag = 1;//連続顔を見つけられなかったフラグ(0:いいえ(見つかった)　1:はい(見つからなかった))

			std::vector<int> x = {0};//顔座標の左上のx座標
      std::vector<int> y = {0};//顔座標の左上のy座標
      std::vector<int> x_end = {0};//顔座標の右下のx座標
      std::vector<int> y_end = {0};//顔座標の右下のy座標

      std::vector<int> x_basic = {0};//基準点のX座標
      std::vector<int> y_basic = {0};//基準点のY座標

			int frame_count = 0; //顔検出していないフレームのカウント


      frontal_face_detector detector = get_frontal_face_detector(); //顔検出機を呼び出す
      //shape_predictor pose_model;
      //deserialize("shape_predictor_68_face_landmarks.dat") >> pose_model;

      while(cap.read(frame))
      {
				//直前のフレームで顔が検出されていない場合
      	if (detection_flag == 0) {

      			//検出範囲はカメラ映像全体とする
        		detection_frame = frame;

        		//基準点をリセット
        		basic_flag = 0;
        		x_basic = {0};
        		y_basic = {0};

        }
        else {//直前のフレームで顔が検出された場合

					#pragma omp parallel for
					for (int i = 0; i < x.size(); i++) {

						//検出範囲がキャプチャフレーム内に収まるように変換する
        		if (x[i] - 50 < 1) {
        			x[i] = 51;
        		}
        		if (y[i] - 50 < 1) {
        			y[i] = 51;
        		}
        		if (x_end[i] + 50 > frame.cols - 1) {
        			x_end[i] = frame.cols - 51;
        		}
        		if (y_end[i] + 50 > frame.rows - 1) {
        			y_end[i] = frame.rows - 51;
        		}

        		//検出範囲として、直前のフレームの顔検出の範囲より一回り(上下左右50pixel)大きい範囲とする
        		cv::Rect roi(cv::Point(x[i] - 50, y[i] - 50), cv::Point(x_end[i] + 50, y_end[i] + 50));
        		detection_frame = frame(roi);

        		//検出範囲をピンク枠で囲う
        		//cv::rectangle(frame, cv::Point(x[i] - 50, y[i] - 50), cv::Point(x_end[i] + 50, y_end[i] + 50), Scalar(200, 0, 255), 3);
					}

        	//連続検索フラグを1(2連続以上の)
      		basic_flag = 1;
        }

        detection_flag = 0;


				if (not_found_flag == 0 || (not_found_flag == 1 && frame_count == DETECTION_PROCESS) ) {

					cv_image<bgr_pixel> cimg(detection_frame);
					std::vector<dlib::rectangle> faces = detector(cimg); //顔検出

					for (int i = 0; i < faces.size(); i++) {
						cout << i << " " << faces[i].left() << ", " << faces[i].top() << endl;
					}


	        //顔を検出した場合
	        if (faces.size() > 0) {
	        	//顔の検出フラグを1(発見)にする
	        	detection_flag = 1;

	        	//連続顔を見つけられなかったフラグを0
	        	not_found_flag = 0;

	        	//顔座標の左上の座標を求める
	        	if (basic_flag == 0) {//初検知の場合

							#pragma omp parallel for
							for (int i = 0; i < faces.size(); i++) {
								//初検知の場合は検出された値をそのまま使う
		        		x[i] = faces[i].left();
		        		y[i] = faces[i].top();
							}

	        	}
	        	else if (basic_flag == 1) {//連続検知の場合

							#pragma omp parallel for
							for (int i = 0; i < faces.size(); i++) {

								//連続検知の場合は、検出座標と直前の基準点を使って顔座標を検出する
		        		//(x_basic - 50)：カメラキャプチャ全体の座標から見た検出範囲の左上の座標(ピンク枠の左上)
		        		//rect.x：切り出したフレーム(ピンク枠内)から見た顔の左上の座標(赤枠の左上)

		        		x[i] = (x_basic[i] - 50) + faces[i].left();
		        		y[i] = (y_basic[i] - 50) + faces[i].top();

							}
	        	}

						#pragma omp parallel for
						for (int i = 0; i < faces.size(); i++) {

							//顔座標の右下の座標を求める
							x_end[i] = x[i] + (faces[i].right() - faces[i].left()) + 1;
							y_end[i] = y[i] + (faces[i].bottom() - faces[i].top()) + 1;

							//基準点を今算出した顔座標に更新する
							x_basic[i] = x[i];
							y_basic[i] = y[i];

							cv::rectangle(frame, cv::Point(x[i], y[i]), cv::Point(x_end[i], y_end[i]), cv::Scalar(0, 0, 255), 3);
						}

	        }
					else {
						not_found_flag = 1;
						frame_count = 0;
					}

				} //if (not_found_flag == 0 || (not_found_flag == 1 && frame_count == DETECTION_PROCESS) ) END

				else {
					frame_count++;
				}


        cv::imshow("camera", frame);

        const int key = cv::waitKey(1);

        // //zでズームイン
        // if (key == 'z') {
        //   zoom += zoomRate;
        // }
        // //xでズームアウト
        // else if (key == 'x') {
        //   zoom -= zoomRate;
        //   if(zoom < 1) zoom = 1;
        // }
        //sボタンでスクリーンショット
        if (key == 's') {
          std::string name;

          std::cout << "file name?" << endl;
          std::getline(std::cin, name);

          cv::imwrite(name, frame);
        }
        //qボタンが押されたとき
        else if(key == 'q') {
          break;
        }

      } //while(cap.read(frame)) END

    }
    catch(serialization_error& e)
    {
        cout << "You need dlib's default face landmarking model file to run this example." << endl;
        cout << "You can get it from the following URL: " << endl;
        cout << "   http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2" << endl;
        cout << endl << e.what() << endl;
    }
    catch(exception& e)
    {
        cout << e.what() << endl;
    }
}