前提

プログラムの高速化のために，SoA型配列を作成したいと考えています．そこで，イテレータパターンをまねて，以下のようなクラスを作成しました．

c++
/**
 * Fixed-size Rows x Cols tensor whose elements are reached through pointers.
 *
 * A tensor is in one of two modes:
 *  - view:  built from an array of element pointers (this is what
 *           List::operator[] does); reads and writes go straight to the
 *           pointed-to storage, which must outlive the tensor.
 *  - value: default-constructed; the elements live inside the object itself.
 *
 * Elements are addressed column-major: (row, col) is slot row + Rows * col.
 * Type must be default-constructible and copy-assignable.
 */
template <class Type, int Rows, int Cols>
class TensorBase
{
public:
    /**
     * Builds a view over externally owned elements.
     *
     * @param rhs array of Rows * Cols element pointers. Only the pointers are
     *            copied; the pointed-to elements are not owned by the tensor.
     */
    TensorBase(Type *rhs[Rows * Cols]) : m_owns(false)
    {
        // Copy pointer by pointer: m_ref holds Type * values, so a byte-wise
        // copy would have to be sized with sizeof(Type *), not sizeof(Type).
        for (int k = 0; k < Rows * Cols; k++)
        {
            m_ref[k] = rhs[k];
        }
    }

    /**
     * Builds a value tensor with value-initialized elements (zero for
     * arithmetic types). The elements are stored in the object, so there is
     * no heap allocation and nothing to release.
     */
    TensorBase() : m_own(), m_owns(true)
    {
        for (int k = 0; k < Rows * Cols; k++)
        {
            m_ref[k] = &m_own[k];
        }
    }

    /**
     * Copy constructor.
     *
     * A view is copied as a view onto the same elements. A value tensor is
     * copied element by element into the new object's own storage, so the
     * copy never points into the source object.
     */
    TensorBase(const TensorBase &rhs) : m_owns(rhs.m_owns)
    {
        for (int k = 0; k < Rows * Cols; k++)
        {
            if (m_owns)
            {
                m_own[k] = rhs.m_own[k];
                m_ref[k] = &m_own[k];
            }
            else
            {
                m_ref[k] = rhs.m_ref[k];
            }
        }
    }

    /**
     * Copy assignment: writes rhs's element values through this tensor's
     * pointers. The pointers themselves are never rebound.
     *
     * @return *this, so assignments can be chained.
     */
    TensorBase &operator=(const TensorBase &rhs)
    {
        for (int k = 0; k < Rows * Cols; k++)
        {
            *m_ref[k] = *rhs.m_ref[k];
        }
        return *this;
    }

    /**
     * Assignment from a plain array (or braced list) of Rows * Cols values
     * given in column-major order.
     *
     * @return *this, so assignments can be chained.
     */
    TensorBase &operator=(const Type (&rhs)[Rows * Cols])
    {
        for (int k = 0; k < Rows * Cols; k++)
        {
            *m_ref[k] = rhs[k];
        }
        return *this;
    }

private:
    Type *m_ref[Rows * Cols]; // element pointers (into m_own or external storage)
    Type m_own[Rows * Cols];  // element storage, used only in value mode
    bool m_owns;              // true when m_ref points into m_own
};
66
67/**
68 * SoAクラス(Aggregate)
69 */
70template <class Type, int Rows, int Cols>
71class List
72{
73public:
74    using Tensor = TensorBase<Type, Rows, Cols>;
75
76    /**
77    * コンストラクタ
78    */
79    inline List(int _size) : size(_size)
80    {
81        for (int col = 0; col < Cols; col++)
82        {
83            for (int row = 0; row < Rows; row++)
84            {
85                m_data[row + Rows * col] = new double[size];
86            }
87        }
88    }
89
90    /**
91    * i番目の配列の要素にアクセス
92    */
93    inline Tensor operator[](int i)
94    {
95        Type **tmp = m_data;
96        Type *m_ref[Rows * Cols];
97        for (int col = 0; col < Cols; col++)
98        {
99            for (int row = 0; row < Rows; row++)
100            {
101                m_ref[row + Rows * col] = tmp[row + Rows * col] + i;
102            }
103        }
104        return Tensor(m_ref);
105    }
106
107private:
108    int size; 					//テンソルのサイズ
109    Type *m_data[Rows * Cols];	//テンソルのデータ配列
110};

TensorBaseクラス(iterator側)はRows行×Cols列のテンソルを格納するためのクラスになっています．このテンソルクラスのSoA型配列を保存しておくために，Listクラス(Aggregate側)を作成しました．
実際のデータはList::m_dataに格納されていて，取得したい配列のインデックスに一致する要素をTensorBaseのコンストラクタに渡し，TensorBase::m_refから参照できるようにしています．

実現したいこと

これらのクラスを使って，理想どおり動きました．しかし，以下のようにTensorBaseクラスを作成してListクラスにコピーすると，とても時間がかかってしまいます．
なぜ時間がかかってしまうのかわからず，かかる時間を短くする方法を教えていただきたいです．

c++
1int main(void)
2{
3
4    int size = 10000;
5
6    //SoA型配列
7    List<double, 3, 1> list(size);
8
9	//テンソルクラス
10	TensorBase<double, 3, 1> tensor;  
11
12	for (int i = 0; i < size; i++)
13	{
14		/**
15		 * リストに追加する
16		 */
17		tensor = {i, i * 2, i * 4};
18		list[i] = tensor;
19	}
20
21    return 0;
22}

試したこと

時間を評価するために，①TensorBaseクラス・Listクラスを使って計算 ②Rows×Cols個のバラバラの配列を使って計算　の2つの計算時間を比較しました．結果，①の場合は②の場合より2倍以上時間がかかっていました．

以下のように，TensorBaseクラスを経由せずに直接Listに代入すると，①，②の計算時間はほとんど同じになりました．

c++
1int main(void)
2{
3
4    int size = 10000;
5
6    //SoA型配列
7    List<double, 3, 1> list(size);
8
9	for (int i = 0; i < size; i++)
10	{
11		list[i] = {i, i * 2, i * 4};//直接代入
12    }
13
14    return 0;
15}

そこで，TensorBaseクラスの等号演算子のオーバーロードが遅いのだと考え，デフォルトコンストラクタでTensorBase::m_refの各要素をnewするときに連続したアドレスに配置されるようにメモリを確保したり，forループをすべて展開したりしてみましたが，計算時間はほとんど変わりませんでした．

コード全文

c++
#include <cstring>
#include <iostream>
#include <memory>
3
/**
 * Fixed-size Rows x Cols tensor whose elements are reached through pointers.
 *
 * A tensor is in one of two modes:
 *  - view:  built from an array of element pointers (this is what
 *           List::operator[] does); reads and writes go straight to the
 *           pointed-to storage, which must outlive the tensor.
 *  - value: default-constructed; the elements live inside the object itself.
 *
 * Elements are addressed column-major: (row, col) is slot row + Rows * col.
 * Type must be default-constructible and copy-assignable.
 */
template <class Type, int Rows, int Cols>
class TensorBase
{
public:
    /**
     * Builds a view over externally owned elements.
     *
     * @param rhs array of Rows * Cols element pointers. Only the pointers are
     *            copied; the pointed-to elements are not owned by the tensor.
     */
    TensorBase(Type *rhs[Rows * Cols]) : m_owns(false)
    {
        // Copy pointer by pointer: m_ref holds Type * values, so a byte-wise
        // copy would have to be sized with sizeof(Type *), not sizeof(Type).
        for (int k = 0; k < Rows * Cols; k++)
        {
            m_ref[k] = rhs[k];
        }
    }

    /**
     * Builds a value tensor with value-initialized elements (zero for
     * arithmetic types). The elements are stored in the object, so there is
     * no heap allocation and nothing to release.
     */
    TensorBase() : m_own(), m_owns(true)
    {
        for (int k = 0; k < Rows * Cols; k++)
        {
            m_ref[k] = &m_own[k];
        }
    }

    /**
     * Copy constructor.
     *
     * A view is copied as a view onto the same elements. A value tensor is
     * copied element by element into the new object's own storage, so the
     * copy never points into the source object.
     */
    TensorBase(const TensorBase &rhs) : m_owns(rhs.m_owns)
    {
        for (int k = 0; k < Rows * Cols; k++)
        {
            if (m_owns)
            {
                m_own[k] = rhs.m_own[k];
                m_ref[k] = &m_own[k];
            }
            else
            {
                m_ref[k] = rhs.m_ref[k];
            }
        }
    }

    /**
     * Copy assignment: writes rhs's element values through this tensor's
     * pointers. The pointers themselves are never rebound.
     *
     * @return *this, so assignments can be chained.
     */
    TensorBase &operator=(const TensorBase &rhs)
    {
        for (int k = 0; k < Rows * Cols; k++)
        {
            *m_ref[k] = *rhs.m_ref[k];
        }
        return *this;
    }

    /**
     * Assignment from a plain array (or braced list) of Rows * Cols values
     * given in column-major order.
     *
     * @return *this, so assignments can be chained.
     */
    TensorBase &operator=(const Type (&rhs)[Rows * Cols])
    {
        for (int k = 0; k < Rows * Cols; k++)
        {
            *m_ref[k] = rhs[k];
        }
        return *this;
    }

private:
    Type *m_ref[Rows * Cols]; // element pointers (into m_own or external storage)
    Type m_own[Rows * Cols];  // element storage, used only in value mode
    bool m_owns;              // true when m_ref points into m_own
};
63
64/**
65 * SoAクラス(Aggregate)
66 */
67template <class Type, int Rows, int Cols>
68class List
69{
70public:
71    using Tensor = TensorBase<Type, Rows, Cols>;
72
73    inline List(int _size) : size(_size)
74    {
75        for (int col = 0; col < Cols; col++)
76        {
77            for (int row = 0; row < Rows; row++)
78            {
79                m_data[row + Rows * col] = new double[size];
80            }
81        }
82    }
83
84    inline Tensor operator[](int i)
85    {
86        Type **tmp = m_data;
87        Type *m_ref[Rows * Cols];
88        for (int col = 0; col < Cols; col++)
89        {
90            for (int row = 0; row < Rows; row++)
91            {
92                m_ref[row + Rows * col] = tmp[row + Rows * col] + i;
93            }
94        }
95        return Tensor(m_ref);
96    }
97
98private:
99    int size; 					//テンソルのサイズ
100    Type *m_data[Rows * Cols];	//テンソルのデータ配列
101};
102
103int main(void)
104{
105
106    int size = 10000;
107
108    //SoA型配列
109    List<double, 3, 1> list(size);
110
111	//テンソルクラス
112	TensorBase<double, 3, 1> tensor;  
113
114	for (int i = 0; i < size; i++)
115	{
116		/**
117		 * リストに追加する
118		 */
119		tensor = {i, i * 2, i * 4};
120		list[i] = tensor;
121	}
122
123    return 0;
124}
125

行動規範の内容に同意します

回答1件

ベストアンサー

二倍程度の差であれば、operator=が二度呼ばれてることが原因だと思います。

C++
1        tensor = {i, i * 2, i * 4}; // first operator= call
2        list[i] = tensor; // second operator= call

C++
1　　　　list[i] = {i, i * 2, i * 4}; // operator= is called only once

投稿2019/11/01 07:23

yudedako67

総合スコア2047

otyaken

2019/11/01 09:47 編集

回答ありがとうございます．それぞれ上記の例を10000回繰り返した処理時間なのですが，

```c++
tensor = {i, i * 2, i * 4}; //0.11116 s
list[i] = tensor; //0.09417 s
list[i] = {i, i * 2, i * 4}; //0.0806805 s
```

となり，2倍以上遅くなっています．このため，オペレータが原因ではないと考えていました．また，`tensor = {i, i * 2, i * 4};`と`list[i] = {i, i * 2, i * 4};`は同じオペレータを呼び出していると思うのですが，なぜこのような差が出るのかがわからないです．

yudedako67

2019/11/01 12:43

その計測結果については試行回数が少ないので他の要因が大きく影響してるだけだと思います。少なくとも手元の環境だと、 tensor = {i, i * 2, i * 4}; list[i] = tensor; この二つの代入での実行時間の差はおおむね5%以内に収まってるので、代入が二回行われてることのほかは目立ったボトルネックはありません。

otyaken

2019/11/01 22:08

回答ありがとうございます．計算回数を大きくして繰り返してみると，2倍以内に収まるようになりました．また，実行ごとに結果がかなり変わっていて，プログラム以外に原因があったようです．

行動規範の内容に同意します