CUDARuntimeError: cudaErrorIllegalAddress: an illegal memory access was encounteredの解決

Cupyを用いて大量の行列を並列で操作するプログラムを作っています。
行列の数N=1000ではエラーが生じないのですが、N=10000では以下のエラーが生じます。

CUDARuntimeError: cudaErrorIllegalAddress: an illegal memory access was encountered

ただメモリが不足していないことは、ELSA System Graphで確認済みです。

N=10000の時の出力は以下の通りです。
runfile('C:/data&prog/koga/repository/HL/Hamiltonian_Learning/subpy/jacobi_givens_rotation02.py', wdir='C:/data&prog/koga/repository/HL/Hamiltonian_Learning/subpy')
calcuration time:1.317464828491211s
Traceback (most recent call last):

File "<ipython-input-1-7c551e7d9b58>", line 1, in <module>
runfile('C:/data&prog/koga/repository/HL/Hamiltonian_Learning/subpy/jacobi_givens_rotation02.py', wdir='C:/data&prog/koga/repository/HL/Hamiltonian_Learning/subpy')

File "C:\Users\a319b\Anaconda3\envs\HL1_copy\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)

File "C:\Users\a319b\Anaconda3\envs\HL1_copy\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)

File "C:/data&prog/koga/repository/HL/Hamiltonian_Learning/subpy/jacobi_givens_rotation02.py", line 121, in <module>
Z_numpy = Z.get()

File "cupy\core\core.pyx", line 1801, in cupy.core.core.ndarray.get

File "cupy\core\core.pyx", line 1825, in cupy.core.core.ndarray.get

File "cupy\cuda\memory.pyx", line 421, in cupy.cuda.memory.MemoryPointer.copy_to_host

File "cupy\cuda\runtime.pyx", line 257, in cupy.cuda.runtime.memcpy

File "cupy\cuda\runtime.pyx", line 137, in cupy.cuda.runtime.check_status

CUDARuntimeError: cudaErrorIllegalAddress: an illegal memory access was encountered

スペックは以下の通りです。

CPU:Intel(R) Core(TM) i9-7900X CPU @ 3.30GHz 3.31GHz
GPU:Nvidia GeForce GTX 1080

バージョンは以下の通りです。
Python 3.6.4 :: Anaconda, Inc.
cupy-cuda90 5.2.0
numpy 1.16.0

実行コードは以下の通りです。

python3
1
2import numpy as np
3import cupy as cp
4import time
5
6def make_index(N, n):
7    
8    index_list = []
9    
10    for i in range(N):
11        p = np.random.randint(1,n-2)
12        q = p - 1
13        
14        temp_index = [p, q]
15        
16        index_list.append(temp_index)
17        
18    return cp.array(index_list)
19
20def calc_givens_theta(X, ind):
21    """
22    #各行列のギブンス回転角を求める
23    """
24    ind = cp.asnumpy(ind)
25    N = X.shape[0]
26    theta_list = np.ones([N,1]) * np.pi/4
27    
28    for i in range(N):
29        p = ind[i][0]
30        q = ind[i][1]
31        if X[i,p,p] == X[i,q,q]:
32            theta_list[i] = np.tan(np.pi/2)
33        else:
34            theta_list[i] = 2*X[i,p,q]/(X[i,p,p] - X[i,q,q])
35            
36    theta_list = 0.5 * np.arctan(theta_list)
37        
38    return cp.array(theta_list)
39    
40
41N = 10000
42n = 18
43X = cp.ones([N, n, n]).astype(cp.float32)
44Z = cp.ones([N, n, n]).astype(cp.float32)
45
46t1 = time.time()
47
48index = make_index(N, n).astype(cp.int16) #[p, q]の組合せをランダムに発生させる
49
50theta = calc_givens_theta(X, index) #ギブンス回転角を求める
51theta = theta.astype(cp.float32)
52theta = theta * 4/3
53
54sin = cp.sin(theta)
55sin2 = cp.sin(2*theta)
56cos = cp.cos(theta)
57cos2= cp.cos(2*theta)
58
59get_index_kernel_2d = cp.ElementwiseKernel(
60        in_params='raw float32 x, int16 n,\
61        raw int16 ind, raw float32 sin, raw float32 cos,\
62        raw float32 sin2, raw float32 cos2',
63        out_params='float32 z',
64        operation=\
65        '''
66        int nx = 0; //行列番号
67        
68        int temp_i = i;
69        
70        while (temp_i > (n*n-1)) { // 行列の指標に直す 0~行列サイズ**2-1
71            temp_i = temp_i - n*n;
72            nx = nx + 2;
73        }
74        
75        int x_idx = temp_i%n; //x座標
76        int y_idx = temp_i/n; //y座標
77        
78        int p = ind[nx];
79        int q = ind[nx+1];
80        
81        if (x_idx == p) { 
82            if (y_idx == q){
83                z = 0;
84            } else if(y_idx == p) {
85                float xpp = x[n*n*nx+n*p+p]; // X[nx,p,p]
86                float xpq = x[n*n*nx+n*p+q]; // X[nx,p,q]
87                z = (xpp+xpq)/2 + (xpp-xpq)*cos2[nx/2]/2 - xpq*sin2[nx/2];
88            } else {
89                int k = x_idx; 
90                float xpk = x[n*n*nx+n*p+k]; // X[nx,p,k]
91                float xqk = x[n*n*nx+n*q+k]; // X[nx,q,k]
92                z = xpk*cos[nx/2] - xqk*sin[nx/2];
93            }
94        } else if (x_idx == q) {
95            if (y_idx == p) {
96                z = 0;
97            } else if (y_idx == q) { 
98                float xpp = x[n*n*nx+n*p+p]; // X[nx,p,p]
99                float xpq = x[n*n*nx+n*q+p]; // X[nx,p,q]
100                z = (xpp+xpq)/2 + (xpp-xpq)*cos2[nx/2]/2 + xpq*sin2[nx/2];
101            } else {
102                int k = x_idx; 
103                float xpk = x[n*n*nx+n*p+k]; // X[nx,p,k]
104                float xqk = x[n*n*nx+n*q+k]; // X[nx,q,k]
105                z = xpk*cos[nx/2] + xqk*sin[nx/2];
106            }
107        }
108        ''',
109        name='get_index_kernel_2d')
110
111get_index_kernel_2d(X, X.shape[1], index, sin, cos, sin2, cos2, Z)
112t2 = time.time()
113print("calcuration time:{}s".format(t2 - t1))
114
115# 表示のためにnumpyに変換
116Z_numpy = Z.get()
117#np.set_printoptions(formatter={'int': '{:02d}'.format})
118print(Z_numpy)
119