質問編集履歴

2

誤字の修正と実行情報の追加

2022/03/24 22:19

投稿

gejisaki
gejisaki

スコア35

test CHANGED
File without changes
test CHANGED
@@ -23,7 +23,8 @@
23
23
  <円周率計算プログラムの情報>
24
24
 
25
25
  実行コマンド:
26
- $mpirun --hostfile host64 -n 14 python3 pi_chudnovsky.py 12 : --hostfile host32 -n 2 python3 pi_chudnovsky.py 12
26
+ $mpirun --hostfile host64 -n 14 python3 pi_chudnovsky.py 13 : --hostfile host32 -n 2 python3 pi_chudnovsky.py 13
27
+ ↑ちなみに、13のところを12以下にするとエラーなく実行できます。
27
28
 
28
29
  pi_chudnovsky.pyについて:
29
30
  https://github.com/kemusiro/openmpi-sample/からダウンロード

1

ソースコード及びエラーを本文に記載、プログラム名等の微修正

2022/03/24 22:16

投稿

gejisaki
gejisaki

スコア35

test CHANGED
File without changes
test CHANGED
@@ -6,6 +6,7 @@
6
6
  にて、いわゆるハローワールドが動作するところまでは確認できました。
7
7
 
8
8
  次に円周率を計算しようと思ったのですが、どうもcomm.sendとcomm.recvにおけるnp.array(dtype=object)のやりとりに起因してエラーが発生しているようです。
9
+ 各ノードの計算結果は、comm.sendとcomm.recvによる1対1通信を用いた2分木状の転送方式でマージしており、broadcastやreduceは使っていません。
9
10
 
10
11
  ちなみに、hostlistに64bitPCのみ、または32bitPCのみを指定した場合は円周率がうまく計算できるので、
11
12
  32bitノードと64bitノードがデータをやりとりする時に問題が起こっている可能性が高いと思います。
@@ -16,3 +17,206 @@
16
17
 
17
18
  ご教示いただけますと幸いです。
18
19
  よろしくお願いいたします。
20
+
21
+
22
+
23
+ <円周率計算プログラムの情報>
24
+
25
+ 実行コマンド:
26
+ $mpirun --hostfile host64 -n 14 python3 pi_chudnovsky.py 12 : --hostfile host32 -n 2 python3 pi_chudnovsky.py 12
27
+
28
+ pi_chudnovsky.pyについて:
29
+ https://github.com/kemusiro/openmpi-sample/からダウンロード
30
+ ライセンス
31
+ MIT License
32
+
33
+ Copyright (c) 2021 Kenichi Miyata
34
+
35
+ Permission is hereby granted, free of charge, to any person obtaining a copy
36
+ of this software and associated documentation files (the "Software"), to deal
37
+ in the Software without restriction, including without limitation the rights
38
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
39
+ copies of the Software, and to permit persons to whom the Software is
40
+ furnished to do so, subject to the following conditions:
41
+
42
+ The above copyright notice and this permission notice shall be included in all
43
+ copies or substantial portions of the Software.
44
+
45
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
48
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
49
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
50
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
51
+ SOFTWARE.
52
+
53
+ コード本文
54
+
55
+ ```python 3.x
56
+ from mpi4py import MPI
57
+ import gmpy2 as mp
58
+ from gmpy2 import mpz
59
+ import numpy as np
60
+ import mmap
61
+ import sys
62
+ import os
63
+
64
+ # 円周率を並列に計算する。
65
+ def calc_PQT_root(n):
66
+ # まず自分の割り当て分を計算する。
67
+ alloc = int(n / size)
68
+ PQT2 = calc_PQT_local(rank * alloc, (rank + 1) * alloc)
69
+
70
+ # 各ノードの計算結果をマージする。
71
+ level = int(mp.ceil(mp.log2(size)))
72
+ k = 1
73
+ for _ in range(level):
74
+ if (rank & k) == k:
75
+ PQT1 = comm.recv(source=rank-k, tag=0)
76
+ PQT2 = np.array([PQT1[0] * PQT2[0],
77
+ PQT1[1] * PQT2[1],
78
+ PQT1[2] * PQT2[1] + PQT1[0] * PQT2[2]])
79
+ k *= 2
80
+ else:
81
+ comm.send(PQT2, dest=rank+k, tag=0)
82
+ break
83
+ return PQT2
84
+
85
+ # ノード内でP(n1, n2), Q(n1, n2), T(n1, n2)を計算する。
86
+ def calc_PQT_local(n1, n2):
87
+ if n1 + 1 == n2:
88
+ P = mpz((-1) * (2 * n2 - 1) * (6 * n2 - 5) * (6 * n2 - 1))
89
+ return np.array([P, C3over24 * n2 ** 3, (A + B * n2) * P])
90
+ else:
91
+ m = int((n1 + n2) / 2)
92
+ PQT1 = calc_PQT_local(n1, m)
93
+ PQT2 = calc_PQT_local(m, n2)
94
+ return np.array([PQT1[0] * PQT2[0],
95
+ PQT1[1] * PQT2[1],
96
+ PQT1[2] * PQT2[1] + PQT1[0] * PQT2[2]])
97
+
98
+ # 円周率が何桁まで一致するかを判定する。
99
+ def check_pi(outfile, pifile):
100
+ with open(pifile, 'rb') as f0, \
101
+ open(outfile, 'rb') as f1:
102
+ # 2つのファイルをメモリにマップして比較する。
103
+ with mmap.mmap(f0.fileno(), 0, flags=mmap.MAP_PRIVATE) as mm0, \
104
+ mmap.mmap(f1.fileno(), 0, flags=mmap.MAP_PRIVATE) as mm1:
105
+ index = mpz(0)
106
+ length = 1024 * 1024 # 一度にチェックする桁数
107
+ found = False
108
+ while not found:
109
+ ans = mm0.read(length)
110
+ calc = mm1.read(length)
111
+ if ans != calc:
112
+ maxlen = min(len(ans), len(calc))
113
+ for i in range(maxlen):
114
+ if ans[i] != calc[i]:
115
+ n = index + i - 2
116
+ found = True
117
+ break
118
+ else:
119
+ n = index + maxlen - 3
120
+ break
121
+ else:
122
+ index += length
123
+ return n
124
+
125
+ if __name__ == '__main__':
126
+ comm = MPI.COMM_WORLD
127
+ rank = comm.Get_rank()
128
+ size = comm.Get_size()
129
+
130
+ if len(sys.argv) < 2:
131
+ if rank == size - 1:
132
+ print('python3 pi_chudnovsky.py <power> [pifile]')
133
+ sys.exit(0)
134
+
135
+ if not sys.argv[1].isdecimal():
136
+ if rank == size - 1:
137
+ print('not decimal value: {}'.format(sys.argv[1]))
138
+ print('python3 pi_chudnovsky.py power [pifile]')
139
+ sys.exit(0)
140
+ power = int(sys.argv[1])
141
+
142
+ if len(sys.argv) == 2:
143
+ pifile = '/share/common/pi-10oku.txt'
144
+ else:
145
+ pifile = sys.argv[2]
146
+ if not os.path.exists(pifile):
147
+ if rank == size - 1:
148
+ print('{} does not exist'.format(pifile))
149
+ print('python3 pi_chudnovsky.py <power> [pifile]')
150
+ sys.exit(0)
151
+ outfile = 'pi.txt'
152
+
153
+ A = mpz(13591409)
154
+ B = mpz(545140134)
155
+ C = mpz(640320)
156
+ C3over24 = mpz(C**3 / 24)
157
+
158
+ n = 2 ** power
159
+ digits = n * 14
160
+ # gmpy2の浮動小数の精度として、求められる円周率の桁数を設定する。
161
+ mp.get_context().precision = int(digits * mp.log2(10))
162
+
163
+ if rank == size - 1:
164
+ print('calculating...')
165
+ start = MPI.Wtime()
166
+ PQT = calc_PQT_root(n)
167
+ if rank == size - 1:
168
+ temp1 = C * mp.sqrt(C) * PQT[1]
169
+ temp2 = 12 * (PQT[2] + A * PQT[1])
170
+ pi = temp1 / temp2
171
+ end = MPI.Wtime()
172
+ with open(outfile, 'w') as f:
173
+ f.write(str(pi))
174
+ print('checking...')
175
+ n = check_pi(outfile, pifile)
176
+ print('time = {:.2f} sec.'.format(end - start))
177
+ print(f'match = {n}')
178
+ ```
179
+
180
+ 出力:
181
+ … 途中省略 …
182
+ calculating...
183
+ [debian10:08494] *** Process received signal ***
184
+ [debian10:08494] Signal: Segmentation fault (11)
185
+ [debian10:08494] Signal code: (128)
186
+ [debian10:08494] Failing at address: (nil)
187
+ [debian10:08494] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0x14140)[0x7fc4662bc140]
188
+ [debian10:08494] [ 1] /usr/local/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send_request_put+0x17)[0x7fc45f6f6a87]
189
+ [debian10:08494] [ 2] /usr/local/lib/openmpi/mca_btl_tcp.so(+0x9087)[0x7fc4640b5087]
190
+ [debian10:08494] [ 3] /usr/local/lib/libopen-pal.so.40(opal_libevent2022_event_base_loop+0x697)[0x7fc4657c6247]
191
+ [debian10:08494] [ 4] /usr/local/lib/libopen-pal.so.40(+0x37256)[0x7fc46577f256]
192
+ [debian10:08494] [ 5] /usr/local/lib/libopen-pal.so.40(opal_progress+0x84)[0x7fc46577f3a4]
193
+ [debian10:08494] [ 6] /usr/local/lib/libopen-pal.so.40(ompi_sync_wait_mt+0xb5)[0x7fc4657859b5]
194
+ [debian10:08494] [ 7] /usr/local/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send+0x1292)[0x7fc45f6e6b82]
195
+ [debian10:08494] [ 8] /usr/local/lib/libmpi.so.40(PMPI_Send+0x11b)[0x7fc46599372b]
196
+ [debian10:08494] [ 9] /usr/lib/python3/dist-packages/mpi4py/MPI.cpython-39-x86_64-linux-gnu.so(+0xe45b8)[0x7fc465b155b8]
197
+ [debian10:08494] [10] python3[0x53f350]
198
+ [debian10:08494] [11] python3(_PyObject_MakeTpCall+0x39b)[0x51d89b]
199
+ [debian10:08494] [12] python3(_PyEval_EvalFrameDefault+0x5f7f)[0x517a0f]
200
+ [debian10:08494] [13] python3(_PyFunction_Vectorcall+0x1a3)[0x528b63]
201
+ [debian10:08494] [14] python3(_PyEval_EvalFrameDefault+0x525)[0x511fb5]
202
+ [debian10:08494] [15] python3[0x5106ed]
203
+ [debian10:08494] [16] python3(_PyEval_EvalCodeWithName+0x47)[0x510497]
204
+ [debian10:08494] [17] python3(PyEval_EvalCode+0x23)[0x5f5be3]
205
+ [debian10:08494] [18] python3[0x619de7]
206
+ [debian10:08494] [19] python3[0x615610]
207
+ [debian10:08494] [20] python3[0x619d79]
208
+ [debian10:08494] [21] python3(PyRun_SimpleFileExFlags+0x196)[0x619816]
209
+ [debian10:08494] [22] python3(Py_RunMain+0x2b3)[0x60d4e3]
210
+ [debian10:08494] [23] python3(Py_BytesMain+0x29)[0x5ea6e9]
211
+ [debian10:08494] [24] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xea)[0x7fc465f6cd0a]
212
+ [debian10:08494] [25] python3(_start+0x2a)[0x5ea5ea]
213
+ [debian10:08494] *** End of error message ***
214
+ --------------------------------------------------------------------------
215
+ Primary job terminated normally, but 1 process returned
216
+ a non-zero exit code. Per user-direction, the job has been aborted.
217
+ --------------------------------------------------------------------------
218
+ --------------------------------------------------------------------------
219
+ mpirun noticed that process rank 7 with PID 8494 on node 192.168.10.128 exited on signal 11 (Segmentation fault).
220
+ --------------------------------------------------------------------------
221
+
222
+