for ループを使わない方法を示すものではありませんので,あくまでご参考です。
質問へのコメントの「同じ for ループでも、ndarray に変えるだけでそこそこ改善するはずです。」には同意見だったので,実際に確かめてみました。
あくまで,下記のふたつのコードを「macOS(M1) 13.6.1, Python 3.11.6, numpy 1.26.1」(私の環境)で実行した場合の結果にはなりますが,処理時間は<オリジナル>が約 45 秒だったのに対して,<改善案>では約 10 秒でした。
なお,式の意味を理解しているわけではありませんが,<オリジナル>の24行目は `r` ではなく `q` ではないかと思い,変更しています。
<オリジナル>
Python
1 import pandas as pd
2 import numpy as np
3 from numpy import random
4 import math
5  # Baseline listing: fills df_scores cell by cell with a triple Python loop, reading every value through pandas column/index lookups.
6 N , M = 100 , 200  # N = number of rows, M = number of columns of `test`; `prec_` is M x M
7 rng = random . default_rng ( 103 )  # fixed seed, so both listings draw the same random inputs
8
9 prec_ = np . array ( rng . random ( M * M ) . reshape ( M , M ) )  # M x M matrix filled by rng.random
10 prec_diag_zero = prec_ - np . diag ( np . diag ( prec_ ) )  # prec_ minus its own diagonal, i.e. diagonal entries become zero
11
12 test = pd . DataFrame ( rng . random ( N * M ) . reshape ( N , M ) )  # N x M input data as a DataFrame
13 df_scores = pd . DataFrame ( np . empty_like ( test ) )  # placeholder with the shape of `test`; each row is assigned on line 30
14
15 for p in range ( test . shape [ 0 ] ) :  # p: row position in `test`
16 row = [ ]  # scores collected for row p, one per column q
17 for q in range ( test . shape [ 1 ] ) :  # q: column being scored
18 acc = 0  # running sum over r, reset for every (p, q)
19 for r in range ( test . shape [ 1 ] ) :  # r: column index summed over
20 if not math . isnan ( test [ test . columns [ r ] ] [ test . index [ p ] ] ) :  # NaN entries of row p contribute nothing
21 acc += ( prec_diag_zero [ q ] [ r ]  # weight taken from the zero-diagonal matrix, so r == q adds 0
22 * ( test [ test . columns [ r ] ] [ test . index [ p ] ]  # value at (row p, column r), fetched via pandas label lookups
23 - test [ test . columns [ r ] ] . mean ( ) ) )  # column-r mean is recomputed on every inner iteration
24 acc = ( test [ test . columns [ q ] ] [ test . index [ p ] ]  # acc is overwritten here; indexes column q (the author's note says this was changed from r)
25 - ( test [ test . columns [ q ] ] . mean ( )  # column-q mean
26 - ( 1 / prec_ [ q ] [ q ] ) * acc ) ) ** 2  # squared: value - (mean_q - sum / prec_[q][q])
27 score = ( - 0.5 * math . log ( prec_ [ q ] [ q ] / ( 2 * math . pi ) )  # -0.5 * log(prec_[q][q] / (2*pi))
28 + 0.5 * prec_ [ q ] [ q ] * acc )  # + 0.5 * prec_[q][q] * squared term
29 row . append ( score )
30 df_scores . iloc [ p ] = row  # store the finished row of scores at position p
31
32 print ( df_scores )  # the printed result is reproduced in the comment lines that follow
33 # 0 1 2 ... 197 198 199
34 # 0 15.967284 11.300599 10.289985 ... 9.177044 5.273374 3.993757
35 # 1 30.356535 25.223625 14.959857 ... 37.244809 19.037071 17.883623
36 # 2 20.163732 25.146996 8.948390 ... 7.535858 9.392445 26.143601
37 # 3 24.453619 1.481159 1.398596 ... 2.389492 10.123730 4.762429
38 # 4 17.849431 22.829211 3.040402 ... 1.325447 4.508241 7.728260
39 # .. ... ... ... ... ... ... ...
40 # 95 4.672619 1.595823 1.401191 ... 1.654326 1.691618 1.297010
41 # 96 7.553522 6.044351 2.154869 ... 2.220435 1.291333 3.721267
42 # 97 10.679974 26.306743 13.505873 ... 21.988895 8.507998 20.977496
43 # 98 5.983548 3.172999 3.922773 ... 1.883454 2.108250 2.091508
44 # 99 3.986986 1.896011 15.165940 ... 1.468343 2.619154 2.027559
45 #
46 # [100 rows x 200 columns]
<改善案>
Python
1 import pandas as pd
2 import numpy as np
3 from numpy import random
4 import math
5  # Improved listing: same triple loop and same formula as the baseline, but element reads go through an ndarray copy of `test` instead of pandas lookups.
6 N , M = 100 , 200  # N = number of rows, M = number of columns of `test`; `prec_` is M x M
7 rng = random . default_rng ( 103 )  # same fixed seed as the baseline listing
8
9 prec_ = np . array ( rng . random ( M * M ) . reshape ( M , M ) )  # M x M matrix filled by rng.random
10 prec_diag_zero = prec_ - np . diag ( np . diag ( prec_ ) )  # prec_ minus its own diagonal, i.e. diagonal entries become zero
11
12 test = pd . DataFrame ( rng . random ( N * M ) . reshape ( N , M ) )  # N x M input data as a DataFrame
13 df_scores = pd . DataFrame ( np . empty_like ( test ) )  # placeholder with the shape of `test`; each row is assigned on line 30
14
15 test_np = test . to_numpy ( )  # ndarray form of `test`; all reads inside the loops use this
16
17 for p in range ( test . shape [ 0 ] ) :  # p: row position
18 row = [ ]  # scores collected for row p, one per column q
19 for q in range ( test . shape [ 1 ] ) :  # q: column being scored
20 acc = 0  # running sum over r, reset for every (p, q)
21 for r in range ( test . shape [ 1 ] ) :  # r: column index summed over
22 if not np . isnan ( test_np [ p , r ] ) :  # NaN entries of row p contribute nothing
23 acc += ( prec_diag_zero [ q , r ]  # weight taken from the zero-diagonal matrix, so r == q adds 0
24 * ( test_np [ p , r ] - test_np [ : , r ] . mean ( ) ) )  # deviation from the column-r mean; the mean is still recomputed on every inner iteration
25 acc = ( test_np [ p , q ]  # acc is overwritten here; indexes column q
26 - ( test_np [ : , q ] . mean ( ) - ( 1 / prec_ [ q , q ] ) * acc ) ) ** 2  # squared: value - (mean_q - sum / prec_[q, q])
27 score = ( - 0.5 * math . log ( prec_ [ q , q ] / ( 2 * math . pi ) )  # -0.5 * log(prec_[q, q] / (2*pi))
28 + 0.5 * prec_ [ q , q ] * acc )  # + 0.5 * prec_[q, q] * squared term
29 row . append ( score )
30 df_scores . iloc [ p ] = row  # store the finished row of scores at position p
31
32 print ( df_scores )  # the printed result is reproduced in the comment lines that follow
33 # 0 1 2 ... 197 198 199
34 # 0 15.967284 11.300599 10.289985 ... 9.177044 5.273374 3.993757
35 # 1 30.356535 25.223625 14.959857 ... 37.244809 19.037071 17.883623
36 # 2 20.163732 25.146996 8.948390 ... 7.535858 9.392445 26.143601
37 # 3 24.453619 1.481159 1.398596 ... 2.389492 10.123730 4.762429
38 # 4 17.849431 22.829211 3.040402 ... 1.325447 4.508241 7.728260
39 # .. ... ... ... ... ... ... ...
40 # 95 4.672619 1.595823 1.401191 ... 1.654326 1.691618 1.297010
41 # 96 7.553522 6.044351 2.154869 ... 2.220435 1.291333 3.721267
42 # 97 10.679974 26.306743 13.505873 ... 21.988895 8.507998 20.977496
43 # 98 5.983548 3.172999 3.922773 ... 1.883454 2.108250 2.091508
44 # 99 3.986986 1.896011 15.165940 ... 1.468343 2.619154 2.027559
45 #
46 # [100 rows x 200 columns]