各列について数値の並びが昇順になるような行の並びを求める問題でしょうか。
基本的には総当たりで解を求める必要があります。
再帰を使えば枝刈りできそうですが、とりあえず素直に組んでみました。
Python
1import numpy as np
2
def search(ary):
    """Find a row order in which every column is non-decreasing (brute force).

    NaN cells are skipped when a column is checked, so only the non-NaN
    values of each column have to appear in ascending order.  Row
    permutations are tried in the order produced by
    ``itertools.permutations`` and the first one that satisfies every
    column is used.

    Args:
        ary: 2-D NumPy array of numbers; NaN marks a cell without a value.

    Returns:
        A new array holding the rows of ``ary`` in the order found, or
        ``None`` when no permutation satisfies every column.
    """
    import itertools

    row_cnt = ary.shape[0]
    col_cnt = ary.shape[1]

    def is_sorted(col, rows):
        """Return True if column ``col`` is non-decreasing in row order ``rows``."""
        # Start below every real number so that negative values are accepted
        # (starting from 0 would reject any column containing a negative).
        prev = -np.inf
        for r in rows:
            v = ary[r, col]
            if np.isnan(v):
                continue  # empty cell: puts no constraint on the order
            if prev > v:
                return False
            prev = v
        return True

    # Try every ordering of the rows; stop at the first one that works.
    for rows in itertools.permutations(range(row_cnt)):
        if all(is_sorted(col, rows) for col in range(col_cnt)):
            return ary[list(rows), :]

    # No permutation satisfies all columns.
    return None
32
33
import pandas as pd
from io import StringIO

# Sample table in CSV form; pandas reads an empty field as NaN.
f = StringIO(
    "c1,c2,c3,c4,c5\n"
    "1,2,,,\n"
    ",,,2,1\n"
    ",1,2,,\n"
    ",4,4,,\n"
    ",,1,1,\n"
    "2,3,,,\n"
    ",,,3,2"
)
ary = pd.read_csv(f).values  # DataFrame -> 2-D NumPy array
ret = search(ary)
print(ret)
# Output recorded for the sample above:
"""
[[nan nan 1. 1. nan]
 [nan nan nan 2. 1.]
 [nan 1. 2. nan nan]
 [ 1. 2. nan nan nan]
 [ 2. 3. nan nan nan]
 [nan 4. 4. nan nan]
 [nan nan nan 3. 2.]]
"""
55"""
再帰版
総当たりよりは速いですが、15行程度が限界ですね。
Python
1
2import numpy as np
3
def search(ary):
    """Find a row order in which every column is non-decreasing (backtracking).

    Rows are placed one at a time.  A row may be placed only if each of its
    non-NaN values is not smaller than the latest value already placed in
    the same column; otherwise that branch is abandoned immediately.

    Args:
        ary: 2-D NumPy array of numbers; NaN marks a cell without a value.

    Returns:
        A new array holding the rows of ``ary`` in the order found, or
        ``None`` when no ordering satisfies every column.
    """
    row_cnt = ary.shape[0]
    col_cnt = ary.shape[1]

    def is_match(row, mins):
        """Check ``row`` against the per-column lower bounds ``mins``.

        ``mins`` is updated in place with the row's values, so callers pass
        a copy they are free to discard when the row does not fit.
        """
        for col in range(col_cnt):
            v = ary[row, col]
            if np.isnan(v):
                continue  # empty cell: puts no constraint on the order
            if v < mins[col]:
                return False
            mins[col] = v  # later rows must not go below this value
        return True

    def search_row(rows, mins):
        """Extend the partial order ``rows``; return a full order or None."""
        if len(rows) == row_cnt:
            return rows

        used = set(rows)
        for row in range(row_cnt):
            if row in used:  # each row is placed exactly once
                continue
            next_mins = mins.copy()
            if is_match(row, next_mins):
                found = search_row(rows + [row], next_mins)
                if found is not None:
                    return found
        return None

    # Start from -inf so that negative values are accepted as well
    # (a lower bound of 0 would reject them).
    rows = search_row([], np.full(col_cnt, -np.inf))
    if rows is None:
        # Indexing with None would add a new axis instead of signalling
        # failure, so report "no solution" explicitly.
        return None
    return ary[rows, :]
39
40
import pandas as pd
from io import StringIO

# Larger sample table (15 rows); pandas reads an empty field as NaN.
f = StringIO(
    "c1,c2,c3,c4,c5\n"
    ",,7,8,\n"
    "1,2,,,\n"
    ",,6,6,\n"
    "4,7,,,\n"
    ",,,2,1\n"
    ",,,7,4\n"
    ",1,2,,\n"
    ",4,4,,\n"
    ",,1,1,\n"
    "2,3,,,\n"
    ",,,3,2\n"
    ",,3,4,\n"
    ",5,5,,\n"
    ",,,5,3\n"
    "3,6,,,"
)
ary = pd.read_csv(f).values  # DataFrame -> 2-D NumPy array

ret = search(ary)
print(ret)
# Output recorded for the sample above:
"""
[[nan nan 1. 1. nan]
 [nan nan nan 2. 1.]
 [nan 1. 2. nan nan]
 [ 1. 2. nan nan nan]
 [ 2. 3. nan nan nan]
 [nan nan nan 3. 2.]
 [nan nan 3. 4. nan]
 [nan 4. 4. nan nan]
 [nan 5. 5. nan nan]
 [nan nan nan 5. 3.]
 [nan nan 6. 6. nan]
 [nan nan nan 7. 4.]
 [nan nan 7. 8. nan]
 [ 3. 6. nan nan nan]
 [ 4. 7. nan nan nan]]
"""
挿入ソート版
題意から以下の処理でもよさそうです。
再帰版よりもはるかに速く処理できます。
Python
import numpy as np
import pprint

import pandas as pd
from io import StringIO

# Sample table in CSV form; pandas reads an empty field as NaN.
f = StringIO("""c1,c2,c3,c4,c5
,,7,8,
1,2,,,
,,6,6,
4,7,,,
,,,2,1
,,,7,4
6,9,,,
,1,2,,
,,,9,5
,4,4,,
,,1,1,
2,3,,,
5,8,,,
,,,3,2
,,3,4,
,5,5,,
,,,5,3
3,6,,,""")
ary = pd.read_csv(f).values.tolist()  # list of rows, each a list of floats

# Rows in their final order; built up one column at a time.
ret = []

# Process the columns from left to right.
col_cnt = len(ary[0])
for c in range(col_cnt):

    # Split the not-yet-placed rows in a single pass: rows that have a value
    # in column c are placed now, the others wait for a later column.
    picked = []
    remaining = []
    for r in ary:
        if np.isnan(r[c]):
            remaining.append(r)
        else:
            picked.append(r)
    ary = remaining

    # Rows are taken from the bottom of the remaining table upwards.
    rows = picked[::-1]

    # Insert each row in front of the first placed row whose value in
    # column c is larger.  Comparisons against NaN are False, so placed rows
    # without a value in this column are simply stepped over.
    for row in rows:
        for idx, ret_row in enumerate(ret):
            if row[c] < ret_row[c]:
                ret.insert(idx, row)
                break
        else:
            # No larger value found: the row goes to the end.
            ret.append(row)

pprint.pprint(ret)
# Output recorded for the sample above:
"""
[[nan, nan, 1.0, 1.0, nan],
 [nan, 1.0, 2.0, nan, nan],
 [1.0, 2.0, nan, nan, nan],
 [2.0, 3.0, nan, nan, nan],
 [nan, nan, nan, 2.0, 1.0],
 [nan, nan, nan, 3.0, 2.0],
 [nan, nan, 3.0, 4.0, nan],
 [nan, 4.0, 4.0, nan, nan],
 [nan, 5.0, 5.0, nan, nan],
 [3.0, 6.0, nan, nan, nan],
 [4.0, 7.0, nan, nan, nan],
 [5.0, 8.0, nan, nan, nan],
 [6.0, 9.0, nan, nan, nan],
 [nan, nan, nan, 5.0, 3.0],
 [nan, nan, 6.0, 6.0, nan],
 [nan, nan, nan, 7.0, 4.0],
 [nan, nan, 7.0, 8.0, nan],
 [nan, nan, nan, 9.0, 5.0]]
"""