前提・実現したいこと
開始と終了レコードを結合するスクリプトの最適な方法が知りたいです。
みなさまのベストプラクティスを教えて頂ければ幸いです。
ライブラリなどの指定はありません。
よろしくお願いいたします。
追記
・短いコードで実現できる方法を探しています。
・データ1において試行1、試行2は問題なく動作しています。
・データ2を追加しました。
・試行3を追加しました。データ2の結合はうまくいきますがデータ1の結合がおかしくなります。
・試行4追加しました。シンプルなループです。
該当のソースコード
python
1import pandas as pd 2 3df1 = pd.DataFrame(data=[ 4 ['siro','ED','2020-01-01 00:35:00','ver0.1'] 5 ,['siro','ST','2020-01-01 00:35:00','ver0.1'] 6 ,['siro','ED','2020-01-01 10:40:00','ver0.1'] 7 ,['siro','ST','2020-01-30 23:00:00','ver0.1'] 8], columns=['name', 'status','dtime','ver']) 9 10df2 = pd.DataFrame( 11 data=[ 12 ["siro", "ED", "2020-01-01 00:34:00", "ver0.1"], 13 ["siro", "ST", "2020-01-01 00:35:00", "ver0.1"], 14 ["siro", "ST", "2020-01-01 00:35:00", "ver0.1"], 15 ["siro", "ED", "2020-01-01 10:40:00", "ver0.1"], 16 ["siro", "ED", "2020-01-30 23:00:00", "ver0.1"], 17 ], 18 columns=["name", "status", "dtime", "ver"], 19) 20# 結合イメージ(一対一の結合、ただし未結合も残す) 21# SELECT 22# *結合イメージ(一対一の結合、ただし未結合も残す) 23# FROM 24# df_st 25# INNER JOIN 26# df_ed 27# ON 28# df_st.name = df_ed.name AND 29# df_st.ver = df_ed.ver AND 30# df_st.index < df_ed.index 31 32# 最終出力イメージ 33# name ver status1 dtime1 status2 dtime2 34# 0 NaN NaN NaN NaN ED 2020-01-01 00:35:00 35# 1 siro ver0.1 ST 2020-01-01 00:35:00 ED 2020-01-01 10:40:00 36# 2 siro ver0.1 ST 2020-01-30 23:00:00 NaN NaN 37 38 **試行1** 39 40# 以前の質問させていただいた時のcan110様の回答を流用にさせていただいております 41last = {} 42def func(row): 43 idx = -1 44 namever1 = row["name"] + row["ver"] 45 namever2 = row.name 46 47 # print(namever1, namever2) 48 if row["status"] == "ED": 49 last[namever1] = namever2 50 else: 51 idx = last.pop(namever1, idx) 52 # 終了行に、対応する開始行の位置をセット 53 if idx >= 0: 54 df.loc[idx, "st"] = namever2 55 56 57# 逆順に走査して終了行の直近の開始行を決定 58df["st"] = -1 59df.iloc[::-1].apply(func, axis=1) 60 61# 開始、終了行同士を結合 62df = pd.merge( 63 df[df["status"] == "ST"], 64 df[df["status"] == "ED"], 65 left_index=True, 66 right_on="st", 67 how="outer", 68 suffixes=("1", "2"), 69) 70 71# # 開始行のない終了行に必要な情報をセットして元レコード順に並び替え 72df = df.reset_index() 73rows = df["st"] < 0 74df.loc[rows, "st"] = df.loc[rows, "index"] 75df.loc[rows, "name1"] = df.loc[rows, "name2"] 76df = df.sort_values(["st"]) 77 78# 必要な列のみ 79df = df.loc[:, ["name1", "status1", "ver1", "dtime1", "status2", "ver2", "dtime2"]] 80print(df) 81# 試行1結果: 82# name1 status1 ver1 dtime1 status2 ver2 dtime2 83#2 siro NaN NaN NaN ED ver0.1 2020-01-01 00:35:00 84#0 siro ST ver0.1 2020-01-01 00:35:00 ED ver0.1 2020-01-01 10:40:00 85#1 siro ST ver0.1 2020-01-30 23:00:00 NaN NaN NaN 86 87**試行2** 88 89def test(df): 90 df["n"] = df.groupby([df["name"], df["ver"]])["status"].cumcount() - ( 91 df["status"] == "ED" 92 ) 93 odf = pd.merge( 94 df[df["status"] == "ST"], 95 df[df["status"] == "ED"], 96 on=["name", "ver", "n"], 97 how="outer", 98 suffixes=("1", "2"), 99 ) 100 print(odf) 101 102test(df1) 103print("") 104test(df2) 105 106# 試行2 データ1結果:成功 107# name status1 dtime1 ver status2 dtime2 108# 0 siro ST 2020-01-01 00:35:00 ver0.1 ED 2020-01-01 10:40:00 109# 1 siro ST 2020-01-30 23:00:00 ver0.1 NaN NaN 110# 2 siro NaN NaN ver0.1 ED 2020-01-01 00:35:00 111 112# 試行2 データ2結果:失敗 113# name status1 dtime1 ver n status2 dtime2 114# 0 siro ST 2020-01-01 00:35:00 ver0.1 1 NaN NaN 115# 1 siro ST 2020-01-01 00:35:00 ver0.1 2 ED 2020-01-01 10:40:00 116# 2 siro NaN NaN ver0.1 -1 ED 2020-01-01 00:34:00 117# 3 siro NaN NaN ver0.1 3 ED 2020-01-30 23:00:00 118 119# 試行2データ2理想の結果 120# name status1 dtime1 ver status2 dtime2 121# 0 siro ST 2020-01-01 00:35:00 ver0.1 ED 2020-01-01 10:40:00 122# 1 siro ST 2020-01-01 00:35:00 ver0.1 ED 2020-01-30 23:00:00 123# 2 siro NaN NaN ver0.1 ED 2020-01-01 00:34:00 124 125# 試行3 試行2とは逆でデータ2はイメージ通りだがデータ1がうまくいかない 126 127def test3(df): 128 df["n"] = ( 129 df.groupby([df["name"], df["ver"]])["status"].cumcount() 130 - (df["status"] == "ED") 131 + (df["status"] == "ST") 132 ) 133 134 odf = pd.merge( 135 df[df["status"] == "ST"], 136 df[df["status"] == "ED"], 137 on=["name", "ver", "n"], 138 how="outer", 139 suffixes=("1", "2"), 140 ) 141 print(odf) 142 143 144print("\ntest3") 145 146test3(df1) 147print("") 148test3(df2)``` 149# 試行3 データ1結果:失敗 150# name status1 dtime1 ver n status2 dtime2 151# 0 siro ST 2020-01-01 00:35:00 ver0.1 2 NaN NaN 152# 1 siro ST 2020-01-30 23:00:00 ver0.1 4 NaN NaN 153# 2 siro NaN NaN ver0.1 -1 ED 2020-01-01 00:35:00 154# 3 siro NaN NaN ver0.1 1 ED 2020-01-01 10:40:00 155# 試行3 データ1理想の結果 156# name status1 dtime1 ver status2 dtime2 157# 0 siro ST 2020-01-01 00:35:00 ver0.1 ED 2020-01-01 10:40:00 158# 1 siro ST 2020-01-30 23:00:00 ver0.1 NaN NaN 159# 2 siro NaN NaN ver0.1 ED 2020-01-01 00:35:00 160 161 162# 試行2 データ2結果:成功 163 164# name status1 dtime1 ver n status2 dtime2 165# 0 siro ST 2020-01-01 00:35:00 ver0.1 2 ED 2020-01-01 10:40:00 166# 1 siro ST 2020-01-01 00:35:00 ver0.1 3 ED 2020-01-30 23:00:00 167# 2 siro NaN NaN ver0.1 -1 ED 2020-01-01 00:34:00 168 169# 試行4 ループさせてみた 170 171def test4(df): 172 dfs = df[df["status"] == "ST"] 173 df["p"] = False 174 175 cols = ["dtime1", "dtime2", "name", "ver"] 176 outdf = pd.DataFrame(index=[], columns=cols) 177 178 # start loop 179 for i, row1 in dfs.iterrows(): 180 dfe = df[i + 1 :] 181 dfe = dfe[dfe["status"] == "ED"] 182 # df.loc[i, "p"] = 183 184 # end loop 185 for j, row2 in dfe.iterrows(): 186 if row1["name"] == row2["name"] and row1["ver"] == row2["ver"]: 187 if df.loc[j, "p"] == True: 188 continue 189 addlist = [row1["dtime"], row2["dtime"], row1["name"], row1["ver"]] 190 record = pd.Series(addlist, index=outdf.columns) 191 outdf = outdf.append(record, ignore_index=True) 192 193 df.loc[i, "p"] = True 194 df.loc[j, "p"] = True 195 break 196 # print(df) 197 # 不一致結合 198 dfn = df[df["p"] == False] 199 alist = [] 200 201 for i, row in dfn.iterrows(): 202 addlist = None 203 record = None 204 addic = {} 205 if row["status"] == "ST": 206 addic["dtime1"] = row1["dtime"] 207 addic["dtime2"] = "" 208 addic["name"] = row1["name"] 209 addic["ver"] = row1["ver"] 210 alist.append(addic) 211 212 elif row["status"] == "ED": 213 addic["dtime1"] = "" 214 addic["dtime2"] = row["dtime"] 215 addic["name"] = row["name"] 216 addic["ver"] = row["ver"] 217 alist.append(addic) 218 # print(addic) 219 # print(df) 220 221 outdf = outdf.append(alist, ignore_index=True) 222 return outdf 223 224# 試行4データ1結果:成功 225rslt1 226 dtime1 dtime2 name ver 2270 2020-01-01 00:35:00 2020-01-01 10:40:00 siro ver0.1 2281 2020-01-01 00:35:00 siro ver0.1 2292 2020-01-30 23:00:00 siro ver0.1 230 231# 試行4データ1結果:成功 232 dtime1 dtime2 name ver 2330 2020-01-01 00:35:00 2020-01-01 10:40:00 siro ver0.1 2341 2020-01-01 00:35:00 2020-01-30 23:00:00 siro ver0.1 2352 2020-01-01 00:34:00 siro ver0.1
補足情報(FW/ツールのバージョンなど)
python 3.6以上