回答編集履歴
5
test
CHANGED
@@ -11,14 +11,14 @@
|
|
11
11
|
2024/06/12 9:10:20,AAA,100,START
|
12
12
|
2024/06/12 9:15:20,AAA,200,START
|
13
13
|
2024/06/12 9:20:10,AAA,200,END
|
14
|
+
2024/06/12 10:00:00,AAA,,END
|
14
|
-
2024/06/12 1
|
15
|
+
2024/06/12 11:20:10,AAA,100,START
|
15
|
-
2024/06/12 11:20:10,AAA,100,END
|
16
16
|
2024/06/12 12:20:10,AAA,100,END
|
17
17
|
2024/06/12 13:10:00,AAA,100,START
|
18
18
|
2024/06/12 14:30:10,AAA,,END
|
19
19
|
2024/06/12 15:00:00,AAA,100,START
|
20
|
+
2024/06/12 15:30:00,AAA,,END
|
20
|
-
2024/06/12 1
|
21
|
+
2024/06/12 16:20:10,AAA,100,START
|
21
|
-
2024/06/12 16:20:10,AAA,100,END
|
22
22
|
2024/06/12 18:20:10,AAA,100,END
|
23
23
|
'''
|
24
24
|
|
@@ -27,30 +27,29 @@
|
|
27
27
|
print(df)
|
28
28
|
|
29
29
|
#
|
30
|
-
m = df['プロセス'].isna()
|
31
|
-
idx = df[m].index
|
32
30
|
df2 = pd.DataFrame()
|
31
|
+
for _, grp in df.groupby('ユーザ', as_index=False):
|
32
|
+
m = grp['プロセス'].isna()
|
33
|
+
idx = grp[m].index
|
33
|
-
for nth, g in
|
34
|
+
for nth, g in grp[~m].groupby(m.cumsum()):
|
34
|
-
# pivot
|
35
|
+
# pivot
|
35
|
-
key = g.groupby(['状態', '
|
36
|
+
key = g.groupby(['状態', 'プロセス']).cumcount()
|
36
|
-
dfx = g.pivot_table(index=[key, 'ユーザ', 'プロセス'], columns='状態', values='日時')
|
37
|
+
dfx = g.pivot_table(index=[key, 'ユーザ', 'プロセス'], columns='状態', values='日時').reset_index()
|
37
|
-
|
38
|
+
if 'END' not in dfx.columns: dfx['END'] = pd.NaT
|
38
|
-
# NULL値を補完(最後のブロックは除く)
|
39
|
+
# NULL値を補完(最後のブロックは除く)
|
39
|
-
if nth < len(idx):
|
40
|
+
if nth < len(idx):
|
40
|
-
|
41
|
+
dfx.loc[dfx['END'].isna(), 'END'] = grp.loc[idx[nth], '日時']
|
41
|
-
dfx.loc[cond, 'END'] = df.loc[idx[nth], '日時']
|
42
|
-
df2 = pd.concat([df2, dfx])
|
42
|
+
df2 = pd.concat([df2, dfx])
|
43
43
|
|
44
|
-
df2 = df2.reset_index(drop=True)
|
44
|
+
df2 = df2[['ユーザ', 'プロセス', 'START', 'END']].reset_index(drop=True)
|
45
45
|
print(df2)
|
46
46
|
```
|
47
|
-
|
48
47
|
| ユーザ | プロセス | START | END |
|
49
48
|
|:---------|-----------:|:--------------------|:--------------------|
|
50
|
-
| AAA | 100 | 2024-06-12 09:10:20 | 2024-06-12 1
|
49
|
+
| AAA | 100 | 2024-06-12 09:10:20 | 2024-06-12 10:00:00 |
|
51
50
|
| AAA | 200 | 2024-06-12 09:15:20 | 2024-06-12 09:20:10 |
|
52
|
-
| AAA | 100 | 2024-06-12 1
|
51
|
+
| AAA | 100 | 2024-06-12 11:20:10 | 2024-06-12 12:20:10 |
|
53
52
|
| AAA | 100 | 2024-06-12 13:10:00 | 2024-06-12 14:30:10 |
|
54
|
-
| AAA | 100 | 2024-06-12 15:00:00 | 2024-06-12 1
|
53
|
+
| AAA | 100 | 2024-06-12 15:00:00 | 2024-06-12 15:30:00 |
|
55
|
-
| AAA | 100 | 2024-06-12 1
|
54
|
+
| AAA | 100 | 2024-06-12 16:20:10 | 2024-06-12 18:20:10 |
|
56
55
|
|
4
test
CHANGED
@@ -37,7 +37,8 @@
|
|
37
37
|
.reset_index()[['ユーザ', 'プロセス', 'START', 'END']]
|
38
38
|
# NULL値を補完(最後のブロックは除く)
|
39
39
|
if nth < len(idx):
|
40
|
-
|
40
|
+
cond = dfx['END'].isna() & dfx['ユーザ'].eq(df.loc[idx[nth], 'ユーザ'])
|
41
|
+
dfx.loc[cond, 'END'] = df.loc[idx[nth], '日時']
|
41
42
|
df2 = pd.concat([df2, dfx])
|
42
43
|
|
43
44
|
df2 = df2.reset_index(drop=True)
|
3
test
CHANGED
@@ -30,7 +30,7 @@
|
|
30
30
|
m = df['プロセス'].isna()
|
31
31
|
idx = df[m].index
|
32
32
|
df2 = pd.DataFrame()
|
33
|
-
for nth,
|
33
|
+
for nth, g in df[~m].groupby(m.cumsum()):
|
34
34
|
# pivot
|
35
35
|
key = g.groupby(['状態', 'ユーザ', 'プロセス']).cumcount()
|
36
36
|
dfx = g.pivot_table(index=[key, 'ユーザ', 'プロセス'], columns='状態', values='日時')\
|
2
test
CHANGED
@@ -1,60 +1,55 @@
|
|
1
|
-
|
1
|
+
> 改めて以下のデータで確認したところ、期待通りに動作しませんでした。
|
2
|
+
|
3
|
+
当初、プロセスの値が NULL であるデータの時刻以降にある"END"状態のデータも組み合わせの対象にするのかと思っていたのですが、NULL値のデータが出現するまでの範囲内で組み合わせるとのことなので、以下の様に書き換えました。
|
4
|
+
|
2
5
|
```python
|
3
6
|
import pandas as pd
|
4
7
|
import io
|
5
8
|
|
6
9
|
csv_data = '''
|
7
10
|
日時,ユーザ,プロセス,状態
|
8
|
-
2024/06/12 9:10:20,AAA,
|
11
|
+
2024/06/12 9:10:20,AAA,100,START
|
12
|
+
2024/06/12 9:15:20,AAA,200,START
|
9
|
-
2024/06/12 9:20:10,AAA,
|
13
|
+
2024/06/12 9:20:10,AAA,200,END
|
10
14
|
2024/06/12 10:00:00,AAA,100,START
|
11
|
-
2024/06/12 10:10:00,AAA,200,START
|
12
|
-
2024/06/12 10:20:10,AAA,300,START
|
13
|
-
2024/06/12 1
|
15
|
+
2024/06/12 11:20:10,AAA,100,END
|
14
|
-
2024/06/12 12:00:10,AAA,100,START
|
15
|
-
2024/06/12 12:10:20,AAA,200,START
|
16
16
|
2024/06/12 12:20:10,AAA,100,END
|
17
|
+
2024/06/12 13:10:00,AAA,100,START
|
18
|
+
2024/06/12 14:30:10,AAA,,END
|
19
|
+
2024/06/12 15:00:00,AAA,100,START
|
20
|
+
2024/06/12 15:30:00,AAA,100,START
|
17
|
-
2024/06/12 12
|
21
|
+
2024/06/12 16:20:10,AAA,100,END
|
18
|
-
2024/06/12 16:10:20,BBB,500,START
|
19
|
-
2024/06/12 1
|
22
|
+
2024/06/12 18:20:10,AAA,100,END
|
20
|
-
2024/06/13 8:10:20,CCC,500,START
|
21
23
|
'''
|
22
24
|
|
23
25
|
df = pd.read_csv(io.StringIO(csv_data), dtype=str)
|
24
26
|
df['日時'] = pd.to_datetime(df['日時'])
|
25
27
|
print(df)
|
26
28
|
|
27
|
-
#
|
29
|
+
#
|
28
|
-
# START と END のペアになっていない(START だけしかない)行のインデックスを抽出
|
29
30
|
m = df['プロセス'].isna()
|
30
|
-
idx = [
|
31
|
+
idx = df[m].index
|
32
|
+
df2 = pd.DataFrame()
|
33
|
+
for nth, (_, g) in enumerate(df[~m].groupby(m.cumsum())):
|
34
|
+
# pivot
|
31
|
-
|
35
|
+
key = g.groupby(['状態', 'ユーザ', 'プロセス']).cumcount()
|
32
|
-
|
36
|
+
dfx = g.pivot_table(index=[key, 'ユーザ', 'プロセス'], columns='状態', values='日時')\
|
33
|
-
|
37
|
+
.reset_index()[['ユーザ', 'プロセス', 'START', 'END']]
|
38
|
+
# NULL値を補完(最後のブロックは除く)
|
39
|
+
if nth < len(idx):
|
40
|
+
dfx.loc[dfx['END'].isna(), 'END'] = df.loc[idx[nth], '日時']
|
34
|
-
|
41
|
+
df2 = pd.concat([df2, dfx])
|
35
42
|
|
36
|
-
# 「状態」が START だけしかない行に対応する END 行を元のデータフレームに追加
|
37
|
-
n = df[m].index.to_list()
|
38
|
-
grp = [[k for k in idx if i <= k <= j] for i, j in zip([0]+n, n)]
|
39
|
-
df2 = pd.concat([df[~m], *[df.loc[[i]].assign(プロセス = df.loc[k, 'プロセス'])
|
40
|
-
for i, j in zip(n, grp) for k in j]])
|
41
|
-
df2 = df2.sor
|
43
|
+
df2 = df2.reset_index(drop=True)
|
42
|
-
|
43
|
-
# pivot
|
44
|
-
df2['キー'] = df2.groupby(['状態', 'ユーザ', 'プロセス']).cumcount()
|
45
|
-
df2 = df2.pivot_table(index=['キー', 'ユーザ', 'プロセス'], columns='状態', values='日時').reset_index()
|
46
|
-
df2 = df2[['ユーザ', 'プロセス', 'START', 'END']]
|
47
44
|
print(df2)
|
48
45
|
```
|
49
46
|
|
50
47
|
| ユーザ | プロセス | START | END |
|
51
48
|
|:---------|-----------:|:--------------------|:--------------------|
|
49
|
+
| AAA | 100 | 2024-06-12 09:10:20 | 2024-06-12 11:20:10 |
|
50
|
+
| AAA | 200 | 2024-06-12 09:15:20 | 2024-06-12 09:20:10 |
|
52
|
-
| AAA | 100 | 2024-06-12 10:00:00 | 2024-06-12 1
|
51
|
+
| AAA | 100 | 2024-06-12 10:00:00 | 2024-06-12 12:20:10 |
|
53
|
-
| AAA |
|
52
|
+
| AAA | 100 | 2024-06-12 13:10:00 | 2024-06-12 14:30:10 |
|
54
|
-
| AAA | 300 | 2024-06-12 10:20:10 | 2024-06-12 10:30:10 |
|
55
|
-
| AAA | 50 | 2024-06-12 09:10:20 | 2024-06-12 09:20:10 |
|
56
|
-
| BBB | 500 | 2024-06-12 16:10:20 | 2024-06-12 16:20:10 |
|
57
|
-
| CCC | 500 | 2024-06-13 08:10:20 | NaT |
|
58
|
-
| AAA | 100 | 2024-06-12 1
|
53
|
+
| AAA | 100 | 2024-06-12 15:00:00 | 2024-06-12 16:20:10 |
|
59
|
-
| AAA |
|
54
|
+
| AAA | 100 | 2024-06-12 15:30:00 | 2024-06-12 18:20:10 |
|
60
55
|
|
1
test
CHANGED
@@ -36,8 +36,8 @@
|
|
36
36
|
# 「状態」が START だけしかない行に対応する END 行を元のデータフレームに追加
|
37
37
|
n = df[m].index.to_list()
|
38
38
|
grp = [[k for k in idx if i <= k <= j] for i, j in zip([0]+n, n)]
|
39
|
-
df2 = pd.concat([df[~m], *
|
39
|
+
df2 = pd.concat([df[~m], *[df.loc[[i]].assign(プロセス = df.loc[k, 'プロセス'])
|
40
|
-
|
40
|
+
for i, j in zip(n, grp) for k in j]])
|
41
41
|
df2 = df2.sort_values('日時')
|
42
42
|
|
43
43
|
# pivot
|