質問編集履歴

5

2021/07/30 01:13

投稿

esklia
esklia

スコア81

test CHANGED
File without changes
test CHANGED
@@ -66,6 +66,8 @@
66
66
 
67
67
  ```
68
68
 
69
+ 全コード
70
+
69
71
  # main module
70
72
 
71
73
  import matplotlib.pyplot as plt

4

2021/07/30 01:13

投稿

esklia
esklia

スコア81

test CHANGED
File without changes
test CHANGED
@@ -14,13 +14,175 @@
14
14
 
15
15
 
16
16
 
17
- その時の学習実行コードは以下の通りです。
17
+ その時の学習実行コードは以下の通りです。(学習実行コードは、下の「追記事項」に続くコードブロックの末尾へ移動しました。)
18
18
 
19
19
 
20
20
 
21
21
  この大量のメモリ消費の原因と、解決策をご教示いただけませんか。
22
22
 
23
+ 追記事項
24
+
25
+ バージョン
26
+
23
- ```
27
+ ```
28
+
29
+ Python 3.8.10
30
+
31
+ pandas 1.2.5
32
+
33
+ lightgbm 3.1.1
34
+
35
+ notebook 6.4.0 py38haa95532_0
36
+
37
+ numba 0.53.1 py38hf11a4ad_0
38
+
39
+ numpy 1.20.2 py38ha4e8547_0
40
+
41
+ numpy-base 1.20.2 py38hc2deb75_0
42
+
43
+ ```
44
+
45
+ 生のトレインデータ
46
+
47
+ ![イメージ説明](ca65e32190e3aec94e43cf4c6fa77a3f.png)
48
+
49
+
50
+
51
+ 加工後のトレインデータ
52
+
53
+ ![イメージ説明](6f629b17b2aeff9cc3e301e668dc2907.png)
54
+
55
+
56
+
57
+ それぞれva_period = 1で実行時の
58
+
59
+ 訓練データ、検証データ
60
+
61
+ 訓練データのy、検証データのyの大きさ
62
+
63
+ (254302, 10) (254302, 10)
64
+
65
+ (254302, 1) (254302, 1)
66
+
67
+ ```
68
+
69
+ # main module
70
+
71
+ import matplotlib.pyplot as plt
72
+
73
+ import seaborn as sns
74
+
75
+ import pandas as pd
76
+
77
+ import pandas_profiling as pdp
78
+
79
+ import numpy as np
80
+
81
+
82
+
83
+ import lightgbm as lgb
84
+
85
+ from sklearn.metrics import log_loss
86
+
87
+
88
+
89
+ import datetime
90
+
91
+ import logging
92
+
93
+ import sys, os
94
+
95
+ sys.path.append('../src/')
96
+
97
+
98
+
99
+ import eda
100
+
101
+ import maprepro as mpre
102
+
103
+ # import config
104
+
105
+ # from utils import setup_logger, ModelFactory
106
+
107
+ path = '../../input'
108
+
109
+ sample = pd.read_csv(f'{path}/sample_submission.csv')
110
+
111
+ store = pd.read_csv(f'{path}/store.csv')
112
+
113
+ test = pd.read_csv(f'{path}/test.csv')
114
+
115
+ train = pd.read_csv(f'{path}/train.csv')
116
+
117
+
118
+
119
+ def mk_ymd(df):
120
+
121
+ df['year'] = df.Date.apply(lambda x: x.split('-')[0]).astype(np.int16)
122
+
123
+ df['month'] = df.Date.apply(lambda x: x.split('-')[1]).astype(np.int16)
124
+
125
+ df['day'] = df.Date.apply(lambda x: x.split('-')[2]).astype(np.int16)
126
+
127
+ df = df.sort_values('Date')
128
+
129
+ return df
130
+
131
+ train = mk_ymd(train)
132
+
133
+ test = mk_ymd(test)
134
+
135
+
136
+
137
+
138
+
139
+ # 時系列データであり、時間に沿って変数periodを設定したとする
140
+
141
+ train['period'] = np.arange(0, len(train)) // (len(train) // 4)
142
+
143
+ train['period'] = np.clip(train['period'], 0, 3)
144
+
145
+ test['period'] = 4
146
+
147
+
148
+
149
+ train['StateHoliday'] = train.StateHoliday.astype('category')
150
+
151
+
152
+
153
+ target = ['Sales']
154
+
155
+ notuse = ['Id','Date','Open']
156
+
157
+ use = ['Store','DayOfWeek','Open','Promo','StateHoliday','SchoolHoliday','year','month','day','period']
158
+
159
+
160
+
161
+ train_y = train[target]
162
+
163
+ train_x = train[use]
164
+
165
+ test_x = test[use]
166
+
167
+
168
+
169
+
170
+
171
+ import warnings
172
+
173
+ warnings.simplefilter('ignore')
174
+
175
+ train_x = eda.reduce_mem_usage(train_x)
176
+
177
+ >>>start size(BEFORE): 57.24 Mb
178
+
179
+ >>>Mem. usage decreased to 19.40 Mb (AFTER:66.1% reduction)
180
+
181
+
182
+
183
+ import gc
184
+
185
+ gc.collect()
24
186
 
25
187
  va_period_list = [1, 2, 3]
26
188
 
@@ -34,6 +196,10 @@
34
196
 
35
197
  tr_y, va_y = train_y[is_tr], train_y[is_va]
36
198
 
199
+ print(tr_x.shape, va_x.shape)
200
+
201
+ print(tr_y.shape, va_y.shape)
202
+
37
203
 
38
204
 
39
205
  lgb_train = lgb.Dataset(tr_x, tr_y)
@@ -48,7 +214,7 @@
48
214
 
49
215
  'seed': 71,
50
216
 
51
- 'verbose': 0,
217
+ 'verbose': 1,
52
218
 
53
219
  'metrics': 'binary_logloss',
54
220
 
@@ -56,7 +222,7 @@
56
222
 
57
223
  }
58
224
 
59
- num_round = 100
225
+ num_round = 10
60
226
 
61
227
 
62
228
 
@@ -74,8 +240,6 @@
74
240
 
75
241
  valid_names=['train', 'valid'], valid_sets=[lgb_train, lgb_eval],
76
242
 
77
-
78
-
79
243
  )
80
244
 
81
245
 
@@ -94,242 +258,6 @@
94
258
 
95
259
  pred = model.predict(test_x)
96
260
 
261
+
262
+
97
- ```
263
+ ```
98
-
99
-
100
-
101
- 追記事項
102
-
103
- バージョン
104
-
105
- ```
106
-
107
- Python 3.8.10
108
-
109
- pandas 1.2.5
110
-
111
- lightgbm 3.1.1
112
-
113
- notebook 6.4.0 py38haa95532_0
114
-
115
- numba 0.53.1 py38hf11a4ad_0
116
-
117
- numpy 1.20.2 py38ha4e8547_0
118
-
119
- numpy-base 1.20.2 py38hc2deb75_0
120
-
121
- ```
122
-
123
- 生のトレインデータ
124
-
125
- ![イメージ説明](ca65e32190e3aec94e43cf4c6fa77a3f.png)
126
-
127
-
128
-
129
- それぞれva_period = 1で実行時の
130
-
131
- 訓練データ、検証データ
132
-
133
- 訓練データのy、検証データのyの大きさ
134
-
135
- (254302, 10) (254302, 10)
136
-
137
- (254302, 1) (254302, 1)
138
-
139
- ```
140
-
141
- # main module
142
-
143
- import matplotlib.pyplot as plt
144
-
145
- import seaborn as sns
146
-
147
- import pandas as pd
148
-
149
- import pandas_profiling as pdp
150
-
151
- import numpy as np
152
-
153
-
154
-
155
- import lightgbm as lgb
156
-
157
- from sklearn.metrics import log_loss
158
-
159
-
160
-
161
- import datetime
162
-
163
- import logging
164
-
165
- import sys, os
166
-
167
- sys.path.append('../src/')
168
-
169
-
170
-
171
- import eda
172
-
173
- import maprepro as mpre
174
-
175
- # import config
176
-
177
- # from utils import setup_logger, ModelFactory
178
-
179
- path = '../../input'
180
-
181
- sample = pd.read_csv(f'{path}/sample_submission.csv')
182
-
183
- store = pd.read_csv(f'{path}/store.csv')
184
-
185
- test = pd.read_csv(f'{path}/test.csv')
186
-
187
- train = pd.read_csv(f'{path}/train.csv')
188
-
189
-
190
-
191
- def mk_ymd(df):
192
-
193
- df['year'] = df.Date.apply(lambda x: x.split('-')[0]).astype(np.int16)
194
-
195
- df['month'] = df.Date.apply(lambda x: x.split('-')[1]).astype(np.int16)
196
-
197
- df['day'] = df.Date.apply(lambda x: x.split('-')[2]).astype(np.int16)
198
-
199
- df = df.sort_values('Date')
200
-
201
- return df
202
-
203
- train = mk_ymd(train)
204
-
205
- test = mk_ymd(test)
206
-
207
-
208
-
209
-
210
-
211
- # 時系列データであり、時間に沿って変数periodを設定したとする
212
-
213
- train['period'] = np.arange(0, len(train)) // (len(train) // 4)
214
-
215
- train['period'] = np.clip(train['period'], 0, 3)
216
-
217
- test['period'] = 4
218
-
219
-
220
-
221
- train['StateHoliday'] = train.StateHoliday.astype('category')
222
-
223
-
224
-
225
- target = ['Sales']
226
-
227
- notuse = ['Id','Date','Open']
228
-
229
- use = ['Store','DayOfWeek','Open','Promo','StateHoliday','SchoolHoliday','year','month','day','period']
230
-
231
-
232
-
233
- train_y = train[target]
234
-
235
- train_x = train[use]
236
-
237
- test_x = test[use]
238
-
239
-
240
-
241
-
242
-
243
- import warnings
244
-
245
- warnings.simplefilter('ignore')
246
-
247
- train_x = eda.reduce_mem_usage(train_x)
248
-
249
- >>>start size(BEFORE): 57.24 Mb
250
-
251
- >>>Mem. usage decreased to 19.40 Mb (AFTER:66.1% reduction)
252
-
253
-
254
-
255
- import gc
256
-
257
- gc.collect()
258
-
259
- va_period_list = [1, 2, 3]
260
-
261
- for va_period in va_period_list:
262
-
263
- is_tr = train_x['period'] < va_period
264
-
265
- is_va = train_x['period'] == va_period
266
-
267
- tr_x, va_x = train_x[is_tr], train_x[is_va]
268
-
269
- tr_y, va_y = train_y[is_tr], train_y[is_va]
270
-
271
- print(tr_x.shape, va_x.shape)
272
-
273
- print(tr_y.shape, va_y.shape)
274
-
275
-
276
-
277
- lgb_train = lgb.Dataset(tr_x, tr_y)
278
-
279
- lgb_eval = lgb.Dataset(va_x, va_y)
280
-
281
-
282
-
283
- # ハイパーパラメータの設定
284
-
285
- params = {'objective': 'binary',
286
-
287
- 'seed': 71,
288
-
289
- 'verbose': 1,
290
-
291
- 'metrics': 'binary_logloss',
292
-
293
- 'force_col_wise':'true' # メモリが足りないから
294
-
295
- }
296
-
297
- num_round = 10
298
-
299
-
300
-
301
- # 学習の実行
302
-
303
- # カテゴリ変数をパラメータで指定している
304
-
305
- # バリデーションデータもモデルに渡し、学習の進行とともにスコアがどう変わるかモニタリングする
306
-
307
- categorical_features = ['StateHoliday']
308
-
309
- model = lgb.train(params, lgb_train, num_boost_round=num_round,
310
-
311
- categorical_feature=categorical_features,
312
-
313
- valid_names=['train', 'valid'], valid_sets=[lgb_train, lgb_eval],
314
-
315
- )
316
-
317
-
318
-
319
- # バリデーションデータでのスコアの確認
320
-
321
- va_pred = model.predict(va_x)
322
-
323
- score = log_loss(va_y, va_pred)
324
-
325
- print(f'logloss: {score:.4f}')
326
-
327
-
328
-
329
- # 予測
330
-
331
- pred = model.predict(test_x)
332
-
333
-
334
-
335
- ```

3

2021/07/30 01:09

投稿

esklia
esklia

スコア81

test CHANGED
File without changes
test CHANGED
@@ -126,9 +126,15 @@
126
126
 
127
127
 
128
128
 
129
-
129
+ それぞれva_period = 1で実行時の
130
+
130
-
131
+ 訓練データ、検証データ
132
+
131
-
133
+ 訓練データのy、検証データのyの大きさ
134
+
135
+ (254302, 10) (254302, 10)
136
+
137
+ (254302, 1) (254302, 1)
132
138
 
133
139
  ```
134
140
 

2

2021/07/30 01:07

投稿

esklia
esklia

スコア81

test CHANGED
File without changes
test CHANGED
@@ -120,6 +120,14 @@
120
120
 
121
121
  ```
122
122
 
123
+ 生のトレインデータ
124
+
125
+ ![イメージ説明](ca65e32190e3aec94e43cf4c6fa77a3f.png)
126
+
127
+
128
+
129
+
130
+
123
131
 
124
132
 
125
133
  ```
@@ -318,12 +326,4 @@
318
326
 
319
327
 
320
328
 
321
-
322
-
323
-
324
-
325
-
326
-
327
-
328
-
329
- ```
329
+ ```

1

s

2021/07/30 01:04

投稿

esklia
esklia

スコア81

test CHANGED
File without changes
test CHANGED
@@ -95,3 +95,235 @@
95
95
  pred = model.predict(test_x)
96
96
 
97
97
  ```
98
+
99
+
100
+
101
+ 追記事項
102
+
103
+ バージョン
104
+
105
+ ```
106
+
107
+ Python 3.8.10
108
+
109
+ pandas 1.2.5
110
+
111
+ lightgbm 3.1.1
112
+
113
+ notebook 6.4.0 py38haa95532_0
114
+
115
+ numba 0.53.1 py38hf11a4ad_0
116
+
117
+ numpy 1.20.2 py38ha4e8547_0
118
+
119
+ numpy-base 1.20.2 py38hc2deb75_0
120
+
121
+ ```
122
+
123
+
124
+
125
+ ```
126
+
127
+ # main module
128
+
129
+ import matplotlib.pyplot as plt
130
+
131
+ import seaborn as sns
132
+
133
+ import pandas as pd
134
+
135
+ import pandas_profiling as pdp
136
+
137
+ import numpy as np
138
+
139
+
140
+
141
+ import lightgbm as lgb
142
+
143
+ from sklearn.metrics import log_loss
144
+
145
+
146
+
147
+ import datetime
148
+
149
+ import logging
150
+
151
+ import sys, os
152
+
153
+ sys.path.append('../src/')
154
+
155
+
156
+
157
+ import eda
158
+
159
+ import maprepro as mpre
160
+
161
+ # import config
162
+
163
+ # from utils import setup_logger, ModelFactory
164
+
165
+ path = '../../input'
166
+
167
+ sample = pd.read_csv(f'{path}/sample_submission.csv')
168
+
169
+ store = pd.read_csv(f'{path}/store.csv')
170
+
171
+ test = pd.read_csv(f'{path}/test.csv')
172
+
173
+ train = pd.read_csv(f'{path}/train.csv')
174
+
175
+
176
+
177
+ def mk_ymd(df):
178
+
179
+ df['year'] = df.Date.apply(lambda x: x.split('-')[0]).astype(np.int16)
180
+
181
+ df['month'] = df.Date.apply(lambda x: x.split('-')[1]).astype(np.int16)
182
+
183
+ df['day'] = df.Date.apply(lambda x: x.split('-')[2]).astype(np.int16)
184
+
185
+ df = df.sort_values('Date')
186
+
187
+ return df
188
+
189
+ train = mk_ymd(train)
190
+
191
+ test = mk_ymd(test)
192
+
193
+
194
+
195
+
196
+
197
+ # 時系列データであり、時間に沿って変数periodを設定したとする
198
+
199
+ train['period'] = np.arange(0, len(train)) // (len(train) // 4)
200
+
201
+ train['period'] = np.clip(train['period'], 0, 3)
202
+
203
+ test['period'] = 4
204
+
205
+
206
+
207
+ train['StateHoliday'] = train.StateHoliday.astype('category')
208
+
209
+
210
+
211
+ target = ['Sales']
212
+
213
+ notuse = ['Id','Date','Open']
214
+
215
+ use = ['Store','DayOfWeek','Open','Promo','StateHoliday','SchoolHoliday','year','month','day','period']
216
+
217
+
218
+
219
+ train_y = train[target]
220
+
221
+ train_x = train[use]
222
+
223
+ test_x = test[use]
224
+
225
+
226
+
227
+
228
+
229
+ import warnings
230
+
231
+ warnings.simplefilter('ignore')
232
+
233
+ train_x = eda.reduce_mem_usage(train_x)
234
+
235
+ >>>start size(BEFORE): 57.24 Mb
236
+
237
+ >>>Mem. usage decreased to 19.40 Mb (AFTER:66.1% reduction)
238
+
239
+
240
+
241
+ import gc
242
+
243
+ gc.collect()
244
+
245
+ va_period_list = [1, 2, 3]
246
+
247
+ for va_period in va_period_list:
248
+
249
+ is_tr = train_x['period'] < va_period
250
+
251
+ is_va = train_x['period'] == va_period
252
+
253
+ tr_x, va_x = train_x[is_tr], train_x[is_va]
254
+
255
+ tr_y, va_y = train_y[is_tr], train_y[is_va]
256
+
257
+ print(tr_x.shape, va_x.shape)
258
+
259
+ print(tr_y.shape, va_y.shape)
260
+
261
+
262
+
263
+ lgb_train = lgb.Dataset(tr_x, tr_y)
264
+
265
+ lgb_eval = lgb.Dataset(va_x, va_y)
266
+
267
+
268
+
269
+ # ハイパーパラメータの設定
270
+
271
+ params = {'objective': 'binary',
272
+
273
+ 'seed': 71,
274
+
275
+ 'verbose': 1,
276
+
277
+ 'metrics': 'binary_logloss',
278
+
279
+ 'force_col_wise':'true' # メモリが足りないから
280
+
281
+ }
282
+
283
+ num_round = 10
284
+
285
+
286
+
287
+ # 学習の実行
288
+
289
+ # カテゴリ変数をパラメータで指定している
290
+
291
+ # バリデーションデータもモデルに渡し、学習の進行とともにスコアがどう変わるかモニタリングする
292
+
293
+ categorical_features = ['StateHoliday']
294
+
295
+ model = lgb.train(params, lgb_train, num_boost_round=num_round,
296
+
297
+ categorical_feature=categorical_features,
298
+
299
+ valid_names=['train', 'valid'], valid_sets=[lgb_train, lgb_eval],
300
+
301
+ )
302
+
303
+
304
+
305
+ # バリデーションデータでのスコアの確認
306
+
307
+ va_pred = model.predict(va_x)
308
+
309
+ score = log_loss(va_y, va_pred)
310
+
311
+ print(f'logloss: {score:.4f}')
312
+
313
+
314
+
315
+ # 予測
316
+
317
+ pred = model.predict(test_x)
318
+
319
+
320
+
321
+
322
+
323
+
324
+
325
+
326
+
327
+
328
+
329
+ ```