質問編集履歴
1
class Returnを追加しました
test
CHANGED
File without changes
|
test
CHANGED
@@ -326,4 +326,152 @@
|
|
326
326
|
|
327
327
|
```
|
328
328
|
|
329
|
+
```python
|
330
|
+
|
331
|
+
class Return:
    """Race payout ("return") tables with one accessor per bet type.

    ``return_tables`` is stored exactly as given. The properties index it as
    a pandas DataFrame in which column ``0`` holds the bet-type label
    (e.g. ``'単勝'``), column ``1`` the winning horse number(s) and column
    ``2`` the payout(s). Cells that carry several values use the literal
    text ``br`` as the separator (see :meth:`scrape`).
    """

    def __init__(self, return_tables):
        """Store *return_tables* without copying or validating it."""
        self.return_tables = return_tables

    @classmethod
    def read_pickle(cls, path_list):
        """Build a ``Return`` from one or more pickled tables.

        The first path is loaded as the base table; every following table is
        folded in with ``update_data(current, new)``.

        Raises:
            IndexError: if *path_list* is empty.
        """
        # NOTE(review): unpickling can execute arbitrary code -- only load
        # files from a trusted source.
        # NOTE(review): update_data is defined outside this block; presumably
        # it merges the newer table into the older one -- confirm.
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)

    @staticmethod
    def scrape(race_id_list):
        """Download the payout tables for every race id in *race_id_list*.

        Each page is fetched from ``https://db.netkeiba.com/race/<race_id>``,
        its second and third HTML tables are stacked, and every row is
        indexed by the race id.

        Pages with fewer than three tables are skipped. Any other error is
        printed and stops the loop; a keyboard interrupt stops it silently.
        In both cases the tables collected so far are still returned.

        Returns:
            A single DataFrame with all collected rows, or an empty DataFrame
            when nothing was collected.
        """
        return_tables = {}
        for race_id in tqdm(race_id_list):
            try:
                url = "https://db.netkeiba.com/race/" + race_id

                # Scraped as-is, cells with several values (place, wide, ...)
                # run together without a separator. Turn the line-break tag
                # into the text "br" so the cells can be split on it later.
                with urlopen(url) as response:
                    html = response.read()
                html = html.replace(b'<br />', b'br')
                dfs = pd.read_html(html)

                # dfs[1] holds win through quinella, dfs[2] holds wide
                # through trifecta.
                df = pd.concat([dfs[1], dfs[2]])

                df.index = [race_id] * len(df)
                return_tables[race_id] = df
                time.sleep(1)
            except IndexError:
                # The page was fetched but lacks the payout tables; keep the
                # same delay before the next request.
                time.sleep(1)
                continue
            except Exception as e:
                print(e)
                break
            except KeyboardInterrupt:
                # Manual stop: keep what has been scraped so far.
                break

        # pd.concat raises on an empty list, so handle "nothing scraped".
        if not return_tables:
            return pd.DataFrame()

        # Combine the per-race tables into one DataFrame.
        return pd.concat(list(return_tables.values()))

    @staticmethod
    def _split_first_three(cells, names):
        """Split 'br'-joined cells into exactly three columns named *names*.

        Commas are removed first. Columns that no row fills are created
        empty, and parts beyond the third are dropped.
        """
        parts = (
            cells.str.replace(',', '', regex=False)
            .str.split('br', expand=True)
            .reindex(columns=[0, 1, 2])
        )
        parts.columns = names
        return parts

    @property
    def fukusho(self):
        """Place ('複勝') results as integers.

        Returns:
            DataFrame with columns ``win_0..win_2`` (horse numbers) and
            ``return_0..return_2`` (payouts). Missing entries are ``0``.
        """
        rows = self.return_tables[self.return_tables[0] == '複勝'][[1, 2]]
        wins = self._split_first_three(
            rows[1], ['win_0', 'win_1', 'win_2'])
        returns = self._split_first_three(
            rows[2], ['return_0', 'return_1', 'return_2'])

        df = pd.concat([wins, returns], axis=1)
        return df.fillna(0).astype(int)

    @property
    def tansho(self):
        """Win ('単勝') results.

        Returns:
            DataFrame with numeric columns ``win`` and ``return``; values
            that cannot be parsed as numbers become NaN.
        """
        rows = self.return_tables[self.return_tables[0] == '単勝'][[1, 2]]
        rows.columns = ['win', 'return']
        # apply() builds a new frame, so the filtered selection is never
        # written to in place.
        return rows.apply(pd.to_numeric, errors='coerce')

    @property
    def umaren(self):
        """Quinella ('馬連') results.

        Returns:
            DataFrame with numeric columns ``win_0``, ``win_1`` and
            ``return``; values that cannot be parsed become NaN.
        """
        rows = self.return_tables[self.return_tables[0] == '馬連'][[1, 2]]
        # reindex (rather than [[0, 1]]) keeps both columns present even
        # when the split yields fewer, e.g. when no row matches.
        wins = (
            rows[1].str.split('-', expand=True)
            .reindex(columns=[0, 1])
            .add_prefix('win_')
        )
        return_ = rows[2].rename('return')
        df = pd.concat([wins, return_], axis=1)
        return df.apply(pd.to_numeric, errors='coerce')
|
476
|
+
|
477
|
+
```
|