回答率: 85.35%

質問する / ログイン / 新規登録

トップに関する質問 python 機械学習　データ加工

編集履歴

質問編集履歴

1

class Returnを追加しました

2021/05/16 09:29

投稿

スコア15

test CHANGED Viewed

File without changes

test CHANGED Viewed

@@ -326,4 +326,152 @@
 ```
+```python
+class Return:
+    def __init__(self, return_tables):
+        self.return_tables = return_tables
+    @classmethod
+    def read_pickle(cls, path_list):
+        df = pd.read_pickle(path_list[0])
+        for path in path_list[1:]:
-![イメージ説明](ba4a18f168fd52c7b8dff46f2e72eb4a.png)
+            df = update_data(df, pd.read_pickle(path))
+        return cls(df)
+    @staticmethod
+    def scrape(race_id_list):
+        return_tables = {}
+        for race_id in tqdm(race_id_list):
+            try:
+                url = "https://db.netkeiba.com/race/" + race_id
+                #普通にスクレイピングすると複勝やワイドなどが区切られないで繋がってしまう。
+                #そのため、改行コードを文字列brに変換して後でsplitする
+                f = urlopen(url)
+                html = f.read()
+                html = html.replace(b'<br />', b'br')
+                dfs = pd.read_html(html)
+                #dfsの1番目に単勝〜馬連、2番目にワイド〜三連単がある
+                df = pd.concat([dfs[1], dfs[2]])
+                df.index = [race_id] * len(df)
+                return_tables[race_id] = df
+                time.sleep(1)
+            except IndexError:
+                continue
+            except Exception as e:
+                print(e)
+                break
+            except:
+                break
+        #pd.DataFrame型にして一つのデータにまとめる
+        return_tables_df = pd.concat([return_tables[key] for key in return_tables])
+        return return_tables_df
+    @property
+    def fukusho(self):
+        fukusho = self.return_tables[self.return_tables[0]=='複勝'][[1,2]]
+        wins = fukusho[1].str.split('br', expand=True)[[0,1,2]]
+        wins.columns = ['win_0', 'win_1', 'win_2']
+        returns = fukusho[2].str.split('br', expand=True)[[0,1,2]]
+        returns.columns = ['return_0', 'return_1', 'return_2']
+        df = pd.concat([wins, returns], axis=1)
+        for column in df.columns:
+            df[column] = df[column].str.replace(',', '')
+        return df.fillna(0).astype(int)
+    @property
+    def tansho(self):
+        tansho = self.return_tables[self.return_tables[0]=='単勝'][[1,2]]
+        tansho.columns = ['win', 'return']
+        for column in tansho.columns:
+            tansho[column] = pd.to_numeric(tansho[column], errors='coerce')
+        return tansho
+    @property
+    def umaren(self):
+        umaren = self.return_tables[self.return_tables[0]=='馬連'][[1,2]]
+        wins = umaren[1].str.split('-', expand=True)[[0,1]].add_prefix('win_')
+        return_ = umaren[2].rename('return')
+        df = pd.concat([wins, return_], axis=1)
+        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))
+```