python 機械学習　データ加工

いつもお世話になっています。地方競馬バージョンで作成しているのですが、以下のようなエラーが出てしまい対処することができていません。
・試したこと
return_tablesにおいて中止されたっであろうレースの払い戻しが　特　と表記されていたので、
return_tables.dropna(how = "any")を行ったが改善されなかった。
ご教授よろしくお願いいいたします。

python
1#ModelEvaluatorクラスのオブジェクトを作成
2me = ModelEvaluator(lgb_clf, 'return_tables.pickle')

python
1---------------------------------------------------------------------------
2ValueError                                Traceback (most recent call last)
3<ipython-input-29-94c866045c8a> in <module>
4      1 #ModelEvaluatorクラスのオブジェクトを作成
5----> 2 me = ModelEvaluator(lgb_clf, 'return_tables.pickle')
6
7<ipython-input-27-b5736235fdb3> in __init__(self, model, return_tables_path, std)
8      3         self.model = model
9      4         self.rt = Return.read_pickle([return_tables_path])
10----> 5         self.fukusho = self.rt.fukusho
11      6         self.tansho = self.rt.tansho
12      7         self.umaren = self.rt.umaren
13
14<ipython-input-23-4e09eba6addc> in fukusho(self)
15     65         for column in df.columns:
16     66             df[column] = df[column].str.replace(',', '')
17---> 67         return df.fillna(0).astype(int)
18     68 
19     69     @property
20
21/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
22   5544         else:
23   5545             # else, only a single dtype is given
24-> 5546             new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
25   5547             return self._constructor(new_data).__finalize__(self, method="astype")
26   5548 
27
28/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
29    593         self, dtype, copy: bool = False, errors: str = "raise"
30    594     ) -> "BlockManager":
31--> 595         return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
32    596 
33    597     def convert(
34
35/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, **kwargs)
36    404                 applied = b.apply(f, **kwargs)
37    405             else:
38--> 406                 applied = getattr(b, f)(**kwargs)
39    407             result_blocks = _extend_blocks(applied, result_blocks)
40    408 
41
42/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
43    593             vals1d = values.ravel()
44    594             try:
45--> 595                 values = astype_nansafe(vals1d, dtype, copy=True)
46    596             except (ValueError, TypeError):
47    597                 # e.g. astype_nansafe can fail on object-dtype of strings
48
49/opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
50    970         # work around NumPy brokenness, #1987
51    971         if np.issubdtype(dtype.type, np.integer):
52--> 972             return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
53    973 
54    974         # if we have a datetime/timedelta array of objects
55
56pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe()
57
58ValueError: invalid literal for int() with base 10: '特'

python
1class ModelEvaluator:
2    def __init__(self, model, return_tables_path, std=True):
3        self.model = model
4        self.rt = Return.read_pickle([return_tables_path])
5        self.fukusho = self.rt.fukusho
6        self.tansho = self.rt.tansho
7        self.umaren = self.rt.umaren
8        self.std = std
9    
10    #3着以内に入る確率を予測。出馬表のデータを入れるときはdrop_tansho=Falseにする。
11    def predict_proba(self, X, drop_tansho=True):
12        if drop_tansho:
13            proba = pd.Series(self.model.predict_proba(X.drop(['単勝'], axis=1))[:, 1], index=X.index)
14        else:
15            proba = pd.Series(self.model.predict_proba(X)[:, 1], index=X.index)
16        if self.std:
17            standard_scaler = lambda x: (x - x.mean()) / x.std()
18            proba = proba.groupby(level=0).transform(standard_scaler)
19            proba = (proba - proba.min()) / (proba.max() - proba.min())
20        return proba
21    
22    #0か1かを予測
23    def predict(self, X, threshold=0.5):
24        y_pred = self.predict_proba(X)
25        return [0 if p<threshold else 1 for p in y_pred]
26    
27    def score(self, y_true, X):
28        return roc_auc_score(y_true, self.predict_proba(X))
29    
30    def feature_importance(self, X, n_display=20):
31        importances = pd.DataFrame({"features": X.columns, 
32                                    "importance": self.model.feature_importances_})
33        return importances.sort_values("importance", ascending=False)[:n_display]
34    
35    def pred_table(self, X, threshold=0.5, bet_only=True):
36        pred_table = X.copy()[['馬番', '単勝']]
37        pred_table['pred'] = self.predict(X, threshold)
38        if bet_only:
39            return pred_table[pred_table['pred']==1][['馬番', '単勝']]
40        else:
41            return pred_table
42        
43    def fukusho_return(self, X, threshold=0.5):
44        pred_table = self.pred_table(X, threshold)
45        n_bets = len(pred_table)
46        money = -100 * n_bets
47        df = self.fukusho.copy()
48        df = df.merge(pred_table, left_index=True, right_index=True, how='right')
49        for i in range(3):
50            money += df[df['win_{}'.format(i)]==df['馬番']]['return_{}'.format(i)].sum()
51        return_rate = (n_bets*100 + money) / (n_bets*100)
52        return n_bets, return_rate
53    
54    def tansho_return(self, X, threshold=0.5):
55        pred_table = self.pred_table(X, threshold)
56        n_bets = len(pred_table)
57        money = -100 * n_bets
58        df = self.tansho.copy()
59        df = df.merge(pred_table, left_index=True, right_index=True, how='right')
60        n_hits = len(df[df['win']==df['馬番']])
61        money += df[df['win']==df['馬番']]['return'].sum()
62        return_rate = (n_bets*100 + money) / (n_bets*100)
63        return n_bets, return_rate, n_hits
64    
65    def tansho_return_proper(self, X, threshold=0.5):
66        #モデルによって「賭ける」と判断された馬たち
67        pred_table = self.pred_table(X, threshold)
68        n_bets = len(pred_table)
69        
70        #払い戻し表にpred_tableをマージ
71        df = self.tansho.copy()
72        df = df.merge(pred_table, left_index=True, right_index=True, how='right')
73        
74        #単勝適正回収値を計算
75        n_hits = len(df.query('win == 馬番'))
76        return_rate = n_hits / (1 / pred_table['単勝']).sum()
77        
78        return n_bets, return_rate, n_hits
79    
80    def umaren_return(self, X, threshold=0.5):
81        pred_table = self.pred_table(X, threshold)
82        hit = {}
83        n_bets = 0
84        for race_id, preds in pred_table.groupby(level=0):
85            n_bets += comb(len(preds), 2)
86            hit[race_id] = set(self.umaren.loc[race_id][['win_0', 'win_1']])\
87                .issubset(set(preds))
88        return_rate = (self.umaren.index.map(hit).values * self.umaren['return']).sum() \
89            / (n_bets * 100)
90        return n_bets, return_rate

python
1class Return:
2    def __init__(self, return_tables):
3        self.return_tables = return_tables
4    
5    @classmethod
6    def read_pickle(cls, path_list):
7        df = pd.read_pickle(path_list[0])
8        for path in path_list[1:]:
9            df = update_data(df, pd.read_pickle(path))
10        return cls(df)
11    
12    @staticmethod
13    def scrape(race_id_list):
14        return_tables = {}
15        for race_id in tqdm(race_id_list):
16            try:
17                url = "https://db.netkeiba.com/race/" + race_id
18
19                #普通にスクレイピングすると複勝やワイドなどが区切られないで繋がってしまう。
20                #そのため、改行コードを文字列brに変換して後でsplitする
21                f = urlopen(url)
22                html = f.read()
23                html = html.replace(b'<br />', b'br')
24                dfs = pd.read_html(html)
25
26                #dfsの1番目に単勝〜馬連、2番目にワイド〜三連単がある
27                df = pd.concat([dfs[1], dfs[2]])
28
29                df.index = [race_id] * len(df)
30                return_tables[race_id] = df
31                time.sleep(1)
32            except IndexError:
33                continue
34            except Exception as e:
35                print(e)
36                break
37            except:
38                break
39
40        #pd.DataFrame型にして一つのデータにまとめる
41        return_tables_df = pd.concat([return_tables[key] for key in return_tables])
42        return return_tables_df
43    
44    @property
45    def fukusho(self):
46        fukusho = self.return_tables[self.return_tables[0]=='複勝'][[1,2]]
47        wins = fukusho[1].str.split('br', expand=True)[[0,1,2]]
48        wins.columns = ['win_0', 'win_1', 'win_2']
49        returns = fukusho[2].str.split('br', expand=True)[[0,1,2]]
50        returns.columns = ['return_0', 'return_1', 'return_2']
51        
52        df = pd.concat([wins, returns], axis=1)
53        for column in df.columns:
54            df[column] = df[column].str.replace(',', '')
55        return df.fillna(0).astype(int)
56    
57    @property
58    def tansho(self):
59        tansho = self.return_tables[self.return_tables[0]=='単勝'][[1,2]]
60        tansho.columns = ['win', 'return']
61        
62        for column in tansho.columns:
63            tansho[column] = pd.to_numeric(tansho[column], errors='coerce')
64            
65        return tansho
66    
67    @property
68    def umaren(self):
69        umaren = self.return_tables[self.return_tables[0]=='馬連'][[1,2]]
70        wins = umaren[1].str.split('-', expand=True)[[0,1]].add_prefix('win_')
71        return_ = umaren[2].rename('return')  
72        df = pd.concat([wins, return_], axis=1)        
73        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))

退会済みユーザー

2021/05/16 09:03

> 地方競馬バージョンで作成しているのですが何をつくろうとしているのでしょうか？○○を使って○○なソフトを地方競馬バージョンで作ろうとしている、という本当に伝えたいことが隠れてしまっています。エラー自体は恐らくint(数字)にしたいの特(文字列)が来てしまってどうしたものか、とPythonが困っていることが原因だと思います。可能な範囲で関係しそうなコードとサンプルコードを開示ください。