本をお持ちの方がいれば嬉しいのですが、『機械学習・深層学習による自然言語処理入門』の第4章78ページにあるtrain.pyを用いて日本語を抽出したいと考えています。しかし、それ以前の問題として、モジュールが実行できない状態になっています。
以下のURLが自分が使っている本です。
次に、レビューを抽出するデータが以下となります。これを使って実際にデータを操作していきます。
https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz
#pythonのバージョン
Python 3.8.5 (tags/v3.8.5:580fbb0, Jul 20 2020, 15:57:54) [MSC v.1924 64 bit (AMD64)] on win32
#scikit-learnのバージョン
scikit-learn 0.23.2
#__init__.pyのコード
__init__.pyに原因があるらしく、実際に開いてみたところ、79行目の `from . import __check_build` のところでエラーが発生しているようです。
python
"""
Machine learning module for Python
==================================

sklearn is a Python module integrating classical machine
learning algorithms in the tightly-knit world of scientific Python
packages (numpy, scipy, matplotlib).

It aims to provide simple and efficient solutions to learning problems
that are accessible to everybody and reusable in various contexts:
machine-learning as a versatile tool for science and engineering.

See http://scikit-learn.org for complete documentation.
"""
import sys
import logging
import os

from ._config import get_config, set_config, config_context

logger = logging.getLogger(__name__)


# PEP0440 compatible formatted version, see:
# https://www.python.org/dev/peps/pep-0440/
#
# Generic release markers:
#   X.Y
#   X.Y.Z   # For bugfix releases
#
# Admissible pre-release markers:
#   X.YaN   # Alpha release
#   X.YbN   # Beta release
#   X.YrcN  # Release Candidate
#   X.Y     # Final release
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
#
__version__ = '0.23.2'


# On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
# simultaneously. This can happen for instance when calling BLAS inside a
# prange. Setting the following environment variable allows multiple OpenMP
# libraries to be loaded. It should not degrade performances since we manually
# take care of potential over-subscription performance issues, in sections of
# the code where nested OpenMP loops can happen, by dynamically reconfiguring
# the inner OpenMP runtime to temporarily disable it while under the scope of
# the outer OpenMP parallel section.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True")

# Workaround issue discovered in intel-openmp 2019.5:
# https://github.com/ContinuumIO/anaconda-issues/issues/11294
os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")

try:
    # This variable is injected in the __builtins__ by the build
    # process. It is used to enable importing subpackages of sklearn when
    # the binaries are not built
    # mypy error: Cannot determine type of '__SKLEARN_SETUP__'
    __SKLEARN_SETUP__  # type: ignore
except NameError:
    # Not running under the build process: fall back to a normal import.
    __SKLEARN_SETUP__ = False

if __SKLEARN_SETUP__:
    sys.stderr.write('Partial import of sklearn during the build process.\n')
    # We are not importing the rest of scikit-learn during the build
    # process, as it may not be compiled yet
else:
    # `_distributor_init` allows distributors to run custom init code.
    # For instance, for the Windows wheel, this is used to pre-load the
    # vcomp shared library runtime for OpenMP embedded in the sklearn/.libs
    # sub-folder.
    # It is necessary to do this prior to importing show_versions as the
    # later is linked to the OpenMP runtime to make it possible to introspect
    # it and importing it first would fail if the OpenMP dll cannot be found.
    from . import _distributor_init  # noqa: F401
    # NOTE(review): imported only for its import-time effect; presumably it
    # verifies that the compiled extensions are importable -- confirm in
    # sklearn/__check_build.
    from . import __check_build  # noqa: F401
    from .base import clone
    from .utils._show_versions import show_versions

    __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition',
               'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions',
               'experimental', 'externals', 'feature_extraction',
               'feature_selection', 'gaussian_process', 'inspection',
               'isotonic', 'kernel_approximation', 'kernel_ridge',
               'linear_model', 'manifold', 'metrics', 'mixture',
               'model_selection', 'multiclass', 'multioutput',
               'naive_bayes', 'neighbors', 'neural_network', 'pipeline',
               'preprocessing', 'random_projection', 'semi_supervised',
               'svm', 'tree', 'discriminant_analysis', 'impute', 'compose',
               # Non-modules:
               'clone', 'get_config', 'set_config', 'config_context',
               'show_versions']


def setup_module(module):
    """Fixture for the tests to assure globally controllable seeding of RNGs

    The seed is read from the ``SKLEARN_SEED`` environment variable; when the
    variable is unset a random seed is drawn instead. The chosen seed is
    printed and then applied to both ``numpy.random`` and ``random``.

    ``module`` is not used by the body of this function.
    """
    import os
    import numpy as np
    import random

    # Check if a random seed exists in the environment, if not create one.
    _random_seed = os.environ.get('SKLEARN_SEED', None)
    if _random_seed is None:
        # Scale a uniform [0, 1) draw up to the int32 range.
        _random_seed = np.random.uniform() * np.iinfo(np.int32).max
    # The environment value is a string and the drawn value is a float;
    # both are normalised to int here.
    _random_seed = int(_random_seed)
    print("I: Seeding RNGs with %r" % _random_seed)
    np.random.seed(_random_seed)
    random.seed(_random_seed)
この状態でtrain.pyを実行したところ、エラーらしきものが出ました。以下は、train.pyを実行するために必要なコードです。
#preprocessing.py
python
"""
Preprocessings.

Small text-cleaning and tokenization helpers built on BeautifulSoup and
Janome.
"""
import re

from bs4 import BeautifulSoup
from janome.tokenizer import Tokenizer

# One tokenizer instance shared by every helper below, created at import.
t = Tokenizer()


def clean_html(html, strip=False):
    """Return the text of ``html`` with the markup removed.

    Args:
        html: HTML (or plain text) string to parse with ``html.parser``.
        strip: Forwarded to ``BeautifulSoup.get_text(strip=...)``.

    Returns:
        The extracted text as a string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    return soup.get_text(strip=strip)


def tokenize(text):
    """Split ``text`` into tokens using Janome's wakati mode.

    Args:
        text: String to tokenize.

    Returns:
        A list of token strings.
    """
    # Newer Janome releases return a single-use generator from tokenize();
    # materialise it so callers always get a reusable, indexable list.
    return list(t.tokenize(text, wakati=True))


def tokenize_base_form(text):
    """Tokenize ``text`` and return the base form of every token.

    Args:
        text: String to tokenize.

    Returns:
        A list with the ``base_form`` attribute of each Janome token.
    """
    return [token.base_form for token in t.tokenize(text)]


def normalize_number(text, reduce=False):
    """Replace digits in ``text`` with ``'0'``.

    Args:
        text: String to normalize.
        reduce: When true, each run of consecutive digits collapses to a
            single ``'0'``; otherwise every digit is replaced one-for-one.

    Returns:
        The normalized string.
    """
    pattern = r'\d+' if reduce else r'\d'
    return re.sub(pattern, '0', text)


def truncate(sequence, maxlen):
    """Return the first ``maxlen`` items of ``sequence`` (a slice)."""
    return sequence[:maxlen]


def remove_url(html):
    """Strip ``<a>`` tags from ``html`` while keeping their contents.

    Args:
        html: HTML string to parse with ``html.parser``.

    Returns:
        The document serialized back to a string, with every ``<a>`` element
        replaced by its children.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # find_all/unwrap are the current bs4 names for the deprecated
    # findAll/replaceWithChildren aliases used previously.
    for anchor in soup.find_all('a'):
        anchor.unwrap()
    return str(soup)
#utils.py
python
import string
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Built once instead of on every filter_by_ascii_rate() call.
_PRINTABLE_ASCII = frozenset(string.printable)


def filter_by_ascii_rate(text, threshold=0.9):
    """Tell whether ``text`` has a low enough share of printable ASCII.

    Args:
        text: Value to inspect; expected to be a string.
        threshold: Maximum allowed ratio of ``string.printable`` characters.

    Returns:
        ``True`` when the ratio of printable-ASCII characters in ``text`` is
        at most ``threshold``. ``False`` for an empty string or a non-string
        value (for example the NaN pandas produces for a missing cell),
        which previously raised ``ZeroDivisionError`` / ``TypeError``.
    """
    if not isinstance(text, str) or not text:
        return False
    ascii_count = sum(c in _PRINTABLE_ASCII for c in text)
    return ascii_count / len(text) <= threshold


def load_dataset(filename, n=5000, state=6):
    """Load reviews from a tab-separated file and sample them per rating.

    Args:
        filename: Path to a TSV file with ``review_body`` and ``star_rating``
            columns.
        n: Maximum number of rows kept for each ``star_rating`` value.
        state: Random state used when shuffling the rows.

    Returns:
        A tuple ``(review_body values, star_rating values)`` as arrays.
    """
    df = pd.read_csv(filename, sep='\t')

    # extracts Japanese texts.
    is_jp = df.review_body.apply(filter_by_ascii_rate)
    df = df[is_jp]

    # sampling: shuffle first so head(n) picks a random subset per rating.
    df = df.sample(frac=1, random_state=state)
    df = df.groupby('star_rating').head(n=n)
    return df.review_body.values, df.star_rating.values


def train_and_eval(x_train, y_train, x_test, y_test,
                   lowercase=False, tokenize=None, preprocessor=None):
    """Fit a bag-of-words logistic regression and print its test accuracy.

    Args:
        x_train: Training texts.
        y_train: Training labels.
        x_test: Test texts.
        y_test: Test labels.
        lowercase: Forwarded to ``CountVectorizer(lowercase=...)``.
        tokenize: Forwarded to ``CountVectorizer(tokenizer=...)``.
        preprocessor: Forwarded to ``CountVectorizer(preprocessor=...)``.

    Returns:
        The accuracy score, which is also printed with four decimals.
    """
    vectorizer = CountVectorizer(lowercase=lowercase,
                                 tokenizer=tokenize,
                                 preprocessor=preprocessor)
    # The vocabulary is learned from the training texts only.
    x_train_vec = vectorizer.fit_transform(x_train)
    x_test_vec = vectorizer.transform(x_test)
    clf = LogisticRegression(solver='liblinear')
    clf.fit(x_train_vec, y_train)
    y_pred = clf.predict(x_test_vec)
    score = accuracy_score(y_test, y_pred)
    print('{:.4f}'.format(score))
    return score
#train.py
python
from sklearn.model_selection import train_test_split

from preprocessing import clean_html, normalize_number, tokenize, tokenize_base_form
from utils import load_dataset, train_and_eval


def main():
    """Run the preprocessing experiments on the Amazon review dataset.

    Loads the dataset with ``n=1000``, holds out 20% of it as a test set
    (``random_state=42``), then calls ``train_and_eval`` once per
    preprocessing variant, printing the variant's title before each call.
    """
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv', n=1000)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42)

    # (title, keyword arguments for train_and_eval), in execution order.
    experiments = [
        ('Tokenization only.', {'tokenize': tokenize}),
        ('Clean html.', {'tokenize': tokenize, 'preprocessor': clean_html}),
        ('Normalize number.', {'tokenize': tokenize,
                               'preprocessor': normalize_number}),
        ('Base form.', {'tokenize': tokenize_base_form}),
        ('Lower text.', {'tokenize': tokenize, 'lowercase': True}),
    ]
    for title, options in experiments:
        print(title)
        train_and_eval(x_train, y_train, x_test, y_test, **options)


if __name__ == '__main__':
    main()
#エラー?らしきもの
python train.pyを実行してみたところ、このようなエラーが出てしまいました。
回答1件
あなたの回答
tips
プレビュー
バッドをするには、ログインかつ
こちらの条件を満たす必要があります。