from sklearn.model_selection import train_test

本をお持ちの方がいれば嬉しいのですが、『機械学習・深層学習による自然言語処理入門』の第4章78ページにあるtrain.pyを使って、日本語のレビューを抽出したいと考えています。しかし、それ以前の問題として、モジュールを実行できない状態になっています。
以下のURLが自分が使っている本です。

https://www.amazon.co.jp/%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92%E3%83%BB%E6%B7%B1%E5%B1%A4%E5%AD%A6%E7%BF%92%E3%81%AB%E3%82%88%E3%82%8B%E8%87%AA%E7%84%B6%E8%A8%80%E8%AA%9E%E5%87%A6%E7%90%86%E5%85%A5%E9%96%80-Compass-Books%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA-%E4%B8%AD%E5%B1%B1-%E5%85%89%E6%A8%B9-ebook/dp/B084WPRT44/ref=sr_1_1?__mk_ja_JP=%E3%82%AB%E3%82%BF%E3%82%AB%E3%83%8A&crid=1O57YT9TDXZJ&dchild=1&keywords=%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92+%E6%B7%B1%E5%B1%A4%E5%AD%A6%E7%BF%92%E3%81%AB%E3%82%88%E3%82%8B%E8%87%AA%E7%84%B6%E8%A8%80%E8%AA%9E%E5%87%A6%E7%90%86%E5%85%A5%E9%96%80&qid=1601255981&s=digital-text&sprefix=%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92%E3%83%BB%E6%B7%B1%E5%B1%A4%E5%AD%A6%E7%BF%92%2Cdigital-text%2C267&sr=1-1

次に、レビューを抽出するデータが以下となります。これを使って実際にデータを操作していきます。
https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz

#pythonのバージョン
Python 3.8.5 (tags/v3.8.5:580fbb0, Jul 20 2020, 15:57:54) [MSC v.1924 64 bit (AMD64)] on win32
#scikit-learnのバージョン
scikit-learn 0.23.2
#__init__.pyのコード

sklearnの__init__.pyに原因があるらしく、実際に開いてみたところ、79行目の`from . import __check_build`のところでエラーが発生しているようです。

python
1"""
2Machine learning module for Python
3==================================
4
5sklearn is a Python module integrating classical machine
6learning algorithms in the tightly-knit world of scientific Python
7packages (numpy, scipy, matplotlib).
8
9It aims to provide simple and efficient solutions to learning problems
10that are accessible to everybody and reusable in various contexts:
11machine-learning as a versatile tool for science and engineering.
12
13See http://scikit-learn.org for complete documentation.
14"""
15import sys
16import logging
17import os
18
19from ._config import get_config, set_config, config_context
20
21logger = logging.getLogger(__name__)
22
23
24# PEP0440 compatible formatted version, see:
25# https://www.python.org/dev/peps/pep-0440/
26#
27# Generic release markers:
28#   X.Y
29#   X.Y.Z   # For bugfix releases
30#
31# Admissible pre-release markers:
32#   X.YaN   # Alpha release
33#   X.YbN   # Beta release
34#   X.YrcN  # Release Candidate
35#   X.Y     # Final release
36#
37# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
38# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
39#
40__version__ = '0.23.2'
41
42
43# On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
44# simultaneously. This can happen for instance when calling BLAS inside a
45# prange. Setting the following environment variable allows multiple OpenMP
46# libraries to be loaded. It should not degrade performances since we manually
47# take care of potential over-subcription performance issues, in sections of
48# the code where nested OpenMP loops can happen, by dynamically reconfiguring
49# the inner OpenMP runtime to temporarily disable it while under the scope of
50# the outer OpenMP parallel section.
51os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True")
52
53# Workaround issue discovered in intel-openmp 2019.5:
54# https://github.com/ContinuumIO/anaconda-issues/issues/11294
55os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")
56
57try:
58    # This variable is injected in the __builtins__ by the build
59    # process. It is used to enable importing subpackages of sklearn when
60    # the binaries are not built
61    # mypy error: Cannot determine type of '__SKLEARN_SETUP__'
62    __SKLEARN_SETUP__  # type: ignore
63except NameError:
64    __SKLEARN_SETUP__ = False
65
66if __SKLEARN_SETUP__:
67    sys.stderr.write('Partial import of sklearn during the build process.\n')
68    # We are not importing the rest of scikit-learn during the build
69    # process, as it may not be compiled yet
70else:
71    # `_distributor_init` allows distributors to run custom init code.
72    # For instance, for the Windows wheel, this is used to pre-load the
73    # vcomp shared library runtime for OpenMP embedded in the sklearn/.libs
74    # sub-folder.
75    # It is necessary to do this prior to importing show_versions as the
76    # later is linked to the OpenMP runtime to make it possible to introspect
77    # it and importing it first would fail if the OpenMP dll cannot be found.
78    from . import _distributor_init  # noqa: F401
79    from . import __check_build  # noqa: F401
80    from .base import clone
81    from .utils._show_versions import show_versions
82
83    __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition',
84               'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions',
85               'experimental', 'externals', 'feature_extraction',
86               'feature_selection', 'gaussian_process', 'inspection',
87               'isotonic', 'kernel_approximation', 'kernel_ridge',
88               'linear_model', 'manifold', 'metrics', 'mixture',
89               'model_selection', 'multiclass', 'multioutput',
90               'naive_bayes', 'neighbors', 'neural_network', 'pipeline',
91               'preprocessing', 'random_projection', 'semi_supervised',
92               'svm', 'tree', 'discriminant_analysis', 'impute', 'compose',
93               # Non-modules:
94               'clone', 'get_config', 'set_config', 'config_context',
95               'show_versions']
96
97
def setup_module(module):
    """Seed the global RNGs so that test runs can be reproduced on demand.

    The seed is taken from the ``SKLEARN_SEED`` environment variable. When
    the variable is not set, a random value below the int32 maximum is drawn
    instead. The chosen seed is printed and then applied to both the stdlib
    ``random`` module and NumPy's global generator.

    ``module`` is accepted but not used by this function.
    """
    import os
    import random

    import numpy as np

    seed_from_env = os.environ.get('SKLEARN_SEED')
    if seed_from_env is not None:
        seed = int(seed_from_env)
    else:
        # No seed requested: pick one at random, but still report it below
        # so a failing run can be replayed.
        seed = int(np.random.uniform() * np.iinfo(np.int32).max)

    print("I: Seeding RNGs with %r" % seed)
    random.seed(seed)
    np.random.seed(seed)
113
114

この状態でtrain.pyを実行したところ、エラーが出ました。以下は、train.pyを実行するために必要なコードです。
#preprocessing.py

python
1"""
2Preprocessings.
3"""
4import re
5
6from bs4 import BeautifulSoup
7from janome.tokenizer import Tokenizer
8t = Tokenizer()
9
10
def clean_html(html, strip=False):
    """Return the text content of ``html`` with the markup removed.

    The input is parsed with Beautiful Soup's ``html.parser`` backend and
    ``strip`` is forwarded unchanged to ``get_text``.
    """
    return BeautifulSoup(html, 'html.parser').get_text(strip=strip)
15
16
def tokenize(text):
    """Split ``text`` into surface-form tokens with the shared Janome tokenizer.

    Returns:
        A list of token strings (``wakati=True`` yields plain strings rather
        than token objects).
    """
    # Recent Janome releases return a generator from tokenize(). Materialise
    # it so callers always receive a list, matching tokenize_base_form, and
    # can take len() or iterate more than once.
    return list(t.tokenize(text, wakati=True))
19
20
def tokenize_base_form(text):
    """Tokenize ``text`` and return the ``base_form`` of every token as a list."""
    base_forms = []
    for morpheme in t.tokenize(text):
        base_forms.append(morpheme.base_form)
    return base_forms
24
25
def normalize_number(text, reduce=False):
    """Replace digits in ``text`` with ``'0'``.

    With ``reduce=False`` each digit is replaced individually, so the length
    of a number is preserved. With ``reduce=True`` every run of consecutive
    digits collapses to a single ``'0'``.
    """
    digit_pattern = r'\d+' if reduce else r'\d'
    return re.sub(digit_pattern, '0', text)
32
33
def truncate(sequence, maxlen):
    """Return the leading part of ``sequence``, cut off at index ``maxlen``.

    This is plain slice semantics: a sequence shorter than ``maxlen`` is
    returned whole, and a negative ``maxlen`` drops items from the end.
    """
    head = slice(maxlen)
    return sequence[head]
36
37
def remove_url(html):
    """Strip hyperlink markup from ``html`` while keeping the link text.

    Every ``<a>`` element is replaced by its own children, so the anchor
    text stays in place but the tag itself (and its ``href``) is dropped.

    Returns:
        The modified document serialised back to a string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # find_all()/unwrap() are the current Beautiful Soup 4 names for the
    # legacy findAll()/replaceWithChildren() aliases used previously.
    for anchor in soup.find_all('a'):
        anchor.unwrap()
    return str(soup)
43
44

#utils.py

python
1import string
2import pandas as pd
3from sklearn.feature_extraction.text import CountVectorizer
4from sklearn.linear_model import LogisticRegression
5from sklearn.metrics import accuracy_score
6
7
def filter_by_ascii_rate(text, threshold=0.9):
    """Tell whether ``text`` is non-ASCII enough to be kept.

    The share of characters belonging to ``string.printable`` is computed
    and compared against ``threshold``.

    Args:
        text: The text to inspect.
        threshold: Maximum allowed share of printable-ASCII characters.

    Returns:
        ``True`` when the printable-ASCII share is at most ``threshold``.
        ``False`` for empty text and for non-string values (for example the
        NaN that pandas produces for a missing cell), which are therefore
        filtered out instead of raising.
    """
    # Empty or missing text has no measurable rate; previously this raised
    # ZeroDivisionError (empty string) or TypeError (NaN cell).
    if not isinstance(text, str) or not text:
        return False

    printable = set(string.printable)
    ascii_count = sum(char in printable for char in text)
    return ascii_count / len(text) <= threshold
12
13
def load_dataset(filename, n=5000, state=6):
    """Load reviews from a TSV file and sample up to ``n`` per star rating.

    Steps: read the tab-separated file, keep the rows whose ``review_body``
    passes ``filter_by_ascii_rate``, shuffle them with ``random_state=state``,
    then take the first ``n`` rows of every ``star_rating`` group.

    Returns:
        A ``(review_body, star_rating)`` pair of arrays (the ``.values`` of
        the two columns).
    """
    reviews = pd.read_csv(filename, sep='\t')

    # Keep only the Japanese texts.
    japanese_mask = reviews['review_body'].apply(filter_by_ascii_rate)

    # Shuffle first so that head() yields a random sample of each rating.
    shuffled = reviews[japanese_mask].sample(frac=1, random_state=state)
    sampled = shuffled.groupby('star_rating').head(n=n)

    return sampled['review_body'].values, sampled['star_rating'].values
26
27
def train_and_eval(x_train, y_train, x_test, y_test,
                   lowercase=False, tokenize=None, preprocessor=None):
    """Train a bag-of-words logistic regression and print its test accuracy.

    A ``CountVectorizer`` configured with ``lowercase``, ``tokenize`` and
    ``preprocessor`` is fitted on ``x_train`` and applied to ``x_test``. A
    ``LogisticRegression`` (``liblinear`` solver) is then fitted on the
    training vectors, and its accuracy on the test vectors is printed with
    four decimal places. Nothing is returned.
    """
    bow = CountVectorizer(preprocessor=preprocessor,
                          tokenizer=tokenize,
                          lowercase=lowercase)
    train_features = bow.fit_transform(x_train)
    test_features = bow.transform(x_test)

    model = LogisticRegression(solver='liblinear')
    predictions = model.fit(train_features, y_train).predict(test_features)
    print('{:.4f}'.format(accuracy_score(y_test, predictions)))
40
41

#train.py

python
1from sklearn.model_selection import train_test_split
2
3from preprocessing import clean_html, normalize_number, tokenize, tokenize_base_form
4from utils import load_dataset, train_and_eval
5
6
def main():
    """Run each preprocessing experiment on the review data and print its score.

    The dataset is loaded with up to 1000 reviews per star rating and split
    80/20 into train and test sets. Every experiment prints its title and
    then calls ``train_and_eval`` with that experiment's options.
    """
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv', n=1000)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42)

    # (title, keyword arguments for train_and_eval), run in this order.
    experiments = [
        ('Tokenization only.', {'tokenize': tokenize}),
        ('Clean html.', {'tokenize': tokenize, 'preprocessor': clean_html}),
        ('Normalize number.', {'tokenize': tokenize,
                               'preprocessor': normalize_number}),
        ('Base form.', {'tokenize': tokenize_base_form}),
        ('Lower text.', {'tokenize': tokenize, 'lowercase': True}),
    ]
    for title, options in experiments:
        print(title)
        train_and_eval(x_train, y_train, x_test, y_test, **options)
28
29
30if __name__ == '__main__':
31    main()
32
33