encodeErroerについて

python
1
2import re
3
4def split_by_punct(segment):
5    """Splits str segment by punctuation, filters our empties and spaces."""
6    return [s for s in re.split(r'\W+', segment) if s and not s.isspace()]
7
8def read_text(filename):
9    with open(filename) as f:
10        sentences = []
11        for line in f:
12            line = line.strip()
13            words = ' '.join(split_by_punct(line)).strip()
14            sentences.append(words)
15        return ' '.join(sentences).strip()
16
17
18def load_file(data_file, split_idx):
19    train = []
20    dev = []
21    with open(data_file) as f:
22        for filename in f:
23            idx = int(filename.split('/')[-1].split('_')[0])
24            words = read_text(filename.strip())
25            if idx >= split_idx:
26                dev.append(words)
27            else:
28                train.append(words)
29    return train, dev
30
31
32def load_test_dataset(data_file):
33    test = []
34    with open(data_file) as f:
35        for filename in f:
36            words = read_text(filename.strip())
37            test.append(words)
38    return test
39
40def prepare_imdb():
41    # this split is used at
42    # https://github.com/tensorflow/models/tree/master/research/adversarial_text
43    imdb_validation_pos_start_id = 10621  # total size: 12499
44    imdb_validation_neg_start_id = 10625
45
46    def fwrite_data(filename, sentences):
47        with open(filename, 'w') as f:
48            for words in sentences:
49                # line = ' '.join(words)
50                line = words
51                f.write(line.strip() + '\n')
52            f.close()
53
54    pos_train, pos_dev = load_file('imdb_train_pos_list.txt',
55                                   imdb_validation_pos_start_id)
56    neg_train, neg_dev = load_file('imdb_train_neg_list.txt',
57                                   imdb_validation_neg_start_id)
58
59    pos_test = load_test_dataset('imdb_test_pos_list.txt')
60    neg_test = load_test_dataset('imdb_test_neg_list.txt')
61
62    fwrite_data('imdb_pos_train.txt', pos_train)
63    fwrite_data('imdb_pos_dev.txt', pos_dev)
64    fwrite_data('imdb_neg_train.txt', neg_train)
65    fwrite_data('imdb_neg_dev.txt', neg_dev)
66
67    fwrite_data('imdb_pos_test.txt', pos_test)
68    fwrite_data('imdb_neg_test.txt', neg_test)
69
70    unlabled_lm_train, _ = load_file('imdb_unlabled_list.txt', 100000000)
71
72    fwrite_data('imdb_unlabled.txt', unlabled_lm_train)
73    print('Done')
74
75if __name__ == '__main__':
76    import sys
77    action = sys.argv[1]
78    if action == 'prepare_imdb':
79        prepare_imdb()

https://github.com/aonotas/adversarial_text

上記githubにあることをやろうとしています。

該当コードを実行したところ、
実行ファイル ./download.sh Prepare for IMDB Prepare script is running... Traceback (most recent call last): File "preprocess.py", line 79, in <module> prepare_imdb() File "preprocess.py", line 55, in prepare_imdb imdb_validation_pos_start_id) File "preprocess.py", line 24, in load_file words = read_text(filename.strip()) File "preprocess.py", line 11, in read_text for line in f: File "/Users/hogehoge/anaconda3/lib/python3.6/encodings/ascii.py", line 26, in decode return codecs.ascii_decode(input, self.errors)[0] UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 399: ordinal not in range(128)
がでました

環境は
OS Mac Mojave Python 3.6.5 :: Anaconda, Inc.
です

どのように修正すれば実行されるでしょうか

quickquip

2019/02/11 23:17

OS,Pythonはなにを使っているか、エラーが出たのはどこかを書く。トレースバックを省略しない。そもそもあなたかなにを実行したのかすら、他の人にはわからない。

yukit5669

2019/02/12 03:30 編集

わかりにくい記載をして申し訳ありませんでした OS Mac Mojave Python 3.6.5 :: Anaconda, Inc. ```実行ファイル ./download.sh Prepare for IMDB Prepare script is running... Traceback (most recent call last): File "preprocess.py", line 79, in <module> prepare_imdb() File "preprocess.py", line 55, in prepare_imdb imdb_validation_pos_start_id) File "preprocess.py", line 24, in load_file words = read_text(filename.strip()) File "preprocess.py", line 11, in read_text for line in f: File "/Users/hogehoge/anaconda3/lib/python3.6/encodings/ascii.py", line 26, in decode return codecs.ascii_decode(input, self.errors)[0] UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 399: ordinal not in range(128)```