python
1 2import re 3 4def split_by_punct(segment): 5 """Splits str segment by punctuation, filters our empties and spaces.""" 6 return [s for s in re.split(r'\W+', segment) if s and not s.isspace()] 7 8def read_text(filename): 9 with open(filename) as f: 10 sentences = [] 11 for line in f: 12 line = line.strip() 13 words = ' '.join(split_by_punct(line)).strip() 14 sentences.append(words) 15 return ' '.join(sentences).strip() 16 17 18def load_file(data_file, split_idx): 19 train = [] 20 dev = [] 21 with open(data_file) as f: 22 for filename in f: 23 idx = int(filename.split('/')[-1].split('_')[0]) 24 words = read_text(filename.strip()) 25 if idx >= split_idx: 26 dev.append(words) 27 else: 28 train.append(words) 29 return train, dev 30 31 32def load_test_dataset(data_file): 33 test = [] 34 with open(data_file) as f: 35 for filename in f: 36 words = read_text(filename.strip()) 37 test.append(words) 38 return test 39 40def prepare_imdb(): 41 # this split is used at 42 # https://github.com/tensorflow/models/tree/master/research/adversarial_text 43 imdb_validation_pos_start_id = 10621 # total size: 12499 44 imdb_validation_neg_start_id = 10625 45 46 def fwrite_data(filename, sentences): 47 with open(filename, 'w') as f: 48 for words in sentences: 49 # line = ' '.join(words) 50 line = words 51 f.write(line.strip() + '\n') 52 f.close() 53 54 pos_train, pos_dev = load_file('imdb_train_pos_list.txt', 55 imdb_validation_pos_start_id) 56 neg_train, neg_dev = load_file('imdb_train_neg_list.txt', 57 imdb_validation_neg_start_id) 58 59 pos_test = load_test_dataset('imdb_test_pos_list.txt') 60 neg_test = load_test_dataset('imdb_test_neg_list.txt') 61 62 fwrite_data('imdb_pos_train.txt', pos_train) 63 fwrite_data('imdb_pos_dev.txt', pos_dev) 64 fwrite_data('imdb_neg_train.txt', neg_train) 65 fwrite_data('imdb_neg_dev.txt', neg_dev) 66 67 fwrite_data('imdb_pos_test.txt', pos_test) 68 fwrite_data('imdb_neg_test.txt', neg_test) 69 70 unlabled_lm_train, _ = load_file('imdb_unlabled_list.txt', 100000000) 71 72 fwrite_data('imdb_unlabled.txt', unlabled_lm_train) 73 print('Done') 74 75if __name__ == '__main__': 76 import sys 77 action = sys.argv[1] 78 if action == 'prepare_imdb': 79 prepare_imdb()
https://github.com/aonotas/adversarial_text
上記githubにあることをやろうとしています。
該当コードを実行したところ、
実行ファイル ./download.sh Prepare for IMDB Prepare script is running... Traceback (most recent call last): File "preprocess.py", line 79, in <module> prepare_imdb() File "preprocess.py", line 55, in prepare_imdb imdb_validation_pos_start_id) File "preprocess.py", line 24, in load_file words = read_text(filename.strip()) File "preprocess.py", line 11, in read_text for line in f: File "/Users/hogehoge/anaconda3/lib/python3.6/encodings/ascii.py", line 26, in decode return codecs.ascii_decode(input, self.errors)[0] UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 399: ordinal not in range(128)
がでました
環境は
OS Mac Mojave Python 3.6.5 :: Anaconda, Inc.
です
どのように修正すれば実行されるでしょうか
回答1件
あなたの回答
tips
プレビュー