前提・実現したいこと
ある参考書に載っていた
natto-py(MeCab)を用いたword2vecを実行しようとしましたが、このエラーだけはよくわからなかったので質問させていただきます。
UnicodeDecodeErrorおよびMeCabErrorが発生する原因がよくわかりません。
発生している問題・エラーメッセージ
UnicodeDecodeError Traceback (most recent call last) ~\Anaconda3\Lib\site-packages\natto\mecab.py in __parse_tonodes(self, text, **kwargs) 396 nptr.surface[0:nptr.length]) --> 397 surf = self.__bytes2str(raws).strip() 398 ~\Anaconda3\Lib\site-packages\natto\support.py in bytes2str(b) 25 '''Transforms bytes into string (Unicode).''' ---> 26 return b.decode(py3enc) 27 def str2bytes(u): UnicodeDecodeError: 'shift_jis' codec can't decode byte 0x81 in position 0: incomplete multibyte sequence During handling of the above exception, another exception occurred: MeCabError Traceback (most recent call last) <ipython-input-14-eb2fa4a116ee> in <module>() 64 65 if __name__ == '__main__': ---> 66 tf.app.run() ~\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\platform\app.py in run(main, argv) 124 # Call the main function, passing through any arguments 125 # to the final program. --> 126 _sys.exit(main(argv)) 127 128 <ipython-input-14-eb2fa4a116ee> in main(argv) 13 14 def main(argv): ---> 15 data=DataSet(FLAGS.data_dir,FLAGS.max_vocab) 16 17 ~\Desktop\TensorFlowDL-samples-master\word2vec\data_set.py in __init__(self, data_dir, max_vocab) 23 documents = [self._preprocessing(document) for document in row_documents] 24 #形態素解析 ---> 25 splited_documents = [self._morphological(document) for document in documents] 26 27 words = [] ~\Desktop\TensorFlowDL-samples-master\word2vec\data_set.py in <listcomp>(.0) 23 documents = [self._preprocessing(document) for document in row_documents] 24 #形態素解析 ---> 25 splited_documents = [self._morphological(document) for document in documents] 26 27 words = [] ~\Desktop\TensorFlowDL-samples-master\word2vec\data_set.py in _morphological(self, document) 78 #MeCabの形態素解析結果のフォーマット 79 with MeCab('-F%f[0],%f[1],%f[6]') as mcb: ---> 80 for token in mcb.parse(document, as_nodes=True): 81 features = token.feature.split(',') 82 #名詞(一般)動詞(自立)、形容詞(自立)以外は除外 ~\Anaconda3\Lib\site-packages\natto\mecab.py in __parse_tonodes(self, text, **kwargs) 425 err = 
self.__mecab.mecab_lattice_strerror(self.lattice) 426 logger.error(self.__bytes2str(self.__ffi.string(err))) --> 427 raise MeCabError(self.__bytes2str(self.__ffi.string(err))) 428 429 def __repr__(self): def _read_docment(self,file_path): with open(file_path,'r',encoding='sjis')as f : def _morphological(self,document): word_list=[] with MeCab('-F%f[0],%f[1],%f[6]')as mcb: for token in mcb.parse(document,as_nodes=True): features=token.feature.splits(',') if features[0]=='名詞'and features[1]=='一般'and features[2] !='': word_list.append(features[2]) if features[0]=='動詞'and features[1]=='自立'and features[2] !='': word_list.append(features[2]) if features[0]=='形容詞'and features[1]=='自立'and features[2] !='': word_list.append(features[2]) return word_list class DataSet(object): def __init__(self,data_dir,max_vocab): file_pathes=[] for file_path in glob.glob(data_dir+'*'): file_pathes.append(file_path) row_documents=[self._read_document(file_path)for file_path in file_pathes] documents=[self._preprocessing(document)for document in row_documents] splited_documents=[self._morphological(document)for document in documents] word=[] for word_list in splited_documents: words.extend(word_list) self.id_sequence,self.word_frequency,self.w_to_id,self.id_to_w=self._build_data_sets(words,max_vocab) print('Most common words (+UNK)',self.word_frequency[:5]) print ('Sample data.') print (self.id_sequence[:10]) print([self.id_to_w[i]for i in self.id_sequence[:10]]) self.data_index=0 from data_set import * def create_next_batch(self,batch_size,num_skips,skip_window): assert batch_size% num_skips==0 assert num_skips<=2*skip_window inputs=np.ndarray(shape=(batch_size),dtype=np.int32) labels=np.ndarray(shape=(batch_size,1),dtype=np.int32) span=2*skip_window+1 buffer=collections.deque(maxlen=span) if self.data_index+span>len(self.id_sequence): self.data_index=0 buffer.extend(self.id_sequence[self.data_index:self.data_index+span]) self.data_index +=span for i in range(batch_size//num_skips): 
target=skip_window targets_to_avoid=[skip_window] for j in range(num_skips): while target in targets_to_avoid: target=random.randint(0,span-1) targets_to_avoid.append(target) inputs[i*num_skips+j]=buffer[skip_window] inputs[i*num_skips+j,0]=buffer[target] if self.data_index==len(self.id_sequence): buffer=self.id_sequence[:span] self.data_index=span else: buffer.append(self.id_sequence[self.data_index]) self.data_index+=1 self.data_index=(self.data_index+len(self.id_sequence)-span)%len(self.id_sequence) return inputs,labels FLAGS=tf.app.flags.FLAGS tf.app.flags.DEFINE_string('data_dir','data/',"Data set directory")#インプット tf.app.flags.DEFINE_string('log_dir','log/',"Log directory") tf.app.flags.DEFINE_integer('max_vocab',2000,"Max Vocablary size.") tf.app.flags.DEFINE_integer('skip_window',2,"How many words to consider left and right") tf.app.flags.DEFINE_integer('num_skips',4,"How many times to reuse an input to generate a label") tf.app.flags.DEFINE_integer('embedding_size',64,"Dimention of the embedding vector") tf.app.flags.DEFINE_integer('num_sumpled',64,"Number of negative examples to sample") tf.app.flags.DEFINE_integer('num_step',10000,"Train steps") tf.app.flags.DEFINE_integer('batch_size',64,"Batch size") tf.app.flags.DEFINE_float('learning_rate',0.1,"learning rate") tf.app.flags.DEFINE_bool('create_tsv',True,"Create words.tsv or not") def main(argv): data=DataSet(FLAGS.data_dir,FLAGS.max_vocab) batch_size=FLAGS.batch_size embedding_size=FLAGS.embedding_size vocab_size=len(data.w_to_id) inputs=tf.placeholder(tf.int32,shape=[batch_size]) correct=tf.placeholder(tf.int32,shape=[batch_size,1]) word_embedding=tf.Variable(tf.random_uniform([vocab_size,embedding_size],-1.0,1.0),name='word_embedding') embed=tf.nn.embedding_lookup(word_embedding,inputs) w_out=tf.Variable(tf.truncated_normal([vocab_size,embedding_size],stddev=1.0/math.sqrt(embedding_size))) b_out=tf.Variable(tf.zeros([vocab_size])) 
nce_loss=tf.nn.nce_loss(weights=w_out,biases=b_out,labels=correct,inputs=embed,num_sampled=FLAGS.num_sumpled,num_classes=vocab_size) loss=tf.reduce_mean(nce_loss) global_step=tf.Variable(0,name='global_step',trainable=False) train_op=tf.train.GradientDescentOptimizer(FLAGS.learning_rate).minimize(loss,global_step=global_step) init=tf.global_variables_initializer() saver=tf.train.Saver(max_to_keep=3) with tf.Session()as sess: ckpt_state=tf.train.get_checkpoint_state(FLAGS.log_dir) if ckpt_state: last_model=ckpt_state.model_checkpoint_path saver.restore(sess,last_model) print("model was loaded",last_model) else: sess.run(init) print("Initialized") last_step=sess.run(global_step) average_loss=0 for i in range(FLAGS.num_step): step=last_step+i+1 batch_inputs,batch_labels=data.create_next_batch(batch_size,FLAGS.num_skips,FLAGS.skip_window) feed_dict={inputs:batch_inputs,correct:batch_labels} _,loss_val=sess.run([train_op,loss],feed_dict=feed_dict) average_loss+=loss_val if step%100==0: average_loss/=100 print('Average loss at step',step,':',average_loss) average_loss=0 saver.save(sess,FLAGS.log_dir+'my_model.ckpt',step) if __name__ == '__main__': tf.app.run()
ここにより詳細な情報を記載してください。
回答2件
あなたの回答
tips
プレビュー
バッドをするには、ログインかつ
こちらの条件を満たす必要があります。