I am working on natural language processing to run sentiment analysis on Twitter posts, and I vectorize each tweet with a bag of words.
I am trying to reduce the dimensionality of the bag-of-words document vectors with LSI, but for some reason the vectors do not come out with the number of dimensions I specify, and an error is raised during training.
I cannot figure out why, so I am asking here.
Flowchart of what I am trying to do:
Fetch tweets from the DB
↓
Morphological analysis
↓
Vectorize with bag of words
↓
Reduce to 150 dimensions with LSI ← specifying the number of dimensions does not work here
↓
Train / predict
Error message:

```
Exception in main training loop: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 150 and the array at index 14 has size 149
Traceback (most recent call last):
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/training/trainer.py", line 319, in run
    entry.extension(self)
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/training/extensions/evaluator.py", line 161, in __call__
    result = self.evaluate()
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/training/extensions/evaluator.py", line 216, in evaluate
    self.converter, batch, self.device)
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/dataset/convert.py", line 73, in _call_converter
    return converter(batch, device)
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/dataset/convert.py", line 58, in wrap_call
    return func(*args, **kwargs)
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/dataset/convert.py", line 223, in concat_examples
    [example[i] for example in batch], padding[i])))
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/dataset/convert.py", line 254, in _concat_arrays
    [array[None] for array in arrays])
  File "<__array_function__ internals>", line 6, in concatenate
Will finalize trainer extensions and updater before reraising the exception.
```

#### Error saying training cannot run at trainer.run() because the input vectors do not all have the same number of dimensions ####
```
Traceback (most recent call last):
  File "train_review.py", line 171, in <module>
    trainer.run()
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/training/trainer.py", line 349, in run
    six.reraise(*exc_info)
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/six.py", line 693, in reraise
    raise value
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/training/trainer.py", line 319, in run
    entry.extension(self)
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/training/extensions/evaluator.py", line 161, in __call__
    result = self.evaluate()
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/training/extensions/evaluator.py", line 216, in evaluate
    self.converter, batch, self.device)
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/dataset/convert.py", line 73, in _call_converter
    return converter(batch, device)
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/dataset/convert.py", line 58, in wrap_call
    return func(*args, **kwargs)
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/dataset/convert.py", line 223, in concat_examples
    [example[i] for example in batch], padding[i])))
  File "/Users/kojimakazuya/anaconda3/lib/python3.7/site-packages/chainer/dataset/convert.py", line 254, in _concat_arrays
    [array[None] for array in arrays])
  File "<__array_function__ internals>", line 6, in concatenate
ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 150 and the array at index 14 has size 149
```
As the traceback above shows, some of the vectors end up with 149 dimensions here and there (the dataset has 6,539 tweets in total), so the dimensions are not consistent and training fails.
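My assumption is that this happens because gensim returns the LSI-transformed corpus in sparse (topic_id, value) format, so topics whose weight is (almost) zero are simply omitted and a document can have fewer than num_topics entries. A quick diagnostic sketch I could run to spot the short documents (it uses the lsi_corpus variable from the code below):

```python
# Diagnostic sketch: count the (topic_id, value) pairs gensim returns per tweet.
# Because the transformed corpus is sparse, a document can come back with fewer
# than num_topics entries (e.g. 149 instead of 150) when a topic weight is ~0.
for i, doc in enumerate(lsi_corpus):
    if len(doc) != 150:
        print(i, len(doc))
```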
Relevant source code (train_review.py):

```python
import db
import pandas as pd
import numpy as np
import time
import MeCab
import datetime
import pymysql.cursors
import gensim
import mlp
import random
from gensim import corpora, matutils
import pickle
import screaning
from chainer import serializers

# Create a cursor for DB operations
cursor = db.conn.cursor()

try:
    cursor.execute('select label, sentence from tweet_to_sentence where id <= 7000')
    result = cursor.fetchall()
finally:
    cursor.close()
    db.conn.close()

mecab = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
mecab.parse('')  # prevent the string from being garbage-collected

words_list = []
label = []

# Morphological analysis (build the word list for BoW)
for sentence in result:
    sentence_label = sentence["label"]
    words = sentence["sentence"]
    node = mecab.parseToNode(words)
    word_list = []
    while node:
        if node.feature.split(",")[0] == u"名詞" and node.feature.split(",")[1] != u"固有名詞":
            if node.feature.split(",")[1] != u"数" and len(node.feature.split(",")[6]) >= 2:
                word_list.append(node.surface)
        elif node.feature.split(",")[0] == u"形容詞":
            word_list.append(node.feature.split(",")[6])
        elif node.feature.split(",")[0] == u"動詞":
            word_list.append(node.feature.split(",")[6])
        elif node.feature.split(",")[6] == u"ない":
            word_list.append(node.feature.split(",")[6])
        node = node.next

    # Skip tweets with no extracted words
    if not word_list:
        continue

    label.append(sentence_label)
    words_list.append(word_list)

# Vectorization with BoW
dictionary = corpora.Dictionary(words_list)
num = []
tweet_vec = []

bow_corpus = [dictionary.doc2bow(d) for d in words_list]
tfidf_model = gensim.models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf_model[bow_corpus]

lsi_model = gensim.models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=150)  # the number of dimensions is specified here, but...??

lsi_corpus = lsi_model[tfidf_corpus]

for i in range(len(lsi_corpus)):
    w_vec = []
    for vec in lsi_corpus[i]:
        w_vec.append(list(vec)[1])  # keep only the value of each (topic_id, value) pair
    w_vec = np.array(w_vec, dtype="float32")
    # w_vec = w_vec.reshape([13,13])
    # w_vec = w_vec[np.newaxis,:,:]
    tweet_vec.append(w_vec)
tweet_vec = np.array(tweet_vec)


# Undersampling
data_set = list(zip(tweet_vec, label))
data1 = []
data2 = []
for i in range(len(data_set)):
    if data_set[i][1] == 0:
        data1.append(data_set[i])
    elif data_set[i][1] == 1:
        data2.append(data_set[i])

data1 = random.sample(data1, len(data2))
x1, t1 = list(zip(*data1))
x2, t2 = list(zip(*data2))

train_tweet = x1[:int(len(x1)*0.8)] + x2[:int(len(x2)*0.8)]
test_tweet = x1[int(len(x1)*0.8):] + x2[int(len(x2)*0.8):]
train_label = t1[:int(len(t1)*0.8)] + t2[:int(len(t2)*0.8)]
test_label = t1[int(len(t1)*0.8):] + t2[int(len(t2)*0.8):]

# Training (MLP)
train = list(zip(train_tweet, train_label))
test = list(zip(test_tweet, test_label))
batchsize = 30
train_iter = mlp.iterators.SerialIterator(train, batchsize, shuffle=True, repeat=True)
test_iter = mlp.iterators.SerialIterator(test, batchsize, shuffle=True, repeat=False)

model = mlp.MLP()
net = mlp.L.Classifier(model)
optimizer = mlp.optimizers.MomentumSGD(lr=0.01).setup(net)
updater = mlp.training.StandardUpdater(train_iter, optimizer, device=-1)  # device=-1 means run on the CPU

epoch = 100
trainer = mlp.training.Trainer(updater, (epoch, 'epoch'), out='result')

# Evaluate on the test data
trainer.extend(mlp.extensions.Evaluator(test_iter, net, device=-1))

# Log the training progress
trainer.extend(mlp.extensions.LogReport(trigger=(1, 'epoch')))
trainer.extend(mlp.extensions.PrintReport(['epoch', 'main/loss', 'main/accuracy', 'validation/main/loss', 'validation/main/accuracy', 'elapsed_time']), trigger=(1, 'epoch'))

trainer.run()

# Save the model
serializers.save_npz("mlpmodel_review.npz", net)
```
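If the cause really is the sparse output dropping near-zero topic weights, one workaround I have not yet verified would be to densify the LSI output with gensim.matutils.corpus2dense instead of the manual conversion loop above, so that every tweet gets exactly 150 values. A minimal sketch of that idea:

```python
# Sketch (untested): densify the LSI output to a fixed length of 150.
# corpus2dense returns a (num_terms, num_docs) matrix, so transpose to get one
# row per tweet; topics omitted from the sparse output are filled with 0.
dense_vecs = matutils.corpus2dense(lsi_corpus, num_terms=150).T.astype("float32")
tweet_vec = list(dense_vecs)  # would replace tweet_vec built by the loop above
```

Would this be the right way to get a consistent 150-dimensional input, or is there something wrong with how I specify num_topics in the first place?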