###前提・実現したいこと
ここに質問したいことを詳細に書いてください
alice_in_wonderland.txtというテキストファイルを用いてword2vecを試したところ、
エラーが出てしまい、原因がわかりません。
###発生している問題・エラーメッセージ
TypeError Traceback (most recent call last)
<ipython-input-15-9d4c438d877e> in <module>()
26 fin.close()
27
---> 28 sents = nltk.sent_tokenize(" ".join(lines))
29
30
TypeError: sequence item 0: expected str instance, bytes found
ここにご自身が実行したソースコードを書いてください
from __future__ import print_function from keras.models import Sequential from keras.layers.core import Dense, Dropout, Activation from keras.preprocessing.text import Tokenizer, one_hot from sklearn.metrics.pairwise import cosine_distances from sklearn.model_selection import train_test_split from sklearn.preprocessing import OneHotEncoder import matplotlib.pyplot as plt import nltk import numpy as np import operator np.random.seed(42) BATCH_SIZE = 128 NUM_EPOCHS = 20 lines = [] fin = open("alice_in_wonderland.txt", "rb") for line in fin: line = line.strip().decode("ascii", "ignore").encode("utf-8") if len(line) == 0: continue lines.append(line) fin.close() sents = nltk.sent_tokenize(" ".join(lines)) tokenizer = Tokenizer(5000) # use top 5000 words only tokens = tokenizer.fit_on_texts(sents) vocab_size = len(tokenizer.word_counts) + 1 xs = [] ys = [] for sent in sents: embedding = one_hot(sent, vocab_size) triples = list(nltk.trigrams(embedding)) w_lefts = [x[0] for x in triples] w_centers = [x[1] for x in triples] w_rights = [x[2] for x in triples] xs.extend(w_centers) ys.extend(w_lefts) xs.extend(w_centers) ys.extend(w_rights) ohe = OneHotEncoder(n_values=vocab_size) X = ohe.fit_transform(np.array(xs).reshape(-1, 1)).todense() Y = ohe.fit_transform(np.array(ys).reshape(-1, 1)).todense() Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=42) print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape) model = Sequential() model.add(Dense(300, input_shape=(Xtrain.shape[1],))) model.add(Activation("relu")) model.add(Dropout(0.5)) model.add(Dense(Ytrain.shape[1])) model.add(Activation("softmax")) model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]) history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, verbose=1, validation_data=(Xtest, Ytest)) plt.subplot(211) plt.title("accuracy") plt.plot(history.history["acc"], color="r", label="train") plt.plot(history.history["val_acc"], 
color="b", label="validation") plt.legend(loc="best") plt.subplot(212) plt.title("loss") plt.plot(history.history["loss"], color="r", label="train") plt.plot(history.history["val_loss"], color="b", label="validation") plt.legend(loc="best") plt.tight_layout() plt.show() score = model.evaluate(Xtest, Ytest, verbose=1) print("Test score: {:.3f}, accuracy: {:.3f}".format(score[0], score[1])) word2idx = tokenizer.word_index idx2word = {v:k for k, v in word2idx.items()} W, b = model.layers[0].get_weights() idx2emb = {} for word in word2idx.keys(): wid = word2idx[word] vec_in = ohe.fit_transform(np.array(wid)).todense() vec_emb = np.dot(vec_in, W) idx2emb[wid] = vec_emb for word in ["stupid", "alice", "succeeded"]: wid = word2idx[word] source_emb = idx2emb[wid] distances = [] for i in range(1, vocab_size): if i == wid: continue target_emb = idx2emb[i] distances.append(((wid, i), cosine_distances(source_emb, target_emb))) sorted_distances = sorted(distances, key=operator.itemgetter(1))[0:10] predictions = [idx2word[x[0][1]] for x in sorted_distances] print("{:s} => {:s}".format(word, ", ".join(predictions)))
###試したこと
###補足情報(言語/FW/ツール等のバージョンなど)
より詳細な情報
回答1件
あなたの回答
tips
プレビュー