pythonのプログラムが途中で動かなくなる

以前と同じリンク内容の内容なのですが

#Doc2Vecで文書を学習させるコードを書いていこう。まずは必要ライブラリをimportする。

import sys
import glob
import traceback
from os import listdir, path
from pyknp import Jumanpp
from gensim import models
from gensim.models.doc2vec import LabeledSentence

#次に、記事ファイルをダウンロードしたディレクトリから取得する関数を定義する。

def corpus_files(base_dir):
    findPath = path.join( base_dir, 'text/**/*.txt') # 「text」まで呼出元で指定させるほうがスマートだが
    paths = glob.glob( findPath,recursive=True)

    docs = []
    ignore = ('LICENSE.txt','README.txt','CHANGES.txt')
    for p in paths:
        if path.basename(p) not in ignore:
            docs.append(p)
    return docs

#そして、記事コンテンツをパスから取得する関数を定義する。

def read_document(path):
    with open(path, 'r',encoding="utf-8") as f:
        return f.read()

#先程インストールした、JUMAN++を使って記事を単語リストに変換する関数を定義しよう。

def split_into_words(text):
    result = Jumanpp().analysis(text)
    return [mrph.midasi for mrph in result.mrph_list()]

#次に、記事コンテンツを単語に分割して、Doc2Vecの入力に使うLabeledSentenceに変換する関数を定義しよう。

def doc_to_sentence(doc, name):
    words = split_into_words(doc)
    return LabeledSentence(words=words, tags=[name])

#これらの関数を組み合わせて、記事のパスリストから、記事コンテンツに変換し、単語分割して、センテンスのジェネレーターを返す関数を定義する。

def corpus_to_sentences(corpus):
    docs   = [read_document(x) for x in corpus]
    for idx, (doc, name) in enumerate(zip(docs, corpus)):
        print('\rPreprocessing {}/{}'.format(idx, len(corpus)))
        try:
        	yield doc_to_sentence(doc, name)
        except:
        	print( traceback.format_exc() )
        finally:
        	print( 'end' ) 

#最後に、Doc2Vecパラメータを渡して、学習させよう。

print( corpus_files('./') )
corpus = corpus_files('./')
sentences = corpus_to_sentences(corpus)

model = models.Doc2Vec(sentences, dm=0, size=300, window=15, alpha=.025,
        min_alpha=.025, min_count=1, sample=1e-6)

print('STRART')
for epoch in range(20):
    print('Epoch: {}'.format(epoch + 1))
    model.train(sentences)
    model.alpha -= (0.025 - 0.0001) / 19
    model.min_alpha = model.alpha

#ここで、デフォルトで設定されているdmに1を設定するとdmpvで学習されることになる。1以外であれば、DBoWで学習される。

#モデルの保存と読み込みは以下のようにして、saveメソッドとloadメソッドにファイル名を指定する。

model.save(r'C:/cygwin64/home/hoge/doc2vec.model')
model = models.Doc2Vec.load('doc2vec.model')
print('END')

このコードの部分で途中で止まってしまいます。

def corpus_to_sentences(corpus):
    docs   = [read_document(x) for x in corpus]
    for idx, (doc, name) in enumerate(zip(docs, corpus)):
        print('\rPreprocessing {}/{}'.format(idx, len(corpus)))
        try:
        	yield doc_to_sentence(doc, name)
        except:
        	print( traceback.format_exc() )
        finally:
        	print( 'end' )

合っているかわかりませんが、一応エラーが出たのかを確認するため、tryで囲んで、printで出力するようにしたのですが、errorは特に出力されずに途中まで実行した後止まって動かなくなります

タスクマネージャーでプロセスが動いているか確認しましたが、CPU使用率やディスク使用率も100%で動かなくなっているということもなく通常の状態でした。

行動規範の内容に同意します

回答1件

ベストアンサー

https://www.sejuku.net/blog/23044

エラーを受け取って書き出してください。

python
1except Exception as e:
2    print(e)

エラーがあった場合yieldしていないのですが、sentencesはNoneになりませんか？

投稿2018/01/04 08:46

mkgrei

総合スコア8560

bigbox267

2018/01/04 09:48

cygwinだと何も出ずにずっと止まったままでしたが、コマンドプロンプトでやるとエラーが表示されましたしかし、今度はこんな感じのエラーが出ていて結局全部失敗していました Preprocessing 5342/7367 [Errno 32] Broken pipe Exception ignored in: <bound method Subprocess.__del__ of <pyknp.juman.juman.Subprocess object at 0x0000000101307F28>> Traceback (most recent call last): File "C:\Users\hoge\AppData\Local\Programs\Python\Python35\lib\site-packages\pyknp-0.3-py3.5.egg\pyknp\juman\juman.py", line 59, in __del__ OSError: [Errno 22] Invalid argument Preprocessing 7366/7367 'utf-8' codec can't decode byte 0x82 in position 7: invalid start byte end Traceback (most recent call last): File "main.py", line 62, in <module> min_alpha=.025, min_count=1, sample=1e-6) File "C:\Users\hoge\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\models\doc2vec.py", line 660, in __init__ self.train(documents, total_examples=self.corpus_count, epochs=self.iter) File "C:\Users\hoge\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\models\word2vec.py", line 951, in train raise RuntimeError("you must first build vocabulary before training the model") RuntimeError: you must first build vocabulary before training the model Unicode型をバイト型に変換できなかった？ということなのでしょうか def read_document(path): with open(path, 'r',encoding="utf-8") as f: return f.read() このコードutf-8でエンコーディングするという指定をするだけではだめなのですか?