回答編集履歴

1

修正コードを追加

2019/10/07 05:13

投稿

magichan
magichan

スコア15898

test CHANGED
@@ -83,3 +83,83 @@
83
83
 
84
84
 
85
85
  のようにするとよいのではないでしょうか。
86
+
87
+
88
+
89
+ ---
90
+
91
+
92
+
93
+ **【修正コード】**
94
+
95
+ ```Python
96
+
97
+ from pymongo import MongoClient
98
+
99
+ from bs4 import BeautifulSoup
100
+
101
+ import MeCab
102
+
103
+ from gensim.models import word2vec
104
+
105
+
106
+
107
+ mecab = MeCab.Tagger ('/usr/local/lib/mecab/dic/mecab-ipadic-neologd')
108
+
109
+ def main():
110
+
111
+ recipes = []
112
+
113
+ client = MongoClient('localhost', 27017)
114
+
115
+ db = client.html.cookpad_html
116
+
117
+ collection = db.test_collection
118
+
119
+ htmls = list(db.find().limit(100))
120
+
121
+ recipes = []
122
+
123
+ for num, html in enumerate(htmls):
124
+
125
+ soup = BeautifulSoup(html["html"], 'lxml')
126
+
127
+ for steps in soup.find_all(attrs={"class": "step_text"}):
128
+
129
+ node = mecab.parseToNode(steps.get_text())
130
+
131
+
132
+
133
+ while node:
134
+
135
+ if node.feature.split(",")[0] == '名詞':
136
+
137
+ recipes.append(node.feature.split(",")[6])
138
+
139
+ node = node.next
140
+
141
+ recipes = list(set(recipes))
142
+
143
+ print(recipes)
144
+
145
+
146
+
147
+ model = word2vec.Word2Vec(recipes, size=200,min_count=1)
148
+
149
+
150
+
151
+ out = model.most_similar(positive=[u'チョコ'])
152
+
153
+ for x in out:
154
+
155
+ print (x[0],x[1])
156
+
157
+
158
+
159
+
160
+
161
+ if __name__ == '__main__':
162
+
163
+ main()
164
+
165
+ ```