回答編集履歴
1
修正コードを追加
test
CHANGED
@@ -83,3 +83,83 @@
|
|
83
83
|
|
84
84
|
|
85
85
|
のようにするとよいのではないでしょうか。
|
86
|
+
|
87
|
+
|
88
|
+
|
89
|
+
---
|
90
|
+
|
91
|
+
|
92
|
+
|
93
|
+
**【修正コード】**
|
94
|
+
|
95
|
+
```Python
|
96
|
+
|
97
|
+
from pymongo import MongoClient
|
98
|
+
|
99
|
+
from bs4 import BeautifulSoup
|
100
|
+
|
101
|
+
import MeCab
|
102
|
+
|
103
|
+
from gensim.models import word2vec
|
104
|
+
|
105
|
+
|
106
|
+
|
107
|
+
# Shared morphological analyser used by main().
# MeCab.Tagger takes a command-line style option string, so the dictionary
# directory must be introduced with "-d"; a bare path is not treated as the
# dictionary location and the default dictionary would be used instead.
mecab = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
|
108
|
+
|
109
|
+
def main():
    """Train word2vec on nouns from stored recipe pages and print neighbours.

    Reads up to 100 documents from the ``cookpad_html`` collection of the
    ``html`` database on the local MongoDB server, takes the text of every
    element whose class is ``step_text``, keeps the nouns reported by the
    module-level ``mecab`` tagger, trains a Word2Vec model on the resulting
    per-step token lists, and prints the words most similar to 'チョコ'
    together with their similarity scores.
    """
    # One token list per recipe step. Word2Vec expects an iterable of
    # tokenised sentences; a flat list of strings would be read as
    # "sentences" made of single characters, so whole words such as
    # 'チョコ' would never enter the vocabulary.
    sentences = []

    # The context manager closes the connection once the pages are read.
    with MongoClient('localhost', 27017) as client:
        pages = client.html.cookpad_html
        # Iterate the cursor directly; no need to build a throwaway list.
        for page in pages.find().limit(100):
            soup = BeautifulSoup(page["html"], 'lxml')
            for step in soup.find_all(attrs={"class": "step_text"}):
                nouns = []
                node = mecab.parseToNode(step.get_text())
                while node:
                    features = node.feature.split(",")
                    # Field 0 is the part of speech. Field 6 is assumed to be
                    # the base form (IPADIC-style feature layout -- confirm
                    # for the dictionary in use). Entries with fewer fields,
                    # or with "*" as a placeholder, carry no usable word.
                    if (
                        features[0] == '名詞'
                        and len(features) > 6
                        and features[6] != '*'
                    ):
                        nouns.append(features[6])
                    node = node.next
                if nouns:
                    sentences.append(nouns)

    # Show the distinct nouns that were collected (sorted for stable output).
    # Repeats are kept in `sentences` itself because word2vec learns from
    # how often words occur together.
    recipes = sorted({word for sentence in sentences for word in sentence})
    print(recipes)

    if not sentences:
        # Word2Vec cannot build a vocabulary from an empty corpus.
        print('no nouns found; nothing to train on')
        return

    # NOTE(review): `size=` and `model.most_similar` are kept as in the
    # original; they match the older gensim API this script was written for.
    model = word2vec.Word2Vec(sentences, size=200, min_count=1)

    out = model.most_similar(positive=[u'チョコ'])
    for word, score in out:
        print(word, score)
|
156
|
+
|
157
|
+
|
158
|
+
|
159
|
+
|
160
|
+
|
161
|
+
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
164
|
+
|
165
|
+
```
|