
Question edit history

Revision 1: "Various changes"
Posted 2018/11/12 08:14 by a deleted user
title CHANGED
@@ -1,1 +1,1 @@
- Counting the number of English words
+ Counting the number of English words with nltk
body CHANGED
@@ -7,303 +7,73 @@
  ### Relevant source code

  ```python3.
- # -*- coding:utf-8 -*-
+ # -*- coding: utf-8 -*-
- # Review extraction: main script
+ # Write word frequencies to Excel

-
- from bs4 import BeautifulSoup
- import urllib.request
- import urllib.parse
- import random
+ import openpyxl
- from time import sleep
  import os
+ import juman
  import codecs
+ import glob
+ from collections import Counter

- import ssl
- ssl._create_default_https_context = ssl._create_unverified_context
-
- ### Get the URL of the "next" page 1
- ### URLs for every review page (1, 2, ... up to the last page)
- def Get_nextpage_url_jp(url):
+ def Juman_for_Frequency(spotname):
+     try:
-     links = []
+         word = [] # words
-     opener = urllib.request.build_opener()
-     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
-     html = opener.open(url)
-     soup = BeautifulSoup(html,"lxml")
-
-     spotname = soup.find("h1",{"class":"ui_header h1"}).get_text().strip() # specify the tag
-     links.append(url)
-     # URL of the next page (up to the last review page)
-     for url2 in links:
-         html = opener.open(url2)
-         soup = BeautifulSoup(html,"lxml")
          try:
-             link = soup.find("a",{"class":"nav next taLnk "}).get("href") # specify the tag
+             fr = codecs.open(spotname, "rb", "sjis", "ignore") # read the file
+             texts = fr.read().split("\n") # file contents as a list, one line per element
+             fr.close()
+             texts = list(filter(lambda a: a != "", texts)) # drop empty entries
+             debug = 0
+             for text in texts:
+                 debug +=1
-             links.append("https://www.tripadvisor.jp" + link)
+                 print("open_file = " + spotname + "lines = " + str(debug))
-         except:
-             try:
-                 link = soup.find("a",{"class":"nav next taLnk ui_button primary"}).get("href") # specify the tag
-                 links.append("https://www.tripadvisor.jp" + link)
+                 word = word + juman.keitaiso(text) # tokenized words
-             except:
-                 print("Not next")
+             counter = Counter(word) # count identical strings into counter
-     return links, spotname

- ### Get the review detail pages 2
- def Get_review_url_jp(url):
+         except Exception:
-     links = []
+             import traceback
-     opener = urllib.request.build_opener()
-     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
-     html = opener.open(url)
+             traceback.print_exc()
-     soup = BeautifulSoup(html,"lxml")

-     # get the URLs
-     for div in soup.find_all("div",{"class":"ui_column is-9"}):
-         try:
-             links.append("https://www.tripadvisor.jp" + div.a.get("href"))
-         except:
+     except Exception:
-             print("not url")
-     return links
-
- ### English 1
- def Get_review_url_en(url):
-     links = []
-     opener = urllib.request.build_opener()
-     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
-     html = opener.open(url)
-     soup = BeautifulSoup(html,"lxml")
-
-     # get the URLs
-     for div in soup.find_all("div",{"class":"ui_column is-9"}):
-         try:
-             links.append("https://www.tripadvisor.com" + div.a.get("href"))
-         except:
-             print("not url")
-     return links
-
- ### English 2
- def Get_nextpage_url_en(url):
-     links = []
-     opener = urllib.request.build_opener()
-     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
-     html = opener.open(url)
-     soup = BeautifulSoup(html,"lxml")
-
-     spotname = soup.find("h1",{"class":"ui_header h1"}).get_text().strip() # specify the tag
-     links.append(url)
-     # URL of the next page (up to the last review page)
-     for url2 in links:
-         html = opener.open(url2)
-         soup = BeautifulSoup(html,"lxml")
-         try:
-             link = soup.find("a",{"class":"nav next taLnk "}).get("href") # specify the tag
-             links.append("https://www.tripadvisor.com" + link)
-         except:
-             try:
-                 link = soup.find("a",{"class":"nav next taLnk ui_button primary"}).get("href") # specify the tag
-                 links.append("https://www.tripadvisor.com" + link)
-             except:
-                 print("Not next")
-     return links, spotname
-
-
-
- ### Get the reviews 3
- def Get_review_data(url):
-     try:
-         title = [] # title
-         review = [] # review
-         opener = urllib.request.build_opener()
-         opener.addheaders = [('User-agent', 'Mozilla/5.0')]
-         html = opener.open(url)
-         soup = BeautifulSoup(html, "lxml")
-
-         # extract one title
-         try:
-             t = soup.find("h1",{"class":"title"})
-             title = t.get_text().replace("\n","").replace("\r","").replace(",", "").strip()
-         except:
-             t = soup.find("span",{"class":"noQuotes"})
-             title = t.get_text().replace("\n","").replace("\r","").replace(",", "").strip()
-         # for url in soup.find_all("span",{"class":"noQuotes"}): # specify the required tag
-
-         # extract one review
-         try:
-             r = soup.find("span",{"class":"fullText "})
-             review = r.get_text().replace(",", "").replace("\n", "").replace("\r\n", "").replace("\r","").strip()
-         except:
-             r = soup.find("p",{"class":"partial_entry"})
-             review = r.get_text().replace(",", "").replace("\n", "").replace("\r\n", "").replace("\r","").strip()
-     except:
          import traceback
          traceback.print_exc()

-     return title, review # [string, string]
+     return counter


- ### Save everything retrieved to a csv file 4
- def Save_path(name):
-     path = os.getcwd() # location of this script
-     savdir = "\review\" # directory where "review" files are saved
-     if os.path.isdir(path + savdir) == False: # create the "review" folder if it does not exist
-         os.mkdir(path + savdir)
-     filename = name + ".csv"
-     filepath = path + savdir + filename
+ def Save_File(data, save_directory, save_filename):
+     # write to the file
+     wb = openpyxl.Workbook()
-     return filepath
+     ws = wb.active

- def Write_file(filepath, data):
-     # write to the file  w: overwrite, a: append
+     num = 1
-     file_object= codecs.open(filepath, "a", "cp932", "ignore")
+     for k,v in sorted(data.items(),key=lambda x:x[1],reverse=True):
+         key = []
-     file_object.write(str(data) + "\n")
+         key = k.split(",")
+         ws.cell(column=1, row=num).value=key[0] # word
+         ws.cell(column=2, row=num).value=key[1] # part of speech
-     file_object.close()
+         ws.cell(column=3, row=num).value=v # number of occurrences
+         num += 1

+     wb.save(save_directory + save_filename)
+     print("Save_Complete!")


+ if __name__ == '__main__':
+     path = os.getcwd() # location of this script
+     textrank_dir = "\textrank\" # directory where "textrank" output is saved
+     review_dir = "\review\" # the "review" directory
+     if os.path.isdir(path + textrank_dir) == False: # create the "textrank" folder if it does not exist
- ### Execution part _jp
+         os.mkdir(path + textrank_dir)
- def Start_extract_review_jp(start_url):
+     os.chdir("review/")
-     # initialize each value
-     title = []
-     review = []
-     links = []
-     urls = []

-     print("get_nextpage_url: Start")
-     try:
-         links, spotname = Get_nextpage_url_jp(start_url)
-     except:
-         print("get_nextpage_url is error")
-     print("Total Next url = " + str(len(links)))
-     print("get_review_url: Start")
+     csv_files = glob.glob("*.csv")
-     # one URL at a time
+     save_directory = path + textrank_dir
-     for link in links:
+     for spotname in csv_files:
-         try:
+         print(spotname)
-             urls.extend(Get_review_url_jp(link)) # required
+         get_data = Juman_for_Frequency(spotname)
-         except:
-             print("get_review_url is error")
+         Save_File(get_data, save_directory, spotname)
-     print("Total review = " + str(len(urls)))
-     # urls = number of reviews
-     print("Get_review_data: Start")
-     debug = 0

-     # set the filename and save to csv
-     try:
-         filepath = Save_path(spotname)
-     except:
-         print("Save_path is error")
-
-     #file_open
-     file_object= codecs.open(filepath, "a", "cp932", "ignore")
-     for url in urls:
-         debug += 1
-         print("review = " + str(debug))
-
-         interval = 5 + random.uniform(-3.0, 3.0)
-         print("interval 1: begin")
-         sleep(interval)
-         print("interval 1: end")
-
-         try:
-             title, review = Get_review_data(url)
-         except:
-             print("Get_review_data is error")
-
-         file_object.write("{}\s{}\n".format(str(title), str(review)))
-
-     file_object.close()
-     print("save: Complete")
-
- ### Execution part _jp
- def Start_extract_review_en(start_url):
-     # initialize each value
-     title = []
-     review = []
-     links = []
-     urls = []
-
-     print("get_nextpage_url: Start")
-     try:
-         links, spotname = Get_nextpage_url_en(start_url)
-     except:
-         print("get_nextpage_url is error")
-     print("Total Next url = " + str(len(links)))
-     print("get_review_url: Start")
-     # one URL at a time
-     for link in links:
-         try:
-             urls.extend(Get_review_url_en(link)) # required
-         except:
-             print("get_review_url is error")
-     print("Total review = " + str(len(urls)))
-     # urls = number of reviews
-     print("Get_review_data: Start")
-     debug = 0
-
-     # set the filename and save to csv
-     try:
-         filepath = Save_path(spotname)
-     except:
-         print("Save_path is error")
-
-     #file_open
-     file_object= codecs.open(filepath, "a", "cp932", "ignore")
-     for url in urls:
-         debug += 1
-         print("review = " + str(debug))
-
-         interval = 5 + random.uniform(-3.0, 3.0)
-         print("interval 1: begin")
-         sleep(interval)
-         print("interval 1: end")
-
-         try:
-             title, review = Get_review_data(url)
-         except:
-             print("Get_review_data is error")
-
-         file_object.write("{}\s{}\n".format(str(title), str(review)))
-
-     file_object.close()
-     print("save: Complete")
-
-
- if __name__ =='__main__':
-
-     """
-     #debug Get_next_url
-     # Get the URL of the "next" page 1
-     #url = "https://www.tripadvisor.jp/Attraction_Review-g1022838-d1548549-Reviews-Takosenbei_no_Sato-Awaji_Awaji_shima_Hyogo_Prefecture_Kinki.html" # first page
-     url = "https://www.tripadvisor.com/Attraction_Review-g1022838-d1548549-Reviews-Takosenbei_no_Sato-Awaji_Awaji_shima_Hyogo_Prefecture_Kinki.html"
-     links = Get_nextpage_url_jp(url)
-     print(links)
-     """
-
-     """
-     #debug Get_review_url
-     # Get the review detail pages 2
-     #url = "https://www.tripadvisor.jp/Attraction_Review-g298562-d1384635-Reviews-Suma_Rikyu_Park-Kobe_Hyogo_Prefecture_Kinki.html"
-     url = "https://www.tripadvisor.com/Attraction_Review-g1022838-d1548549-Reviews-Takosenbei_no_Sato-Awaji_Awaji_shima_Hyogo_Prefecture_Kinki.html"
-     links = Get_review_url(url)
-     print(links)
-     """
-
-     """
-     #debug Get_review_data
-     # Get the reviews 3
-     #url = "https://www.tripadvisor.jp/ShowUserReviews-g298562-d1384635-r631487360-Suma_Rikyu_Park-Kobe_Hyogo_Prefecture_Kinki.html"
-     #url = "https://www.tripadvisor.jp/ShowUserReviews-g1022838-d1548549-r631449250-Takosenbei_no_Sato-Awaji_Awaji_shima_Hyogo_Prefecture_Kinki.html"
-     url = "https://www.tripadvisor.com/ShowUserReviews-g1022838-d1548549-r536077279-Takosenbei_no_Sato-Awaji_Awaji_shima_Hyogo_Prefecture_Kinki.html"
-     title, review = Get_review_data(url)
-     print("{}\s{}".format(title,review))
-     """
-
-     #"""
-     ### entered by hand ###
-     url_jp = "https://www.tripadvisor.jp/Attraction_Review-g1121309-d5017432-Reviews-Amarube_Railroad_Bridge_Sorano_Eki-Kami_cho_Mikata_gun_Hyogo_Prefecture_Kinki.html"
-     ############
-
-     url_en = url_jp.replace("https://www.tripadvisor.jp/", "https://www.tripadvisor.com/")
-
-     Start_extract_review_jp(url_jp)
-     Start_extract_review_en(url_en)
-     #"""
-
  ```
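
The revised title asks about counting English words with nltk, while the revised code above tokenizes with juman. As a rough sketch only, not code from the question: the same read, tokenize, count flow could use nltk's `word_tokenize` in place of `juman.keitaiso`. The file name `review.csv`, the UTF-8 encoding, and the helper name `count_words` below are assumptions for illustration; nltk's tokenizer data must be fetched once with `nltk.download('punkt')` before `word_tokenize` will run.

```python
# -*- coding: utf-8 -*-
# Rough sketch (assumption, not from the question): count English word
# frequencies with nltk instead of juman.
# Requires: pip install nltk, then nltk.download('punkt') once beforehand.
from collections import Counter

import nltk


def count_words(path):
    # Read the file; "review.csv" and the UTF-8 encoding are assumptions.
    with open(path, "r", encoding="utf-8", errors="ignore") as fr:
        texts = [line for line in fr.read().split("\n") if line != ""]

    words = []
    for text in texts:
        # nltk.word_tokenize splits an English sentence into word tokens.
        words.extend(nltk.word_tokenize(text.lower()))

    # Counter maps each token to its number of occurrences.
    return Counter(words)


if __name__ == "__main__":
    counter = count_words("review.csv")          # hypothetical input file
    for word, freq in counter.most_common(20):   # 20 most frequent words
        print(word, freq)
```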