Question edit history

1

Various changes

2018/11/12 08:14

Posted

Withdrawn user
test CHANGED
@@ -1 +1 @@
- Counting the number of English words
+ Counting the number of English words with nltk
test CHANGED
@@ -16,259 +16,65 @@
  ```python3
- # -*- coding:utf-8 -*-
+ # -*- coding: utf-8 -*-
- # Review extraction script
+ # Write word frequencies to Excel
-
- from bs4 import BeautifulSoup
- import urllib.request
- import urllib.parse
- import random
+ import openpyxl
- from time import sleep
  import os
+ import juman
  import codecs
+ import glob
+ from collections import Counter
-
- import ssl
+ def Juman_for_Frequency(spotname):
+     try:
+         word = [] # words
+         try:
+             fr = codecs.open(spotname, "rb", "sjis", "ignore") # open the file for reading
+             texts = fr.read().split("\n") # read the file into a list, one line per element
+             fr.close()
- ssl._create_default_https_context = ssl._create_unverified_context
+             texts = list(filter(lambda a: a != "", texts)) # remove empty elements
+             debug = 0
+             for text in texts:
+                 debug +=1
+                 print("open_file = " + spotname + "lines = " + str(debug))
+                 word = word + juman.keitaiso(text) # tokenized words
+             counter = Counter(word) # count identical strings into counter
-
- ### Get the URL of the "next" page (1)
+         except Exception:
- ### URLs for every review page (1, 2, ... up to the last page)
+             import traceback
- def Get_nextpage_url_jp(url):
-     links = []
-     opener = urllib.request.build_opener()
-     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
-     html = opener.open(url)
+             traceback.print_exc()
-     soup = BeautifulSoup(html,"lxml")
-
-     spotname = soup.find("h1",{"class":"ui_header h1"}).get_text().strip() # tag selector
-     links.append(url)
-     # URLs of the following pages (up to the last review)
-     for url2 in links:
-         html = opener.open(url2)
-         soup = BeautifulSoup(html,"lxml")
-         try:
-             link = soup.find("a",{"class":"nav next taLnk "}).get("href") # tag selector
-             links.append("https://www.tripadvisor.jp" + link)
-         except:
+     except Exception:
-             try:
-                 link = soup.find("a",{"class":"nav next taLnk ui_button primary"}).get("href") # tag selector
-                 links.append("https://www.tripadvisor.jp" + link)
-             except:
-                 print("Not next")
-     return links, spotname
-
- ### Get the review detail pages (2)
- def Get_review_url_jp(url):
-     links = []
-     opener = urllib.request.build_opener()
-     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
-     html = opener.open(url)
-     soup = BeautifulSoup(html,"lxml")
-
-     # collect the URLs
-     for div in soup.find_all("div",{"class":"ui_column is-9"}):
-         try:
-             links.append("https://www.tripadvisor.jp" + div.a.get("href"))
-         except:
-             print("not url")
-     return links
-
- ### English (1)
- def Get_review_url_en(url):
-     links = []
-     opener = urllib.request.build_opener()
-     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
-     html = opener.open(url)
-     soup = BeautifulSoup(html,"lxml")
-
-     # collect the URLs
-     for div in soup.find_all("div",{"class":"ui_column is-9"}):
-         try:
-             links.append("https://www.tripadvisor.com" + div.a.get("href"))
-         except:
-             print("not url")
-     return links
-
- ### English (2)
- def Get_nextpage_url_en(url):
-     links = []
-     opener = urllib.request.build_opener()
-     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
-     html = opener.open(url)
-     soup = BeautifulSoup(html,"lxml")
-
-     spotname = soup.find("h1",{"class":"ui_header h1"}).get_text().strip() # tag selector
-     links.append(url)
-     # URLs of the following pages (up to the last review)
-     for url2 in links:
-         html = opener.open(url2)
-         soup = BeautifulSoup(html,"lxml")
-         try:
-             link = soup.find("a",{"class":"nav next taLnk "}).get("href") # tag selector
-             links.append("https://www.tripadvisor.com" + link)
-         except:
-             try:
-                 link = soup.find("a",{"class":"nav next taLnk ui_button primary"}).get("href") # tag selector
-                 links.append("https://www.tripadvisor.com" + link)
-             except:
-                 print("Not next")
-     return links, spotname
-
- ### Get the reviews (3)
- def Get_review_data(url):
-     try:
-         title = [] # title
-         review = [] # review
-         opener = urllib.request.build_opener()
-         opener.addheaders = [('User-agent', 'Mozilla/5.0')]
-         html = opener.open(url)
-         soup = BeautifulSoup(html, "lxml")
-
-         # extract one title
-         try:
-             t = soup.find("h1",{"class":"title"})
-             title = t.get_text().replace("\n","").replace("\r","").replace(",", "").strip()
-         except:
-             t = soup.find("span",{"class":"noQuotes"})
-             title = t.get_text().replace("\n","").replace("\r","").replace(",", "").strip()
-         # for url in soup.find_all("span",{"class":"noQuotes"}): # required tag
-
-         # extract one review
-         try:
-             r = soup.find("span",{"class":"fullText "})
-             review = r.get_text().replace(",", "").replace("\n", "").replace("\r\n", "").replace("\r","").strip()
-         except:
-             r = soup.find("p",{"class":"partial_entry"})
-             review = r.get_text().replace(",", "").replace("\n", "").replace("\r\n", "").replace("\r","").strip()
-     except:
          import traceback
@@ -276,341 +82,75 @@
-     return title, review # [string, string]
+     return counter
-
- ### Save everything retrieved to a csv file (4)
+ def Save_File(data, save_directory, save_filename):
- def Save_path(name):
+     # write to the file
-     path = os.getcwd() # location of this program
+     wb = openpyxl.Workbook()
-     savdir = "\review\" # directory for saving "review"
-     if os.path.isdir(path + savdir) == False: # create the "review" folder if it does not exist
-         os.mkdir(path + savdir)
+     ws = wb.active
-     filename = name + ".csv"
-     filepath = path + savdir + filename
-     return filepath
-
- def Write_file(filepath, data):
+     num = 1
-     # write to the file  w: overwrite  a: append
+     for k,v in sorted(data.items(),key=lambda x:x[1],reverse=True):
-     file_object= codecs.open(filepath, "a", "cp932", "ignore")
+         key = []
-     file_object.write(str(data) + "\n")
+         key = k.split(",")
+         ws.cell(column=1, row=num).value=key[0] # word
+         ws.cell(column=2, row=num).value=key[1] # part of speech
+         ws.cell(column=3, row=num).value=v # number of occurrences
+         num += 1
+
+     wb.save(save_directory + save_filename)
-     file_object.close()
+     print("Save_Complete!")
+ if __name__ == '__main__':
+     path = os.getcwd() # location of this program
- ### Execution block _jp
+     textrank_dir = "\textrank\" # directory for saving "textrank"
- def Start_extract_review_jp(start_url):
+     review_dir = "\review\" # the "review" directory
-     # initialize the variables
+     if os.path.isdir(path + textrank_dir) == False: # create the "textrank" folder if it does not exist
-     title = []
+         os.mkdir(path + textrank_dir)
-     review = []
+     os.chdir("review/")
-     links = []
-     urls = []
-
-     print("get_nextpage_url: Start")
+     csv_files = glob.glob("*.csv")
-     try:
+     save_directory = path + textrank_dir
-         links, spotname = Get_nextpage_url_jp(start_url)
+     for spotname in csv_files:
-     except:
+         print(spotname)
-         print("get_nextpage_url is error")
+         get_data = Juman_for_Frequency(spotname)
-     print("Total Next url = " + str(len(links)))
-     print("get_review_url: Start")
-     # one URL at a time
-     for link in links:
-         try:
-             urls.extend(Get_review_url_jp(link)) # required
-         except:
-             print("get_review_url is error")
-     print("Total review = " + str(len(urls)))
-     # urls = number of reviews
-     print("Get_review_data: Start")
-     debug = 0
-
-     # save to csv under the given filename
-     try:
-         filepath = Save_path(spotname)
+         Save_File(get_data, save_directory, spotname)
-     except:
-         print("Save_path is error")
-
-     #file_open
-     file_object= codecs.open(filepath, "a", "cp932", "ignore")
-     for url in urls:
-         debug += 1
-         print("review = " + str(debug))
-
-         interval = 5 + random.uniform(-3.0, 3.0)
-         print("interval 1: begin")
-         sleep(interval)
-         print("interval 1: end")
-
-         try:
-             title, review = Get_review_data(url)
-         except:
-             print("Get_review_data is error")
-
-         file_object.write("{}\s{}\n".format(str(title), str(review)))
-
-     file_object.close()
-     print("save: Complete")
-
- ### Execution block _jp
- def Start_extract_review_en(start_url):
-     # initialize the variables
-     title = []
-     review = []
-     links = []
-     urls = []
-
-     print("get_nextpage_url: Start")
-     try:
-         links, spotname = Get_nextpage_url_en(start_url)
-     except:
-         print("get_nextpage_url is error")
-     print("Total Next url = " + str(len(links)))
-     print("get_review_url: Start")
-     # one URL at a time
-     for link in links:
-         try:
-             urls.extend(Get_review_url_en(link)) # required
-         except:
-             print("get_review_url is error")
-     print("Total review = " + str(len(urls)))
-     # urls = number of reviews
-     print("Get_review_data: Start")
-     debug = 0
-
-     # save to csv under the given filename
-     try:
-         filepath = Save_path(spotname)
-     except:
-         print("Save_path is error")
-
-     #file_open
-     file_object= codecs.open(filepath, "a", "cp932", "ignore")
-     for url in urls:
-         debug += 1
-         print("review = " + str(debug))
-
-         interval = 5 + random.uniform(-3.0, 3.0)
-         print("interval 1: begin")
-         sleep(interval)
-         print("interval 1: end")
-
-         try:
-             title, review = Get_review_data(url)
-         except:
-             print("Get_review_data is error")
-
-         file_object.write("{}\s{}\n".format(str(title), str(review)))
-
-     file_object.close()
-     print("save: Complete")
-
- if __name__ =='__main__':
-
-     """
-     #debug Get_next_url
-     # Get the URL of the "next" page (1)
-     #url = "https://www.tripadvisor.jp/Attraction_Review-g1022838-d1548549-Reviews-Takosenbei_no_Sato-Awaji_Awaji_shima_Hyogo_Prefecture_Kinki.html" # first page
-     url = "https://www.tripadvisor.com/Attraction_Review-g1022838-d1548549-Reviews-Takosenbei_no_Sato-Awaji_Awaji_shima_Hyogo_Prefecture_Kinki.html"
-     links = Get_nextpage_url_jp(url)
-     print(links)
-     """
-
-     """
-     #debug Get_review_url
-     # Get the review detail pages (2)
-     #url = "https://www.tripadvisor.jp/Attraction_Review-g298562-d1384635-Reviews-Suma_Rikyu_Park-Kobe_Hyogo_Prefecture_Kinki.html"
-     url = "https://www.tripadvisor.com/Attraction_Review-g1022838-d1548549-Reviews-Takosenbei_no_Sato-Awaji_Awaji_shima_Hyogo_Prefecture_Kinki.html"
-     links = Get_review_url(url)
-     print(links)
-     """
-
-     """
-     #debug Get_review_data
-     # Get the reviews (3)
-     #url = "https://www.tripadvisor.jp/ShowUserReviews-g298562-d1384635-r631487360-Suma_Rikyu_Park-Kobe_Hyogo_Prefecture_Kinki.html"
-     #url = "https://www.tripadvisor.jp/ShowUserReviews-g1022838-d1548549-r631449250-Takosenbei_no_Sato-Awaji_Awaji_shima_Hyogo_Prefecture_Kinki.html"
-     url = "https://www.tripadvisor.com/ShowUserReviews-g1022838-d1548549-r536077279-Takosenbei_no_Sato-Awaji_Awaji_shima_Hyogo_Prefecture_Kinki.html"
-     title, review = Get_review_data(url)
-     print("{}\s{}".format(title,review))
-     """
-
-     #"""
-     ### entered by hand ###
-     url_jp = "https://www.tripadvisor.jp/Attraction_Review-g1121309-d5017432-Reviews-Amarube_Railroad_Bridge_Sorano_Eki-Kami_cho_Mikata_gun_Hyogo_Prefecture_Kinki.html"
-     ############
-
-     url_en = url_jp.replace("https://www.tripadvisor.jp/", "https://www.tripadvisor.com/")
-
-     Start_extract_review_jp(url_jp)
-     Start_extract_review_en(url_en)
-     #"""
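Note: the revised title refers to counting English words with nltk, while the code pasted above tokenizes with juman and writes the counts to Excel. For reference, a minimal sketch of what an nltk-based English word count might look like is shown below; the function name count_english_words, the sample file name review.csv, and the use of nltk.word_tokenize with collections.Counter are illustrative assumptions, not part of the original question.

```python
# Minimal sketch (assumption): counting English word frequencies with nltk.
import codecs
from collections import Counter

import nltk

nltk.download("punkt")  # tokenizer model used by word_tokenize; only needed once


def count_english_words(path):
    # read the whole file, ignoring undecodable bytes
    with codecs.open(path, "r", "utf-8", "ignore") as fr:
        text = fr.read()
    # keep alphabetic tokens only, lower-cased, and count them
    words = [w.lower() for w in nltk.word_tokenize(text) if w.isalpha()]
    return Counter(words)


if __name__ == "__main__":
    counter = count_english_words("review.csv")  # hypothetical input file
    for word, freq in counter.most_common(20):
        print(word, freq)
```

The resulting Counter could then be written out with openpyxl much like Save_File above, although the keys here are plain words rather than the word/part-of-speech pairs that Save_File expects.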