質問編集履歴
4
表の追加
test
CHANGED
File without changes
|
test
CHANGED
@@ -410,6 +410,22 @@
|
|
410
410
|
|
411
411
|
|
412
412
|
|
413
|
+
CSVファイルではこういう風に出てきました。
|
414
|
+
|
415
|
+
|
416
|
+
|
417
|
+
|
418
|
+
|
419
|
+
![![イメージ説明](680d7e72401214c6ea6c8d809472850d.png)](17a78b7dd10e37441d3197967c5cf92a.png)
|
420
|
+
|
421
|
+
|
422
|
+
|
423
|
+
|
424
|
+
|
425
|
+
|
426
|
+
|
427
|
+
|
428
|
+
|
413
429
|
|
414
430
|
|
415
431
|
python version: 3.7.1
|
3
うまく行かなかったところを説明しました。
test
CHANGED
File without changes
|
test
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
### 前提・実現したいこと
|
2
2
|
|
3
|
-
https://github.com/ShoKosaka/Suumo
|
3
|
+
https://github.com/ShoKosaka/Suumo/blob/master/Scraping.ipynb
|
4
|
-
|
4
|
+
|
5
|
-
を参照して、suumoの家賃情報を取得したいですが、
|
5
|
+
を参照して、suumoの家賃情報を取得したいですが、生成したCSVファイルでは、賃料、管理費、間取りなどの情報が出てこなかったです。なぜでしょうか。
|
6
6
|
|
7
7
|
|
8
8
|
|
@@ -394,7 +394,7 @@
|
|
394
394
|
|
395
395
|
#csvファイルとして保存
|
396
396
|
|
397
|
-
suumo_df.to_csv('suumo_a
|
397
|
+
suumo_df.to_csv('suumo_minato.csv', sep = '\t',encoding='utf-16')
|
398
398
|
|
399
399
|
```
|
400
400
|
|
@@ -410,4 +410,6 @@
|
|
410
410
|
|
411
411
|
|
412
412
|
|
413
|
+
|
414
|
+
|
413
415
|
python version: 3.7.1
|
2
全文が張り出されました。
test
CHANGED
File without changes
|
test
CHANGED
@@ -8,403 +8,403 @@
|
|
8
8
|
|
9
9
|
|
10
10
|
|
11
|
-
###
|
11
|
+
### 一応コード全文張り出します。
|
12
12
|
|
13
13
|
|
14
14
|
|
15
15
|
```
|
16
16
|
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
|
21
|
+
from bs4 import BeautifulSoup
|
22
|
+
|
23
|
+
import requests
|
24
|
+
|
25
|
+
import pandas as pd
|
26
|
+
|
27
|
+
from pandas import Series, DataFrame
|
28
|
+
|
29
|
+
import time
|
30
|
+
|
31
|
+
import urllib
|
32
|
+
|
33
|
+
import urllib.request
|
34
|
+
|
35
|
+
|
36
|
+
|
37
|
+
from urllib.request import urlopen
|
38
|
+
|
39
|
+
from bs4 import BeautifulSoup
|
40
|
+
|
41
|
+
from urllib.request import urlopen
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
headers = {"user-agent":"Mozilla/5.0"}
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
url = 'https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=030&bs=040&ta=13&sc=13103&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1'
|
50
|
+
|
51
|
+
result = requests.get(url,headers=headers)
|
52
|
+
|
53
|
+
|
54
|
+
|
55
|
+
|
56
|
+
|
57
|
+
|
58
|
+
|
59
|
+
|
60
|
+
|
61
|
+
|
62
|
+
|
63
|
+
#HTMLを元に、オブジェクトを作る
|
64
|
+
|
65
|
+
soup = BeautifulSoup(result.content,'html.parser')
|
66
|
+
|
67
|
+
print(soup.title)
|
68
|
+
|
69
|
+
#物件リストの部分を切り出し
|
70
|
+
|
71
|
+
summary = soup.find("div",{'id':'js-bukkenList'})
|
72
|
+
|
73
|
+
|
74
|
+
|
17
|
-
|
75
|
+
#ページ数を取得
|
76
|
+
|
77
|
+
body = soup.find("body")
|
78
|
+
|
79
|
+
pages = body.find_all("div",{'class':'pagination pagination_set-nav'})
|
80
|
+
|
81
|
+
pages_text = str(pages)
|
82
|
+
|
83
|
+
pages_split = pages_text.split('</a></li>\n</ol>')
|
84
|
+
|
85
|
+
pages_split0 = pages_split[0]
|
86
|
+
|
87
|
+
pages_split1 = pages_split0[-3:]
|
88
|
+
|
89
|
+
pages_split2 = pages_split1.replace('>','')
|
90
|
+
|
91
|
+
pages_split3 = int(pages_split2)
|
92
|
+
|
93
|
+
|
94
|
+
|
95
|
+
#URLを入れるリスト
|
96
|
+
|
97
|
+
urls = []
|
98
|
+
|
99
|
+
|
100
|
+
|
101
|
+
#1ページ目を格納
|
102
|
+
|
103
|
+
urls.append(url)
|
104
|
+
|
105
|
+
|
106
|
+
|
107
|
+
#2ページ目から最後のページまでを格納
|
108
|
+
|
109
|
+
for i in range(pages_split3-1):
|
110
|
+
|
111
|
+
pg = str(i+2)
|
112
|
+
|
113
|
+
url_page = url + '&pn=' + pg
|
114
|
+
|
115
|
+
urls.append(url_page)
|
116
|
+
|
117
|
+
|
118
|
+
|
119
|
+
name = [] #マンション名
|
120
|
+
|
121
|
+
address = [] #住所
|
122
|
+
|
123
|
+
locations0 = [] #立地1つ目(最寄駅/徒歩~分)
|
124
|
+
|
125
|
+
locations1 = [] #立地2つ目(最寄駅/徒歩~分)
|
126
|
+
|
127
|
+
locations2 = [] #立地3つ目(最寄駅/徒歩~分)
|
128
|
+
|
129
|
+
age = [] #築年数
|
130
|
+
|
131
|
+
height = [] #建物高さ
|
132
|
+
|
133
|
+
floor = [] #階
|
134
|
+
|
135
|
+
rent = [] #賃料
|
136
|
+
|
137
|
+
admin = [] #管理費
|
138
|
+
|
139
|
+
others = [] #敷/礼/保証/敷引,償却
|
140
|
+
|
141
|
+
floor_plan = [] #間取り
|
142
|
+
|
143
|
+
area = [] #専有面積
|
144
|
+
|
145
|
+
|
146
|
+
|
147
|
+
#各ページで以下の動作をループ
|
148
|
+
|
149
|
+
for url in urls:
|
150
|
+
|
151
|
+
#物件リストを切り出し
|
152
|
+
|
153
|
+
c = urllib.request.urlopen(url)
|
154
|
+
|
155
|
+
soup = BeautifulSoup(c,'html.parser')
|
156
|
+
|
157
|
+
summary = soup.find("div",{'id':'js-bukkenList'})
|
158
|
+
|
159
|
+
|
160
|
+
|
161
|
+
#マンション名、住所、立地(最寄駅/徒歩~分)、築年数、建物高さが入っているcassetteitemを全て抜き出し
|
162
|
+
|
163
|
+
cassetteitems = summary.find_all("div",{'class':'cassetteitem'})
|
164
|
+
|
165
|
+
|
166
|
+
|
167
|
+
#各cassetteitemsに対し、以下の動作をループ
|
168
|
+
|
169
|
+
for i in range(len(cassetteitems)):
|
170
|
+
|
171
|
+
#各建物から売りに出ている部屋数を取得
|
172
|
+
|
173
|
+
tbodies = cassetteitems[i].find_all('tbody')
|
174
|
+
|
175
|
+
|
176
|
+
|
177
|
+
#マンション名取得
|
178
|
+
|
179
|
+
subtitle = cassetteitems[i].find_all("div",{
|
180
|
+
|
181
|
+
'class':'cassetteitem_content-title'})
|
182
|
+
|
183
|
+
subtitle = str(subtitle)
|
184
|
+
|
185
|
+
subtitle_rep = subtitle.replace(
|
186
|
+
|
187
|
+
'[<div class="cassetteitem_content-title">', '')
|
188
|
+
|
189
|
+
subtitle_rep2 = subtitle_rep.replace(
|
190
|
+
|
191
|
+
'</div>]', '')
|
192
|
+
|
193
|
+
|
194
|
+
|
195
|
+
#住所取得
|
196
|
+
|
197
|
+
subaddress = cassetteitems[i].find_all("li",{
|
198
|
+
|
199
|
+
'class':'cassetteitem_detail-col1'})
|
200
|
+
|
201
|
+
subaddress = str(subaddress)
|
202
|
+
|
203
|
+
subaddress_rep = subaddress.replace(
|
204
|
+
|
205
|
+
'[<li class="cassetteitem_detail-col1">', '')
|
206
|
+
|
207
|
+
subaddress_rep2 = subaddress_rep.replace(
|
208
|
+
|
209
|
+
'</li>]', '')
|
210
|
+
|
211
|
+
|
212
|
+
|
213
|
+
#部屋数だけ、マンション名と住所を繰り返しリストに格納(部屋情報と数を合致させるため)
|
214
|
+
|
215
|
+
for y in range(len(tbodies)):
|
216
|
+
|
217
|
+
name.append(subtitle_rep2)
|
218
|
+
|
219
|
+
address.append(subaddress_rep2)
|
220
|
+
|
221
|
+
|
222
|
+
|
223
|
+
#立地を取得
|
224
|
+
|
225
|
+
sublocations = cassetteitems[i].find_all("li",{
|
226
|
+
|
227
|
+
'class':'cassetteitem_detail-col2'})
|
228
|
+
|
229
|
+
|
230
|
+
|
231
|
+
#立地は、1つ目から3つ目までを取得(4つ目以降は無視)
|
232
|
+
|
233
|
+
for x in sublocations:
|
234
|
+
|
235
|
+
cols = x.find_all('div')
|
236
|
+
|
237
|
+
for i in range(len(cols)):
|
238
|
+
|
239
|
+
text = cols[i].find(text=True)
|
240
|
+
|
241
|
+
for y in range(len(tbodies)):
|
242
|
+
|
243
|
+
if i == 0:
|
244
|
+
|
245
|
+
locations0.append(text)
|
246
|
+
|
247
|
+
elif i == 1:
|
248
|
+
|
249
|
+
locations1.append(text)
|
250
|
+
|
251
|
+
elif i == 2:
|
252
|
+
|
253
|
+
locations2.append(text)
|
254
|
+
|
255
|
+
|
256
|
+
|
257
|
+
#築年数と建物高さを取得
|
258
|
+
|
259
|
+
tbodies = cassetteitems[i].find_all('tbody')
|
260
|
+
|
261
|
+
col3 = cassetteitems[i].find_all("li",{
|
262
|
+
|
263
|
+
'class':'cassetteitem_detail-col3'})
|
264
|
+
|
265
|
+
for x in col3:
|
266
|
+
|
267
|
+
cols = x.find_all('div')
|
268
|
+
|
269
|
+
for i in range(len(cols)):
|
270
|
+
|
271
|
+
text = cols[i].find(text=True)
|
272
|
+
|
273
|
+
for y in range(len(tbodies)):
|
274
|
+
|
275
|
+
if i == 0:
|
276
|
+
|
277
|
+
age.append(text)
|
278
|
+
|
279
|
+
else:
|
280
|
+
|
281
|
+
height.append(text)
|
282
|
+
|
283
|
+
|
284
|
+
|
285
|
+
#階、賃料、管理費、敷/礼/保証/敷引,償却、間取り、専有面積が入っているtableを全て抜き出し
|
286
|
+
|
287
|
+
tables = summary.find_all('table')
|
288
|
+
|
289
|
+
|
290
|
+
|
291
|
+
#各建物(table)に対して、売りに出ている部屋(row)を取得
|
292
|
+
|
293
|
+
rows = []
|
294
|
+
|
295
|
+
for i in range(len(tables)):
|
296
|
+
|
297
|
+
rows.append(tables[i].find_all('tr'))
|
298
|
+
|
299
|
+
|
300
|
+
|
301
|
+
#各部屋に対して、tableに入っているtext情報を取得し、dataリストに格納
|
302
|
+
|
303
|
+
data = []
|
304
|
+
|
305
|
+
for row in rows:
|
306
|
+
|
307
|
+
for tr in row:
|
308
|
+
|
309
|
+
cols = tr.find_all('td')
|
310
|
+
|
311
|
+
for td in cols:
|
312
|
+
|
313
|
+
text = td.find(text=True)
|
314
|
+
|
315
|
+
data.append(text)
|
316
|
+
|
317
|
+
|
318
|
+
|
319
|
+
#dataリストから、階、賃料、管理費、敷/礼/保証/敷引,償却、間取り、専有面積を順番に取り出す
|
320
|
+
|
321
|
+
index = 0
|
322
|
+
|
323
|
+
for item in data:
|
324
|
+
|
325
|
+
if '階' in item:
|
326
|
+
|
327
|
+
floor.append(data[index])
|
328
|
+
|
329
|
+
rent.append(data[index+1])
|
330
|
+
|
331
|
+
admin.append(data[index+2])
|
332
|
+
|
333
|
+
others.append(data[index+3])
|
334
|
+
|
335
|
+
floor_plan.append(data[index+4])
|
336
|
+
|
337
|
+
area.append(data[index+5])
|
338
|
+
|
339
|
+
index +=1
|
340
|
+
|
341
|
+
|
342
|
+
|
343
|
+
#プログラムを10秒間停止する(スクレイピングマナー)
|
344
|
+
|
345
|
+
time.sleep(10)
|
346
|
+
|
347
|
+
|
348
|
+
|
349
|
+
#各リストをシリーズ化
|
350
|
+
|
351
|
+
name = Series(name)
|
352
|
+
|
353
|
+
address = Series(address)
|
354
|
+
|
355
|
+
locations0 = Series(locations0)
|
356
|
+
|
357
|
+
locations1 = Series(locations1)
|
358
|
+
|
359
|
+
locations2 = Series(locations2)
|
360
|
+
|
361
|
+
age = Series(age)
|
362
|
+
|
363
|
+
height = Series(height)
|
364
|
+
|
365
|
+
floor = Series(floor)
|
366
|
+
|
367
|
+
rent = Series(rent)
|
368
|
+
|
369
|
+
admin = Series(admin)
|
370
|
+
|
371
|
+
others = Series(others)
|
372
|
+
|
373
|
+
floor_plan = Series(floor_plan)
|
374
|
+
|
375
|
+
area = Series(area)
|
376
|
+
|
377
|
+
|
378
|
+
|
379
|
+
#各シリーズをデータフレーム化
|
380
|
+
|
381
|
+
suumo_df = pd.concat([name, address, locations0, locations1,
|
382
|
+
|
383
|
+
locations2, age, height,floor,rent,admin,others,floor_plan,area],axis=1)
|
384
|
+
|
385
|
+
|
386
|
+
|
387
|
+
#カラム名
|
388
|
+
|
389
|
+
suumo_df.columns=['マンション名','住所','立地1','立地2','立地3','築年数','建物高さ','階','賃料','管理費',
|
390
|
+
|
391
|
+
'敷/礼/保証/敷引,償却','間取り','専有面積']
|
392
|
+
|
393
|
+
|
394
|
+
|
395
|
+
#csvファイルとして保存
|
396
|
+
|
397
|
+
suumo_df.to_csv('suumo_adachi.csv', sep = '\t',encoding='utf-16')
|
18
398
|
|
19
399
|
```
|
20
400
|
|
21
401
|
|
22
402
|
|
23
|
-
|
403
|
+
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
404
|
+
|
32
|
-
|
33
|
-
|
405
|
+
|
34
|
-
|
35
|
-
|
406
|
+
|
36
|
-
|
37
|
-
|
407
|
+
|
38
|
-
|
39
|
-
import time
|
40
|
-
|
41
|
-
import urllib
|
42
|
-
|
43
|
-
import urllib.request
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
from urllib.request import urlopen
|
48
|
-
|
49
|
-
from bs4 import BeautifulSoup
|
50
|
-
|
51
|
-
from urllib.request import urlopen
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
headers = {"user-agent":"Mozilla/5.0"}
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
url = 'https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=030&bs=040&ta=13&sc=13103&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1'
|
60
|
-
|
61
|
-
result = requests.get(url,headers=headers)
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
soup = BeautifulSoup(result.content,'html.parser')
|
76
|
-
|
77
|
-
print(soup.title)
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
summary = soup.find("div",{'id':'js-bukkenList'})
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
body = soup.find("body")
|
88
|
-
|
89
|
-
pages = body.find_all("div",{'class':'pagination pagination_set-nav'})
|
90
|
-
|
91
|
-
pages_text = str(pages)
|
92
|
-
|
93
|
-
pages_split = pages_text.split('</a></li>\n</ol>')
|
94
|
-
|
95
|
-
pages_split0 = pages_split[0]
|
96
|
-
|
97
|
-
pages_split1 = pages_split0[-3:]
|
98
|
-
|
99
|
-
pages_split2 = pages_split1.replace('>','')
|
100
|
-
|
101
|
-
pages_split3 = int(pages_split2)
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
urls = []
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
urls.append(url)
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
for i in range(pages_split3-1):
|
120
|
-
|
121
|
-
pg = str(i+2)
|
122
|
-
|
123
|
-
url_page = url + '&pn=' + pg
|
124
|
-
|
125
|
-
urls.append(url_page)
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
name = []
|
130
|
-
|
131
|
-
address = []
|
132
|
-
|
133
|
-
locations0 = []
|
134
|
-
|
135
|
-
locations1 = []
|
136
|
-
|
137
|
-
locations2 = []
|
138
|
-
|
139
|
-
age = []
|
140
|
-
|
141
|
-
height = []
|
142
|
-
|
143
|
-
floor = []
|
144
|
-
|
145
|
-
rent = []
|
146
|
-
|
147
|
-
admin = []
|
148
|
-
|
149
|
-
others = []
|
150
|
-
|
151
|
-
floor_plan = []
|
152
|
-
|
153
|
-
area = []
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
for url in urls:
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
c = urllib.request.urlopen(url)
|
164
|
-
|
165
|
-
soup = BeautifulSoup(c,'html.parser')
|
166
|
-
|
167
|
-
summary = soup.find("div",{'id':'js-bukkenList'})
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
cassetteitems = summary.find_all("div",{'class':'cassetteitem'})
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
for i in range(len(cassetteitems)):
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
tbodies = cassetteitems[i].find_all('tbody')
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
subtitle = cassetteitems[i].find_all("div",{
|
190
|
-
|
191
|
-
'class':'cassetteitem_content-title'})
|
192
|
-
|
193
|
-
subtitle = str(subtitle)
|
194
|
-
|
195
|
-
subtitle_rep = subtitle.replace(
|
196
|
-
|
197
|
-
'[<div class="cassetteitem_content-title">', '')
|
198
|
-
|
199
|
-
subtitle_rep2 = subtitle_rep.replace(
|
200
|
-
|
201
|
-
'</div>]', '')
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
subaddress = cassetteitems[i].find_all("li",{
|
208
|
-
|
209
|
-
'class':'cassetteitem_detail-col1'})
|
210
|
-
|
211
|
-
subaddress = str(subaddress)
|
212
|
-
|
213
|
-
subaddress_rep = subaddress.replace(
|
214
|
-
|
215
|
-
'[<li class="cassetteitem_detail-col1">', '')
|
216
|
-
|
217
|
-
subaddress_rep2 = subaddress_rep.replace(
|
218
|
-
|
219
|
-
'</li>]', '')
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
for y in range(len(tbodies)):
|
226
|
-
|
227
|
-
name.append(subtitle_rep2)
|
228
|
-
|
229
|
-
address.append(subaddress_rep2)
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
sublocations = cassetteitems[i].find_all("li",{
|
236
|
-
|
237
|
-
'class':'cassetteitem_detail-col2'})
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
for x in sublocations:
|
244
|
-
|
245
|
-
cols = x.find_all('div')
|
246
|
-
|
247
|
-
for i in range(len(cols)):
|
248
|
-
|
249
|
-
text = cols[i].find(text=True)
|
250
|
-
|
251
|
-
for y in range(len(tbodies)):
|
252
|
-
|
253
|
-
if i == 0:
|
254
|
-
|
255
|
-
locations0.append(text)
|
256
|
-
|
257
|
-
elif i == 1:
|
258
|
-
|
259
|
-
locations1.append(text)
|
260
|
-
|
261
|
-
elif i == 2:
|
262
|
-
|
263
|
-
locations2.append(text)
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
tbodies = cassetteitems[i].find_all('tbody')
|
270
|
-
|
271
|
-
col3 = cassetteitems[i].find_all("li",{
|
272
|
-
|
273
|
-
'class':'cassetteitem_detail-col3'})
|
274
|
-
|
275
|
-
for x in col3:
|
276
|
-
|
277
|
-
cols = x.find_all('div')
|
278
|
-
|
279
|
-
for i in range(len(cols)):
|
280
|
-
|
281
|
-
text = cols[i].find(text=True)
|
282
|
-
|
283
|
-
for y in range(len(tbodies)):
|
284
|
-
|
285
|
-
if i == 0:
|
286
|
-
|
287
|
-
age.append(text)
|
288
|
-
|
289
|
-
else:
|
290
|
-
|
291
|
-
height.append(text)
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
tables = summary.find_all('table')
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
rows = []
|
304
|
-
|
305
|
-
for i in range(len(tables)):
|
306
|
-
|
307
|
-
rows.append(tables[i].find_all('tr'))
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
data = []
|
314
|
-
|
315
|
-
for row in rows:
|
316
|
-
|
317
|
-
for tr in row:
|
318
|
-
|
319
|
-
cols = tr.find_all('td')
|
320
|
-
|
321
|
-
for td in cols:
|
322
|
-
|
323
|
-
text = td.find(text=True)
|
324
|
-
|
325
|
-
data.append(text)
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
index = 0
|
332
|
-
|
333
|
-
for item in data:
|
334
|
-
|
335
|
-
if '階' in item:
|
336
|
-
|
337
|
-
floor.append(data[index])
|
338
|
-
|
339
|
-
rent.append(data[index+1])
|
340
|
-
|
341
|
-
admin.append(data[index+2])
|
342
|
-
|
343
|
-
others.append(data[index+3])
|
344
|
-
|
345
|
-
floor_plan.append(data[index+4])
|
346
|
-
|
347
|
-
area.append(data[index+5])
|
348
|
-
|
349
|
-
index +=1
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
time.sleep(10)
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
name = Series(name)
|
362
|
-
|
363
|
-
address = Series(address)
|
364
|
-
|
365
|
-
locations0 = Series(locations0)
|
366
|
-
|
367
|
-
locations1 = Series(locations1)
|
368
|
-
|
369
|
-
locations2 = Series(locations2)
|
370
|
-
|
371
|
-
age = Series(age)
|
372
|
-
|
373
|
-
height = Series(height)
|
374
|
-
|
375
|
-
floor = Series(floor)
|
376
|
-
|
377
|
-
rent = Series(rent)
|
378
|
-
|
379
|
-
admin = Series(admin)
|
380
|
-
|
381
|
-
others = Series(others)
|
382
|
-
|
383
|
-
floor_plan = Series(floor_plan)
|
384
|
-
|
385
|
-
area = Series(area)
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
suumo_df = pd.concat([name, address, locations0, locations1,
|
392
|
-
|
393
|
-
locations2, age, height,floor,rent,admin,others,floor_plan,area],axis=1)
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
suumo_df.columns=['マンション名','住所','立地1','立地2','立地3','築年数','建物高さ','階','賃料','管理費',
|
400
|
-
|
401
|
-
'敷/礼/保証/敷引,償却','間取り','専有面積']
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
suumo_df.to_csv('suumo_adachi.csv', sep = '\t',encoding='utf-16')
|
408
408
|
|
409
409
|
|
410
410
|
|
1
いらない内容を消しました。
test
CHANGED
File without changes
|
test
CHANGED
@@ -26,7 +26,7 @@
|
|
26
26
|
|
27
27
|
|
28
28
|
|
29
|
-
|
29
|
+
|
30
30
|
|
31
31
|
from bs4 import BeautifulSoup
|
32
32
|
|
@@ -70,19 +70,19 @@
|
|
70
70
|
|
71
71
|
|
72
72
|
|
73
|
-
|
73
|
+
|
74
74
|
|
75
75
|
soup = BeautifulSoup(result.content,'html.parser')
|
76
76
|
|
77
77
|
print(soup.title)
|
78
78
|
|
79
|
-
|
79
|
+
|
80
80
|
|
81
81
|
summary = soup.find("div",{'id':'js-bukkenList'})
|
82
82
|
|
83
83
|
|
84
84
|
|
85
|
-
|
85
|
+
|
86
86
|
|
87
87
|
body = soup.find("body")
|
88
88
|
|
@@ -102,19 +102,19 @@
|
|
102
102
|
|
103
103
|
|
104
104
|
|
105
|
-
|
105
|
+
|
106
106
|
|
107
107
|
urls = []
|
108
108
|
|
109
109
|
|
110
110
|
|
111
|
-
|
111
|
+
|
112
112
|
|
113
113
|
urls.append(url)
|
114
114
|
|
115
115
|
|
116
116
|
|
117
|
-
|
117
|
+
|
118
118
|
|
119
119
|
for i in range(pages_split3-1):
|
120
120
|
|
@@ -126,39 +126,39 @@
|
|
126
126
|
|
127
127
|
|
128
128
|
|
129
|
-
name = []
|
129
|
+
name = []
|
130
|
-
|
130
|
+
|
131
|
-
address = []
|
131
|
+
address = []
|
132
|
-
|
132
|
+
|
133
|
-
locations0 = []
|
133
|
+
locations0 = []
|
134
|
-
|
134
|
+
|
135
|
-
locations1 = []
|
135
|
+
locations1 = []
|
136
|
-
|
136
|
+
|
137
|
-
locations2 = []
|
137
|
+
locations2 = []
|
138
|
-
|
138
|
+
|
139
|
-
age = []
|
139
|
+
age = []
|
140
|
-
|
140
|
+
|
141
|
-
height = []
|
141
|
+
height = []
|
142
|
-
|
142
|
+
|
143
|
-
floor = []
|
143
|
+
floor = []
|
144
|
-
|
144
|
+
|
145
|
-
rent = []
|
145
|
+
rent = []
|
146
|
-
|
146
|
+
|
147
|
-
admin = []
|
147
|
+
admin = []
|
148
|
-
|
148
|
+
|
149
|
-
others = []
|
149
|
+
others = []
|
150
|
-
|
150
|
+
|
151
|
-
floor_plan = []
|
151
|
+
floor_plan = []
|
152
|
-
|
152
|
+
|
153
|
-
area = []
|
153
|
+
area = []
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
154
|
+
|
155
|
+
|
156
|
+
|
157
|
+
|
158
158
|
|
159
159
|
for url in urls:
|
160
160
|
|
161
|
-
|
161
|
+
|
162
162
|
|
163
163
|
c = urllib.request.urlopen(url)
|
164
164
|
|
@@ -168,23 +168,23 @@
|
|
168
168
|
|
169
169
|
|
170
170
|
|
171
|
-
|
171
|
+
|
172
172
|
|
173
173
|
cassetteitems = summary.find_all("div",{'class':'cassetteitem'})
|
174
174
|
|
175
175
|
|
176
176
|
|
177
|
-
|
177
|
+
|
178
178
|
|
179
179
|
for i in range(len(cassetteitems)):
|
180
180
|
|
181
|
-
|
181
|
+
|
182
182
|
|
183
183
|
tbodies = cassetteitems[i].find_all('tbody')
|
184
184
|
|
185
185
|
|
186
186
|
|
187
|
-
|
187
|
+
|
188
188
|
|
189
189
|
subtitle = cassetteitems[i].find_all("div",{
|
190
190
|
|
@@ -202,7 +202,7 @@
|
|
202
202
|
|
203
203
|
|
204
204
|
|
205
|
-
|
205
|
+
|
206
206
|
|
207
207
|
subaddress = cassetteitems[i].find_all("li",{
|
208
208
|
|
@@ -220,7 +220,7 @@
|
|
220
220
|
|
221
221
|
|
222
222
|
|
223
|
-
|
223
|
+
|
224
224
|
|
225
225
|
for y in range(len(tbodies)):
|
226
226
|
|
@@ -230,7 +230,7 @@
|
|
230
230
|
|
231
231
|
|
232
232
|
|
233
|
-
|
233
|
+
|
234
234
|
|
235
235
|
sublocations = cassetteitems[i].find_all("li",{
|
236
236
|
|
@@ -238,7 +238,7 @@
|
|
238
238
|
|
239
239
|
|
240
240
|
|
241
|
-
|
241
|
+
|
242
242
|
|
243
243
|
for x in sublocations:
|
244
244
|
|
@@ -264,7 +264,7 @@
|
|
264
264
|
|
265
265
|
|
266
266
|
|
267
|
-
|
267
|
+
|
268
268
|
|
269
269
|
tbodies = cassetteitems[i].find_all('tbody')
|
270
270
|
|
@@ -292,13 +292,13 @@
|
|
292
292
|
|
293
293
|
|
294
294
|
|
295
|
-
|
295
|
+
|
296
296
|
|
297
297
|
tables = summary.find_all('table')
|
298
298
|
|
299
299
|
|
300
300
|
|
301
|
-
|
301
|
+
|
302
302
|
|
303
303
|
rows = []
|
304
304
|
|
@@ -308,7 +308,7 @@
|
|
308
308
|
|
309
309
|
|
310
310
|
|
311
|
-
|
311
|
+
|
312
312
|
|
313
313
|
data = []
|
314
314
|
|
@@ -326,7 +326,7 @@
|
|
326
326
|
|
327
327
|
|
328
328
|
|
329
|
-
|
329
|
+
|
330
330
|
|
331
331
|
index = 0
|
332
332
|
|
@@ -350,13 +350,13 @@
|
|
350
350
|
|
351
351
|
|
352
352
|
|
353
|
-
|
353
|
+
|
354
354
|
|
355
355
|
time.sleep(10)
|
356
356
|
|
357
357
|
|
358
358
|
|
359
|
-
|
359
|
+
|
360
360
|
|
361
361
|
name = Series(name)
|
362
362
|
|
@@ -386,7 +386,7 @@
|
|
386
386
|
|
387
387
|
|
388
388
|
|
389
|
-
|
389
|
+
|
390
390
|
|
391
391
|
suumo_df = pd.concat([name, address, locations0, locations1,
|
392
392
|
|
@@ -394,7 +394,7 @@
|
|
394
394
|
|
395
395
|
|
396
396
|
|
397
|
-
|
397
|
+
|
398
398
|
|
399
399
|
suumo_df.columns=['マンション名','住所','立地1','立地2','立地3','築年数','建物高さ','階','賃料','管理費',
|
400
400
|
|
@@ -402,7 +402,7 @@
|
|
402
402
|
|
403
403
|
|
404
404
|
|
405
|
-
|
405
|
+
|
406
406
|
|
407
407
|
suumo_df.to_csv('suumo_adachi.csv', sep = '\t',encoding='utf-16')
|
408
408
|
|