質問編集履歴
4
表の追加
title
CHANGED
File without changes
|
body
CHANGED
@@ -204,5 +204,13 @@
|
|
204
204
|
|
205
205
|
|
206
206
|
|
207
|
+
CSVファイルではこういう風に出てきました。
|
207
208
|
|
209
|
+
|
210
|
+
![CSVファイルの出力結果](17a78b7dd10e37441d3197967c5cf92a.png)
|
211
|
+
|
212
|
+
|
213
|
+
|
214
|
+
|
215
|
+
|
208
216
|
python version: 3.7.1
|
3
うまく行かなかったところを説明しました。
title
CHANGED
File without changes
|
body
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
### 前提・実現したいこと
|
2
|
-
https://github.com/ShoKosaka/Suumo
|
2
|
+
https://github.com/ShoKosaka/Suumo/blob/master/Scraping.ipynb
|
3
|
-
を参照して、SUUMOの家賃情報を取得したいのですが、
|
3
|
+
を参照して、SUUMOの家賃情報を取得したいのですが、生成したCSVファイルには、賃料、管理費、間取りなどの情報が出力されませんでした。なぜでしょうか。
|
4
4
|
|
5
5
|
|
6
6
|
### 念のため、コード全文を貼り付けます。
|
@@ -196,7 +196,7 @@
|
|
196
196
|
'敷/礼/保証/敷引,償却','間取り','専有面積']
|
197
197
|
|
198
198
|
#csvファイルとして保存
|
199
|
-
suumo_df.to_csv('
|
199
|
+
suumo_df.to_csv('suumo_minato.csv', sep = '\t',encoding='utf-16')
|
200
200
|
```
|
201
201
|
|
202
202
|
|
@@ -204,4 +204,5 @@
|
|
204
204
|
|
205
205
|
|
206
206
|
|
207
|
+
|
207
208
|
python version: 3.7.1
|
2
コード全文を貼り付けました。
title
CHANGED
File without changes
|
body
CHANGED
@@ -3,16 +3,11 @@
|
|
3
3
|
を参照して、SUUMOの家賃情報を取得したいのですが、うまくいきませんでした。賃料、管理費などの情報が出力されませんでした。なぜでしょうか。
|
4
4
|
|
5
5
|
|
6
|
-
###
|
6
|
+
### 念のため、コード全文を貼り付けます。
|
7
7
|
|
8
8
|
```
|
9
|
-
エラーメッセージ
|
10
|
-
```
|
11
9
|
|
12
|
-
### 該当のソースコード
|
13
10
|
|
14
|
-
|
15
|
-
|
16
11
|
from bs4 import BeautifulSoup
|
17
12
|
import requests
|
18
13
|
import pandas as pd
|
@@ -34,13 +29,13 @@
|
|
34
29
|
|
35
30
|
|
36
31
|
|
37
|
-
|
32
|
+
#HTMLを元に、オブジェクトを作る
|
38
33
|
soup = BeautifulSoup(result.content,'html.parser')
|
39
34
|
print(soup.title)
|
40
|
-
|
35
|
+
#物件リストの部分を切り出し
|
41
36
|
summary = soup.find("div",{'id':'js-bukkenList'})
|
42
37
|
|
43
|
-
|
38
|
+
#ページ数を取得
|
44
39
|
body = soup.find("body")
|
45
40
|
pages = body.find_all("div",{'class':'pagination pagination_set-nav'})
|
46
41
|
pages_text = str(pages)
|
@@ -50,48 +45,48 @@
|
|
50
45
|
pages_split2 = pages_split1.replace('>','')
|
51
46
|
pages_split3 = int(pages_split2)
|
52
47
|
|
53
|
-
|
48
|
+
#URLを入れるリスト
|
54
49
|
urls = []
|
55
50
|
|
56
|
-
|
51
|
+
#1ページ目を格納
|
57
52
|
urls.append(url)
|
58
53
|
|
59
|
-
|
54
|
+
#2ページ目から最後のページまでを格納
|
60
55
|
for i in range(pages_split3-1):
|
61
56
|
pg = str(i+2)
|
62
57
|
url_page = url + '&pn=' + pg
|
63
58
|
urls.append(url_page)
|
64
59
|
|
65
|
-
name = []
|
60
|
+
name = [] #マンション名
|
66
|
-
address = []
|
61
|
+
address = [] #住所
|
67
|
-
locations0 = []
|
62
|
+
locations0 = [] #立地1つ目(最寄駅/徒歩~分)
|
68
|
-
locations1 = []
|
63
|
+
locations1 = [] #立地2つ目(最寄駅/徒歩~分)
|
69
|
-
locations2 = []
|
64
|
+
locations2 = [] #立地3つ目(最寄駅/徒歩~分)
|
70
|
-
age = []
|
65
|
+
age = [] #築年数
|
71
|
-
height = []
|
66
|
+
height = [] #建物高さ
|
72
|
-
floor = []
|
67
|
+
floor = [] #階
|
73
|
-
rent = []
|
68
|
+
rent = [] #賃料
|
74
|
-
admin = []
|
69
|
+
admin = [] #管理費
|
75
|
-
others = []
|
70
|
+
others = [] #敷/礼/保証/敷引,償却
|
76
|
-
floor_plan = []
|
71
|
+
floor_plan = [] #間取り
|
77
|
-
area = []
|
72
|
+
area = [] #専有面積
|
78
73
|
|
79
|
-
|
74
|
+
#各ページで以下の動作をループ
|
80
75
|
for url in urls:
|
81
|
-
|
76
|
+
#物件リストを切り出し
|
82
77
|
c = urllib.request.urlopen(url)
|
83
78
|
soup = BeautifulSoup(c,'html.parser')
|
84
79
|
summary = soup.find("div",{'id':'js-bukkenList'})
|
85
80
|
|
86
|
-
|
81
|
+
#マンション名、住所、立地(最寄駅/徒歩~分)、築年数、建物高さが入っているcassetteitemを全て抜き出し
|
87
82
|
cassetteitems = summary.find_all("div",{'class':'cassetteitem'})
|
88
83
|
|
89
|
-
|
84
|
+
#各cassetteitemsに対し、以下の動作をループ
|
90
85
|
for i in range(len(cassetteitems)):
|
91
|
-
|
86
|
+
#各建物から売りに出ている部屋数を取得
|
92
87
|
tbodies = cassetteitems[i].find_all('tbody')
|
93
88
|
|
94
|
-
|
89
|
+
#マンション名取得
|
95
90
|
subtitle = cassetteitems[i].find_all("div",{
|
96
91
|
'class':'cassetteitem_content-title'})
|
97
92
|
subtitle = str(subtitle)
|
@@ -100,7 +95,7 @@
|
|
100
95
|
subtitle_rep2 = subtitle_rep.replace(
|
101
96
|
'</div>]', '')
|
102
97
|
|
103
|
-
|
98
|
+
#住所取得
|
104
99
|
subaddress = cassetteitems[i].find_all("li",{
|
105
100
|
'class':'cassetteitem_detail-col1'})
|
106
101
|
subaddress = str(subaddress)
|
@@ -109,16 +104,16 @@
|
|
109
104
|
subaddress_rep2 = subaddress_rep.replace(
|
110
105
|
'</li>]', '')
|
111
106
|
|
112
|
-
|
107
|
+
#部屋数だけ、マンション名と住所を繰り返しリストに格納(部屋情報と数を合致させるため)
|
113
108
|
for y in range(len(tbodies)):
|
114
109
|
name.append(subtitle_rep2)
|
115
110
|
address.append(subaddress_rep2)
|
116
111
|
|
117
|
-
|
112
|
+
#立地を取得
|
118
113
|
sublocations = cassetteitems[i].find_all("li",{
|
119
114
|
'class':'cassetteitem_detail-col2'})
|
120
115
|
|
121
|
-
|
116
|
+
#立地は、1つ目から3つ目までを取得(4つ目以降は無視)
|
122
117
|
for x in sublocations:
|
123
118
|
cols = x.find_all('div')
|
124
119
|
for i in range(len(cols)):
|
@@ -131,7 +126,7 @@
|
|
131
126
|
elif i == 2:
|
132
127
|
locations2.append(text)
|
133
128
|
|
134
|
-
|
129
|
+
#築年数と建物高さを取得
|
135
130
|
tbodies = cassetteitems[i].find_all('tbody')
|
136
131
|
col3 = cassetteitems[i].find_all("li",{
|
137
132
|
'class':'cassetteitem_detail-col3'})
|
@@ -145,15 +140,15 @@
|
|
145
140
|
else:
|
146
141
|
height.append(text)
|
147
142
|
|
148
|
-
|
143
|
+
#階、賃料、管理費、敷/礼/保証/敷引,償却、間取り、専有面積が入っているtableを全て抜き出し
|
149
144
|
tables = summary.find_all('table')
|
150
145
|
|
151
|
-
|
146
|
+
#各建物(table)に対して、売りに出ている部屋(row)を取得
|
152
147
|
rows = []
|
153
148
|
for i in range(len(tables)):
|
154
149
|
rows.append(tables[i].find_all('tr'))
|
155
150
|
|
156
|
-
|
151
|
+
#各部屋に対して、tableに入っているtext情報を取得し、dataリストに格納
|
157
152
|
data = []
|
158
153
|
for row in rows:
|
159
154
|
for tr in row:
|
@@ -162,7 +157,7 @@
|
|
162
157
|
text = td.find(text=True)
|
163
158
|
data.append(text)
|
164
159
|
|
165
|
-
|
160
|
+
#dataリストから、階、賃料、管理費、敷/礼/保証/敷引,償却、間取り、専有面積を順番に取り出す
|
166
161
|
index = 0
|
167
162
|
for item in data:
|
168
163
|
if '階' in item:
|
@@ -174,10 +169,10 @@
|
|
174
169
|
area.append(data[index+5])
|
175
170
|
index +=1
|
176
171
|
|
177
|
-
|
172
|
+
#プログラムを10秒間停止する(スクレイピングマナー)
|
178
173
|
time.sleep(10)
|
179
174
|
|
180
|
-
|
175
|
+
#各リストをシリーズ化
|
181
176
|
name = Series(name)
|
182
177
|
address = Series(address)
|
183
178
|
locations0 = Series(locations0)
|
@@ -192,16 +187,21 @@
|
|
192
187
|
floor_plan = Series(floor_plan)
|
193
188
|
area = Series(area)
|
194
189
|
|
195
|
-
|
190
|
+
#各シリーズをデータフレーム化
|
196
191
|
suumo_df = pd.concat([name, address, locations0, locations1,
|
197
192
|
locations2, age, height,floor,rent,admin,others,floor_plan,area],axis=1)
|
198
193
|
|
199
|
-
|
194
|
+
#カラム名
|
200
195
|
suumo_df.columns=['マンション名','住所','立地1','立地2','立地3','築年数','建物高さ','階','賃料','管理費',
|
201
196
|
'敷/礼/保証/敷引,償却','間取り','専有面積']
|
202
197
|
|
203
|
-
|
198
|
+
#csvファイルとして保存
|
204
199
|
suumo_df.to_csv('suumo_adachi.csv', sep = '\t',encoding='utf-16')
|
200
|
+
```
|
205
201
|
|
206
202
|
|
203
|
+
|
204
|
+
|
205
|
+
|
206
|
+
|
207
207
|
python version: 3.7.1
|
1
いらない内容を消しました。
title
CHANGED
File without changes
|
body
CHANGED
@@ -12,7 +12,7 @@
|
|
12
12
|
### 該当のソースコード
|
13
13
|
|
14
14
|
|
15
|
-
|
15
|
+
|
16
16
|
from bs4 import BeautifulSoup
|
17
17
|
import requests
|
18
18
|
import pandas as pd
|
@@ -34,13 +34,13 @@
|
|
34
34
|
|
35
35
|
|
36
36
|
|
37
|
-
|
37
|
+
|
38
38
|
soup = BeautifulSoup(result.content,'html.parser')
|
39
39
|
print(soup.title)
|
40
|
-
|
40
|
+
|
41
41
|
summary = soup.find("div",{'id':'js-bukkenList'})
|
42
42
|
|
43
|
-
|
43
|
+
|
44
44
|
body = soup.find("body")
|
45
45
|
pages = body.find_all("div",{'class':'pagination pagination_set-nav'})
|
46
46
|
pages_text = str(pages)
|
@@ -50,48 +50,48 @@
|
|
50
50
|
pages_split2 = pages_split1.replace('>','')
|
51
51
|
pages_split3 = int(pages_split2)
|
52
52
|
|
53
|
-
|
53
|
+
|
54
54
|
urls = []
|
55
55
|
|
56
|
-
|
56
|
+
|
57
57
|
urls.append(url)
|
58
58
|
|
59
|
-
|
59
|
+
|
60
60
|
for i in range(pages_split3-1):
|
61
61
|
pg = str(i+2)
|
62
62
|
url_page = url + '&pn=' + pg
|
63
63
|
urls.append(url_page)
|
64
64
|
|
65
|
-
name = []
|
65
|
+
name = []
|
66
|
-
address = []
|
66
|
+
address = []
|
67
|
-
locations0 = []
|
67
|
+
locations0 = []
|
68
|
-
locations1 = []
|
68
|
+
locations1 = []
|
69
|
-
locations2 = []
|
69
|
+
locations2 = []
|
70
|
-
age = []
|
70
|
+
age = []
|
71
|
-
height = []
|
71
|
+
height = []
|
72
|
-
floor = []
|
72
|
+
floor = []
|
73
|
-
rent = []
|
73
|
+
rent = []
|
74
|
-
admin = []
|
74
|
+
admin = []
|
75
|
-
others = []
|
75
|
+
others = []
|
76
|
-
floor_plan = []
|
76
|
+
floor_plan = []
|
77
|
-
area = []
|
77
|
+
area = []
|
78
78
|
|
79
|
-
|
79
|
+
|
80
80
|
for url in urls:
|
81
|
-
|
81
|
+
|
82
82
|
c = urllib.request.urlopen(url)
|
83
83
|
soup = BeautifulSoup(c,'html.parser')
|
84
84
|
summary = soup.find("div",{'id':'js-bukkenList'})
|
85
85
|
|
86
|
-
|
86
|
+
|
87
87
|
cassetteitems = summary.find_all("div",{'class':'cassetteitem'})
|
88
88
|
|
89
|
-
|
89
|
+
|
90
90
|
for i in range(len(cassetteitems)):
|
91
|
-
|
91
|
+
|
92
92
|
tbodies = cassetteitems[i].find_all('tbody')
|
93
93
|
|
94
|
-
|
94
|
+
|
95
95
|
subtitle = cassetteitems[i].find_all("div",{
|
96
96
|
'class':'cassetteitem_content-title'})
|
97
97
|
subtitle = str(subtitle)
|
@@ -100,7 +100,7 @@
|
|
100
100
|
subtitle_rep2 = subtitle_rep.replace(
|
101
101
|
'</div>]', '')
|
102
102
|
|
103
|
-
|
103
|
+
|
104
104
|
subaddress = cassetteitems[i].find_all("li",{
|
105
105
|
'class':'cassetteitem_detail-col1'})
|
106
106
|
subaddress = str(subaddress)
|
@@ -109,16 +109,16 @@
|
|
109
109
|
subaddress_rep2 = subaddress_rep.replace(
|
110
110
|
'</li>]', '')
|
111
111
|
|
112
|
-
|
112
|
+
|
113
113
|
for y in range(len(tbodies)):
|
114
114
|
name.append(subtitle_rep2)
|
115
115
|
address.append(subaddress_rep2)
|
116
116
|
|
117
|
-
|
117
|
+
|
118
118
|
sublocations = cassetteitems[i].find_all("li",{
|
119
119
|
'class':'cassetteitem_detail-col2'})
|
120
120
|
|
121
|
-
|
121
|
+
|
122
122
|
for x in sublocations:
|
123
123
|
cols = x.find_all('div')
|
124
124
|
for i in range(len(cols)):
|
@@ -131,7 +131,7 @@
|
|
131
131
|
elif i == 2:
|
132
132
|
locations2.append(text)
|
133
133
|
|
134
|
-
|
134
|
+
|
135
135
|
tbodies = cassetteitems[i].find_all('tbody')
|
136
136
|
col3 = cassetteitems[i].find_all("li",{
|
137
137
|
'class':'cassetteitem_detail-col3'})
|
@@ -145,15 +145,15 @@
|
|
145
145
|
else:
|
146
146
|
height.append(text)
|
147
147
|
|
148
|
-
|
148
|
+
|
149
149
|
tables = summary.find_all('table')
|
150
150
|
|
151
|
-
|
151
|
+
|
152
152
|
rows = []
|
153
153
|
for i in range(len(tables)):
|
154
154
|
rows.append(tables[i].find_all('tr'))
|
155
155
|
|
156
|
-
|
156
|
+
|
157
157
|
data = []
|
158
158
|
for row in rows:
|
159
159
|
for tr in row:
|
@@ -162,7 +162,7 @@
|
|
162
162
|
text = td.find(text=True)
|
163
163
|
data.append(text)
|
164
164
|
|
165
|
-
|
165
|
+
|
166
166
|
index = 0
|
167
167
|
for item in data:
|
168
168
|
if '階' in item:
|
@@ -174,10 +174,10 @@
|
|
174
174
|
area.append(data[index+5])
|
175
175
|
index +=1
|
176
176
|
|
177
|
-
|
177
|
+
|
178
178
|
time.sleep(10)
|
179
179
|
|
180
|
-
|
180
|
+
|
181
181
|
name = Series(name)
|
182
182
|
address = Series(address)
|
183
183
|
locations0 = Series(locations0)
|
@@ -192,15 +192,15 @@
|
|
192
192
|
floor_plan = Series(floor_plan)
|
193
193
|
area = Series(area)
|
194
194
|
|
195
|
-
|
195
|
+
|
196
196
|
suumo_df = pd.concat([name, address, locations0, locations1,
|
197
197
|
locations2, age, height,floor,rent,admin,others,floor_plan,area],axis=1)
|
198
198
|
|
199
|
-
|
199
|
+
|
200
200
|
suumo_df.columns=['マンション名','住所','立地1','立地2','立地3','築年数','建物高さ','階','賃料','管理費',
|
201
201
|
'敷/礼/保証/敷引,償却','間取り','専有面積']
|
202
202
|
|
203
|
-
|
203
|
+
|
204
204
|
suumo_df.to_csv('suumo_adachi.csv', sep = '\t',encoding='utf-16')
|
205
205
|
|
206
206
|
|