Question edit history (1 change)

test: file without changes
test: CHANGED
@@ -1,237 +1,7 @@

Added (new body):

When crawling with Scrapy,

I want to follow pages using Rule and LinkExtractor,

but the spider ends up following the links for the pager numbers that are displayed on the page.
Removed (old body):

What I want to achieve

I want the site to be crawled correctly.
Situation

http://777.slopachi-station.com/report_schedule/

I tried to write a spider that collects the coverage schedule from the site above, but it does not seem to be following the links correctly.
The plan is to follow the pager from the page above, visit the shop pages listed on each page, and collect their data.
```Python3
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from slotcrawler.items import Shop_data


class SlotSpider(CrawlSpider):
    name = 'slot'
    allowed_domains = ["777.slopachi-station.com"]
    start_urls = (
        'http://777.slopachi-station.com/report_schedule/',
    )

    rules = [
        # Follow pager links such as /report_schedule/page/2/.
        Rule(LinkExtractor(allow=r'/report_schedule/page/\d+/')),
        # Follow shop detail links and parse each shop page.
        Rule(LinkExtractor(allow=r'/shop_data/\d+/'),
             follow=True,
             callback='parse_hall'),
    ]

    def parse_hall(self, response):
        item = Shop_data(
            name=response.xpath('//*[@id="shopDetail"]/div[1]/h2/text()').extract_first(),
            scedule=response.xpath('//*[@id="report_schedule"]/div[@class="resultRow resultRow-tile"]/text()').extract(),
            add1=response.xpath('//*[@id="breadcrumb"]/a[2]/text()').extract_first(),
            add2=response.xpath('//*[@id="breadcrumb"]/a[3]/text()').extract_first(),
        )
        yield item
```
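For reference: in a CrawlSpider, a Rule without a callback defaults to follow=True, so the first rule above is meant to keep paging while the second one parses the shop pages. Given the FEED_URI and FEED_FORMAT values in the log below, the spider was presumably run with something like `scrapy crawl slot -o hall.jl`.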
```items.py
import scrapy


class Shop_data(scrapy.Item):
    # Each field must be a scrapy.Field() instance; the original had the
    # bare class `scrapy.Field`, which does not register the field, so
    # assignments like Shop_data(name=...) would raise KeyError.
    name = scrapy.Field()
    scedule = scrapy.Field()
    add1 = scrapy.Field()
    add2 = scrapy.Field()
```
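As a quick sanity check on the item definition (a sketch using Scrapy's standard `Item.fields` mapping; the import path matches the spider above):

```Python3
# With scrapy.Field() the field names are registered in Shop_data.fields;
# with the bare class they would be missing.
from slotcrawler.items import Shop_data

print(sorted(Shop_data.fields))  # expected: ['add1', 'add2', 'name', 'scedule']
```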
In settings.py I only added a download delay.
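For completeness, a sketch of what that addition presumably looks like, inferred from the 'DOWNLOAD_DELAY': 1 entry in the "Overridden settings" line of the log below:

```Python3
# settings.py (sketch; this file is not shown in the original question):
# wait one second between requests, matching 'DOWNLOAD_DELAY': 1 in the log.
DOWNLOAD_DELAY = 1
```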
```terminal
2018-04-15 22:24:53 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: slotcrawler)
2018-04-15 22:24:53 [scrapy.utils.log] INFO: Versions: lxml 3.5.0.0, libxml2 2.9.3, cssselect 1.0.1, parsel 1.3.1, w3lib 1.18.0, Twisted 17.9.0, Python 3.5.2 (default, Nov 23 2017, 16:37:01) - [GCC 5.4.0 20160609], pyOpenSSL 17.5.0 (OpenSSL 1.1.0g 2 Nov 2017), cryptography 2.1.4, Platform Linux-4.4.0-119-generic-x86_64-with-Ubuntu-16.04-xenial
2018-04-15 22:24:53 [scrapy.crawler] INFO: Overridden settings: {'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['slotcrawler.spiders'], 'NEWSPIDER_MODULE': 'slotcrawler.spiders', 'FEED_FORMAT': 'jl', 'FEED_URI': 'hall.jl', 'DOWNLOAD_DELAY': 1, 'BOT_NAME': 'slotcrawler'}
2018-04-15 22:24:53 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.telnet.TelnetConsole']
2018-04-15 22:24:53 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2018-04-15 22:24:53 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2018-04-15 22:24:53 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2018-04-15 22:24:53 [scrapy.core.engine] INFO: Spider opened
2018-04-15 22:24:53 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-04-15 22:24:53 [scrapy.extensions.telnet] DEBUG: Telnet console listening on *************
2018-04-15 22:24:53 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://777.slopachi-station.com/robots.txt> (referer: None)
2018-04-15 22:24:54 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://777.slopachi-station.com/report_schedule/> (referer: None)
2018-04-15 22:24:55 [scrapy.core.engine] INFO: Closing spider (finished)
2018-04-15 22:24:55 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 534,
 'downloader/request_count': 2,
 'downloader/request_method_count/GET': 2,
 'downloader/response_bytes': 11277,
 'downloader/response_count': 2,
 'downloader/response_status_count/200': 2,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2018, 4, 15, 13, 24, 55, 9470),
 'log_count/DEBUG': 3,
 'log_count/INFO': 7,
 'memusage/max': 54620160,
 'memusage/startup': 54620160,
 'response_received_count': 2,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'start_time': datetime.datetime(2018, 4, 15, 13, 24, 53, 523839)}
2018-04-15 22:24:55 [scrapy.core.engine] INFO: Spider closed (finished)
```
As you can see, neither the pager nor the detail pages are being followed at all.
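One way to narrow this down is to test the extractors interactively (a diagnostic sketch; run `scrapy shell http://777.slopachi-station.com/report_schedule/` first so that `response` is defined):

```Python3
from scrapy.linkextractors import LinkExtractor

# If either call prints an empty list, that `allow` pattern matches no
# link on the page, which would explain why nothing beyond the start
# URL is requested.
print(LinkExtractor(allow=r'/report_schedule/page/\d+/').extract_links(response))
print(LinkExtractor(allow=r'/shop_data/\d+/').extract_links(response))
```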
Please tell me how to solve this.
Added (new body, continued):

How can I make the spider follow the "next" link?
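A minimal sketch of one common approach, assuming the pager has a "next" anchor that can be located by its text (the XPath below is hypothetical and would need to match the page's actual markup):

```Python3
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

rules = [
    # Hypothetical: extract only links found inside the pager's "next"
    # anchor, instead of every numbered page link.
    Rule(LinkExtractor(restrict_xpaths='//a[contains(., "次へ")]'),
         follow=True),
]
```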