Question edit history

8

2019/07/07 23:25

Posted

abokadoishii

Score 12

test CHANGED
File without changes
test CHANGED
@@ -18,68 +18,68 @@
 
  parse error: Invalid numeric literal at line 3, column 0
 
+
+
+ ### Source code
+
+ ```Python
+
+
+
+ import scrapy
+
+ from myproject.items import Page
+
+ from myproject.utils import get_content
+
+ from bs4 import BeautifulSoup
+
+
+
+ class BroadSpider(scrapy.Spider):
+
+ name = 'broad'
+
+ allowed_domains = ['b.hatena.ne.jp/entrylist']
+
+ start_urls = ['http://b.hatena.ne.jp/entrylist/']
+
+
+
+ def parse(self, response):
+
+ print('\n\nresponse:{}\n\n'.format(response))
+
+ for url in response.css('.entrylist-contents-title a::attr("herf")').extract():
+
+ yield scrapy.Request(url,callback=self.parse_page)
+
+ print("\n\nurl:{}\n\n".format(url))
+
+
+
+
+
+ url_more=response.css('a::attr("href")').re_first(r'.*?of=\d{2}$')
+
+ print("\n\nurl_more:{}\n\n".format(url_more))
+
+ if url_more:
+
+ yield scrapy.Request(responce.urljoin(url_more))
+
+
+
+ def parse_page(self, response):
+
+ print('\n\npase_page\n\n')
+
+ title, content = get_content(reaponse.text)
+
+ yield Page(url=responce.url, title=title , content=content)
+
  ```
 
- ### Source code
-
- ```Python
-
-
-
- import scrapy
-
- from myproject.items import Page
-
- from myproject.utils import get_content
-
- from bs4 import BeautifulSoup
-
-
-
- class BroadSpider(scrapy.Spider):
-
- name = 'broad'
-
- allowed_domains = ['b.hatena.ne.jp/entrylist']
-
- start_urls = ['http://b.hatena.ne.jp/entrylist/']
-
-
-
- def parse(self, response):
-
- print('\n\nresponse:{}\n\n'.format(response))
-
- for url in response.css('.entrylist-contents-title a::attr("herf")').extract():
-
- yield scrapy.Request(url,callback=self.parse_page)
-
- print("\n\nurl:{}\n\n".format(url))
-
-
-
-
-
- url_more=response.css('a::attr("href")').re_first(r'.*?of=\d{2}$')
-
- print("\n\nurl_more:{}\n\n".format(url_more))
-
- if url_more:
-
- yield scrapy.Request(responce.urljoin(url_more))
-
-
-
- def parse_page(self, response):
-
- print('\n\npase_page\n\n')
-
- title, content = get_content(reaponse.text)
-
- yield Page(url=responce.url, title=title , content=content)
-
- ```
-
  ###Result of running scrapy crawl broad -o page.jl
 
 
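For reference, the spider shown in this latest revision spells several identifiers inconsistently: `herf` where the HTML attribute is `href`, and `responce`/`reaponse` where the callback argument is named `response`. A minimal corrected sketch of the same spider, assuming the `Page` item and `get_content` helper from `myproject` behave as the question implies, and with `allowed_domains` narrowed to the bare domain `b.hatena.ne.jp` (an adjustment not present in the question), could look like this:

```python
import scrapy

from myproject.items import Page          # as imported in the question
from myproject.utils import get_content   # as imported in the question


class BroadSpider(scrapy.Spider):
    name = 'broad'
    # allowed_domains expects bare domains; the '/entrylist' path from the
    # question's version is dropped here (an assumption, not from the source)
    allowed_domains = ['b.hatena.ne.jp']
    start_urls = ['http://b.hatena.ne.jp/entrylist/']

    def parse(self, response):
        # the attribute is "href"; "herf" matches nothing, so no entry URLs are yielded
        for url in response.css('.entrylist-contents-title a::attr("href")').extract():
            yield scrapy.Request(url, callback=self.parse_page)

        # follow the pagination link when one is found
        url_more = response.css('a::attr("href")').re_first(r'.*?of=\d{2}$')
        if url_more:
            yield scrapy.Request(response.urljoin(url_more))

    def parse_page(self, response):
        title, content = get_content(response.text)
        yield Page(url=response.url, title=title, content=content)
```

With the identifiers spelled consistently, `parse_page` becomes reachable; whether the CSS selector actually returns entry URLs still depends on the page markup.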

7

2019/07/07 23:25

Posted

abokadoishii

Score 12

test CHANGED
File without changes
test CHANGED
@@ -20,11 +20,11 @@
 
  ```
 
- ###Source code
+ ### Source code
 
  ```Python
 
- `
+
 
  import scrapy
 

6

2019/07/07 23:24

Posted

abokadoishii

Score 12

test CHANGED
File without changes
test CHANGED
File without changes

5

2019/07/07 23:23

Posted

abokadoishii

Score 12

test CHANGED
File without changes
test CHANGED
@@ -20,6 +20,8 @@
 
  ```
 
+ ###Source code
+
  ```Python
 
  `

4

2019/07/07 23:23

Posted

abokadoishii

Score 12

test CHANGED
File without changes
test CHANGED
@@ -20,70 +20,64 @@
 
  ```
 
-
+ ```Python
+
-
+ `
+
+ import scrapy
+
+ from myproject.items import Page
+
+ from myproject.utils import get_content
+
+ from bs4 import BeautifulSoup
+
+
+
+ class BroadSpider(scrapy.Spider):
+
+ name = 'broad'
+
+ allowed_domains = ['b.hatena.ne.jp/entrylist']
+
+ start_urls = ['http://b.hatena.ne.jp/entrylist/']
+
+
+
+ def parse(self, response):
+
+ print('\n\nresponse:{}\n\n'.format(response))
+
+ for url in response.css('.entrylist-contents-title a::attr("herf")').extract():
+
+ yield scrapy.Request(url,callback=self.parse_page)
+
+ print("\n\nurl:{}\n\n".format(url))
+
+
+
+
+
+ url_more=response.css('a::attr("href")').re_first(r'.*?of=\d{2}$')
+
+ print("\n\nurl_more:{}\n\n".format(url_more))
+
- ### Relevant source code
+ if url_more:
+
-
+ yield scrapy.Request(responce.urljoin(url_more))
+
+
+
-
+ def parse_page(self, response):
+
+ print('\n\npase_page\n\n')
+
+ title, content = get_content(reaponse.text)
+
+ yield Page(url=responce.url, title=title , content=content)
 
  ```
 
- import scrapy
-
- from myproject.items import Page
-
- from myproject.utils import get_content
-
- from bs4 import BeautifulSoup
-
-
-
- class BroadSpider(scrapy.Spider):
-
- name = 'broad'
-
- allowed_domains = ['b.hatena.ne.jp/entrylist']
-
- start_urls = ['http://b.hatena.ne.jp/entrylist/']
-
-
-
- def parse(self, response):
-
- print('\n\nresponse:{}\n\n'.format(response))
-
- for url in response.css('.entrylist-contents-title a::attr("herf")').extract():
-
- yield scrapy.Request(url,callback=self.parse_page)
-
- print("\n\nurl:{}\n\n".format(url))
-
-
-
-
-
- url_more=response.css('a::attr("href")').re_first(r'.*?of=\d{2}$')
-
- print("\n\nurl_more:{}\n\n".format(url_more))
-
- if url_more:
-
- yield scrapy.Request(responce.urljoin(url_more))
-
-
-
- def parse_page(self, response):
-
- print('\n\npase_page\n\n')
-
- title, content = get_content(reaponse.text)
-
- yield Page(url=responce.url, title=title , content=content)
-
- ```
-
-
-
  ###Result of running scrapy crawl broad -o page.jl
 
 

3

2019/07/07 23:22

Posted

abokadoishii

Score 12

test CHANGED
File without changes
test CHANGED
@@ -14,20 +14,20 @@
 
 
 
+
+
+ parse error: Invalid numeric literal at line 3, column 0
+
  ```
 
+
+
- parse error: Invalid numeric literal at line 3, column 0
+ ### Relevant source code
+
+
 
  ```
 
-
-
- ### Relevant source code
-
-
-
- ```python
-
  import scrapy
 
  from myproject.items import Page
@@ -80,7 +80,7 @@
 
  yield Page(url=responce.url, title=title , content=content)
 
-
+ ```
 
 
 

2

Typo fix

2019/07/07 23:20

Posted

abokadoishii

Score 12

test CHANGED
File without changes
test CHANGED
@@ -234,4 +234,4 @@
 
 
 
- The contents of pages was 0KB, with nothing written in it.
+ page was not generated.

1

Added the result of running scrapy crawl broad -o page.jl.

2019/07/07 16:07

Posted

abokadoishii

Score 12

test CHANGED
File without changes
test CHANGED
@@ -26,9 +26,7 @@
 
 
 
- ```Enter the language name here
-
- # -*- python -*-
+ ```python
 
  import scrapy
 
@@ -84,6 +82,156 @@
 
 
 
-
-
- ```
+
+
+ ###Result of running scrapy crawl broad -o page.jl
+
+
+
+ Page
+
+
+
+ 2019-07-08 00:57:35 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: myproject)
+
+ 2019-07-08 00:57:35 [scrapy.utils.log] INFO: Versions: lxml 4.3.3.0, libxml2 2.9.5, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.2.0, Python 3.7.3 (v3.7.3:ef4ec6ed12, Mar 25 2019, 21:26:53) [MSC v.1916 32 bit (Intel)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1b 26 Feb 2019), cryptography 2.6.1, Platform Windows-10-10.0.17134-SP0
+
+ 2019-07-08 00:57:35 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'myproject', 'DOWNLOAD_DELAY': 3, 'FEED_FORMAT': 'jl', 'FEED_URI': 'page.jl', 'NEWSPIDER_MODULE': 'myproject.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['myproject.spiders']}
+
+ 2019-07-08 00:57:35 [scrapy.extensions.telnet] INFO: Telnet Password: 2305ec51de6031e2
+
+ 2019-07-08 00:57:35 [scrapy.middleware] INFO: Enabled extensions:
+
+ ['scrapy.extensions.corestats.CoreStats',
+
+ 'scrapy.extensions.telnet.TelnetConsole',
+
+ 'scrapy.extensions.feedexport.FeedExporter',
+
+ 'scrapy.extensions.logstats.LogStats']
+
+ 2019-07-08 00:57:36 [scrapy.middleware] INFO: Enabled downloader middlewares:
+
+ ['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
+
+ 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
+
+ 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
+
+ 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
+
+ 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
+
+ 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
+
+ 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
+
+ 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
+
+ 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
+
+ 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
+
+ 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
+
+ 'scrapy.downloadermiddlewares.stats.DownloaderStats']
+
+ 2019-07-08 00:57:36 [scrapy.middleware] INFO: Enabled spider middlewares:
+
+ ['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
+
+ 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
+
+ 'scrapy.spidermiddlewares.referer.RefererMiddleware',
+
+ 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
+
+ 'scrapy.spidermiddlewares.depth.DepthMiddleware']
+
+ 2019-07-08 00:57:36 [scrapy.middleware] INFO: Enabled item pipelines:
+
+ []
+
+ 2019-07-08 00:57:36 [scrapy.core.engine] INFO: Spider opened
+
+ 2019-07-08 00:57:36 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
+
+ 2019-07-08 00:57:36 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
+
+ 2019-07-08 00:57:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://b.hatena.ne.jp/robots.txt> (referer: None)
+
+ 2019-07-08 00:57:40 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://b.hatena.ne.jp/entrylist/> from <GET http://b.hatena.ne.jp/entrylist/>
+
+ 2019-07-08 00:57:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://b.hatena.ne.jp/entrylist/> (referer: None)
+
+
+
+
+
+ response:<200 https://b.hatena.ne.jp/entrylist/>
+
+
+
+
+
+
+
+
+
+ url_more:None
+
+
+
+
+
+ 2019-07-08 00:57:43 [scrapy.core.engine] INFO: Closing spider (finished)
+
+ 2019-07-08 00:57:43 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
+
+ {'downloader/request_bytes': 715,
+
+ 'downloader/request_count': 3,
+
+ 'downloader/request_method_count/GET': 3,
+
+ 'downloader/response_bytes': 43640,
+
+ 'downloader/response_count': 3,
+
+ 'downloader/response_status_count/200': 2,
+
+ 'downloader/response_status_count/301': 1,
+
+ 'finish_reason': 'finished',
+
+ 'finish_time': datetime.datetime(2019, 7, 7, 15, 57, 43, 302541),
+
+ 'log_count/DEBUG': 3,
+
+ 'log_count/INFO': 9,
+
+ 'response_received_count': 2,
+
+ 'robotstxt/request_count': 1,
+
+ 'robotstxt/response_count': 1,
+
+ 'robotstxt/response_status_count/200': 1,
+
+ 'scheduler/dequeued': 2,
+
+ 'scheduler/dequeued/memory': 2,
+
+ 'scheduler/enqueued': 2,
+
+ 'scheduler/enqueued/memory': 2,
+
+ 'start_time': datetime.datetime(2019, 7, 7, 15, 57, 36, 203946)}
+
+ 2019-07-08 00:57:43 [scrapy.core.engine] INFO: Spi
+
+ der closed (finished)
+
+
+
+ The contents of pages was 0KB, with nothing written in it.