teratail header banner
teratail header banner
質問する ログイン 新規登録

質問編集履歴

4

2019/05/13 22:41

投稿

RMBQsKe5AP10gjx
RMBQsKe5AP10gjx

スコア24

title CHANGED
File without changes
body CHANGED
@@ -10,7 +10,7 @@
10
10
 
11
11
  ```
12
12
  const puppeteer = require("puppeteer");
13
- var TARGET_URL = "/"
13
+ var TARGET_URL = "ヤフーニュース"
14
14
  var datas = [];
15
15
  var items = {};
16
16
  var r = 0;

3

2019/05/13 22:41

投稿

RMBQsKe5AP10gjx
RMBQsKe5AP10gjx

スコア24

title CHANGED
File without changes
body CHANGED
@@ -10,7 +10,7 @@
10
10
 
11
11
  ```
12
12
  const puppeteer = require("puppeteer");
13
- var TARGET_URL = "https://anond.hatelabo.jp/"
13
+ var TARGET_URL = "/"
14
14
  var datas = [];
15
15
  var items = {};
16
16
  var r = 0;

2

コードの修正

2019/05/13 22:41

投稿

RMBQsKe5AP10gjx
RMBQsKe5AP10gjx

スコア24

title CHANGED
File without changes
body CHANGED
@@ -10,52 +10,64 @@
10
10
 
11
11
  ```
12
12
  const puppeteer = require("puppeteer");
13
- var TARGET_URL = "https://news.yahoo.co.jp/list/"
13
+ var TARGET_URL = "https://anond.hatelabo.jp/"
14
- var LINK_LEVEL = 3;
15
14
  var datas = [];
16
- var items = [];
15
+ var items = {};
17
-
16
+ var r = 0;
18
17
  puppeteer.launch({
19
- args:['--no-sandbox','--disable-gpu','--ignore-certificate-errors'],
18
+ args:['--no-sandbox','--disable-gpu','--ignore-certificate-errors'],
20
- ignoreDefaultArgs: ['--disable-extentions'],
19
+ ignoreDefaultArgs: ['--disable-extentions'],
21
- ignoreHTTPSErrors: false,
20
+ ignoreHTTPSErrors: false,
22
- headless:false,
21
+ headless:false,
23
- slowMo :300
22
+ slowMo :300
24
23
 
25
24
  }).then(async browser => {
26
25
  try{
27
26
 
28
27
  async function downloadRec(url,level,i){
29
28
 
30
- const page = await browser.newPage();
29
+ const page = await browser.newPage();
31
- await page.goto(url,{waitUntil:"domcontentloaded"});
30
+ await page.goto(url,{waitUntil:"domcontentloaded"});
32
-
31
+ console.log(url);
33
32
  //-----------------------リンク一覧を配列に入れる----------------------
34
- const isLoadingSucceeded = await page.$('li.next a[href]').then(res => !!res);
35
33
 
36
- if (isLoadingSucceeded){
37
- while (isLoadingSucceeded) {
38
- let lists2 = await page.$$("a[href]");
34
+ let lists2 = await page.$$("a[href]");
39
- // 3ページ目まで
35
+
40
- if(level >= LINK_LEVEL )return;
41
- //基準ページ以外であれば無視
42
- var us =TARGET_URL.split("/");
43
- us.pop();
44
- var base = us.join("/");
36
+ for (let i = 0; i < lists2.length; i++) {
45
- if (url.indexOf(base) < 0)return;
46
37
  //既出のサイトであれば無視
47
- if(i[datas[i]]);
48
- for (let i = 0; i < lists2.length; i++) {
38
+ var removeDuplicates = function(object) {
39
+ var result = [], comparisons = [], key, comparison;
40
+ for (key in object) {
41
+ comparison = JSON.stringify(object[key]);
42
+ if (comparisons.indexOf(comparison) === -1) {
43
+ result.push(object[key]);
44
+ }
45
+ comparisons.push(comparison);
46
+ }
47
+
48
+ return result;
49
+ };
50
+
51
+
49
- datas.push(await (await lists2[i].getProperty('href')).jsonValue());
52
+ datas.push(await (await lists2[i].getProperty('href')).jsonValue());
53
+ result2 = await result[i].indexOf(TARGET_URL);
54
+ //await console.log(result2);
55
+ //外部サイトであれば無視
56
+ if (result2 != -1){
57
+ var result = removeDuplicates(datas);
50
- items.push( {[i] : datas[i]} );
58
+ items[r] = result[i] ;
59
+ r = r + 1;
60
+ };
51
- await console.log(datas);
61
+ await console.log(items);
62
+ };
63
+
52
- await page.click('li.next a[href]',{waitUntil:"domcontentloaded"});
64
+ await page.goto(url,{waitUntil:"domcontentloaded"});
65
+
66
+
53
-
67
+ await console.log(items[r]);
68
+ await downloadRec(items[r+1],level+1,i+1);
69
+ await console.log(downloadRec);
54
70
  };
55
- };
56
- };
57
- downloadRec(datas[i+1],level+1,i+1);
58
- };
59
71
  //----------------------------------------------------------------------------
60
72
 
61
73
  await downloadRec(TARGET_URL,0,0);

1

2019/05/13 22:32

投稿

RMBQsKe5AP10gjx
RMBQsKe5AP10gjx

スコア24

title CHANGED
File without changes
body CHANGED
File without changes