PDFのテキストをJavascript（クライアント/ブラウザ）で抽出したいがうまくいきません。

実現したいこと

掲題の通り、PDFのテキストをJavaScriptを利用して抽出したいと考えております。そこで利用可能なものとして、pdf.js-extractを見つけ、興味を持ち、利用したいと考えております。

前提

ここでいうJavaScriptはクライアント・ブラウザで動作させることができるものとして考えております。
当方npm等のパッケージ管理には不案内ではありますが、npm自体の導入は無事完了しており（npm --ver の結果は 10.2.3）、pdf.js-extractにある指示に従い、npm i pdf.js-extractによりローカルインストールを完了しております。

発生している問題・エラーメッセージ

上記の準備を終え、Example Usageにある通り、以下のコードをindex.htmlで実行いたしました。

HTML
1<!--index.html-->
2<!DOCTYPE html>
3<html lang="ja">
4
5<head>
6    <meta charset="UTF-8">
7    <meta name="viewport" content="width=device-width, initial-scale=1.0">
8    <title>Document</title>
9
10</head>
11
12<body>
13
14    <script>
15        const PDFExtract = require('pdf.js-extract').PDFExtract;
16        const pdfExtract = new PDFExtract();
17        const options = {}; /* see below */
18        pdfExtract.extract('test.pdf', options, (err, data) => {
19            if (err) return console.log(err);
20            console.log(data);
21        });
22    </script>
23</body>
24</html>

するとブラウザのDeveloper Tool上に以下のようなメッセージが出力されました。

Uncaught ReferenceError: require is not defined

試したこと

この原因を調べてみると、requireはnode.jsで利用可能なもので、ブラウザやクライアントサイドのJavaScriptではそのまま利用できないことをこちらの記事で知りました。
同記事にもある、browserifyでbundleして利用可能なものとすべく、npmでそれぞれ以下を叩きました。
npm install -g browserify
npm install pdf.js-extract
browserify parse.js -o bundle.js
すると最終的にコマンドプロンプトに以下のメッセージが出力され、bundleすることができずに、行き詰ってしまいました。

C:\Users\XXX\Desktop\PJT_PP>browserify parse.js -o bundle.js
Error: Can't walk dependency graph: Cannot find module 'canvas' from 'C:\Users\XXX\Desktop\PJT_PP\node_modules\pdf.js-extract\lib\pdfjs\pdf.js'
    required by C:\Users\XXX\Desktop\PJT_PP\node_modules\pdf.js-extract\lib\pdfjs\pdf.js
    at C:\Users\XXX\AppData\Roaming\npm\node_modules\browserify\node_modules\resolve\lib\async.js:146:35
    at processDirs (C:\Users\XXX\AppData\Roaming\npm\node_modules\browserify\node_modules\resolve\lib\async.js:299:39)
    at isdir (C:\Users\XXX\AppData\Roaming\npm\node_modules\browserify\node_modules\resolve\lib\async.js:306:32)
    at C:\Users\XXX\AppData\Roaming\npm\node_modules\browserify\node_modules\resolve\lib\async.js:34:69
    at FSReqCallback.oncomplete (node:fs:199:21)

本質的にはJavaScriptでPDFのテキストを抽出したいというところではあるのですが、他にあまり良い代替案も見つけることができず、このbrowserifyをうまく突破するためのアドバイスをいただけると幸いです。

npmの状況

package.json
1{
2  "dependencies": {
3    "browserify": "^17.0.0",
4    "http-server": "^14.1.1",
5    "pdf.js-extract": "^0.2.1",
6    "uniq": "^1.0.1"
7  }
8}

よろしくお願い申し上げます。

【追記】
修正コード

html
1<!--index.html-->
2<!DOCTYPE html>
3<html lang="ja">
4
5<head>
6    <meta charset="UTF-8">
7    <meta name="viewport" content="width=device-width, initial-scale=1.0">
8    <title>Document</title>
9    <script src="https://cdn.jsdelivr.net/npm/pdfjs-dist@2.13.216/legacy/build/pdf.min.js"></script>
10</head>
11
12<body>
13    <script type="module" src="http://localhost:8080/parse.js"></script>
14</body>
15
16</html>

JavaScript
1    import pdfjsLib from 'pdfjs-dist'
2
3    function extractBuffer(buffer, options = {}, cb) {
4            if (!cb) {
5                return new Promise((resolve, reject) => {
6                    this.extractBuffer(buffer, options, (err, data) => {
7                        if (err) {
8                            reject(err);
9                        } else {
10                            resolve(data);
11                        }
12                    })
13                });
14            }
15            // Loading file from file system into typed array
16            if (options.verbosity === undefined) {
17                // get rid of all warnings in nodejs usage
18                options.verbosity = -1;
19            }
20            if (options.cMapUrl === undefined) {
21                options.cMapUrl = path.join(__dirname, "./cmaps/"); // trailing path delimiter is important
22            }
23            if (options.cMapPacked === undefined) {
24                options.cMapPacked = true;
25            }
26            if (options.CMapReaderFactory === undefined) {
27                options.CMapReaderFactory = LocalCMapReaderFactory;
28            }
29            options.data = new Uint8Array(buffer);
30            const pdf = {
31                meta: {},
32                pages: []
33            };
34            // Will be using promises to load document, pages and misc data instead of callback.
35            pdfjsLib.getDocument(options).promise.then(doc => {
36                const firstPage = (options && options.firstPage) ? options.firstPage : 1;
37                const lastPage = Math.min((options && options.lastPage) ? options.lastPage : doc.numPages, doc.numPages);
38                pdf.pdfInfo = doc.pdfInfo;
39                const promises = [
40                    doc.getMetadata().then(data => {
41                        pdf.meta = {info: data.info, metadata: data.metadata ? data.metadata.getAll() || null : null};
42                    })
43                ];
44                const loadPage = pageNum => doc.getPage(pageNum).then(page => {
45                    const viewport = page.getViewport({scale: 1.0});
46                    const pag = {
47                        pageInfo: {
48                            num: pageNum,
49                            scale: viewport.scale,
50                            rotation: viewport.rotation,
51                            offsetX: viewport.offsetX,
52                            offsetY: viewport.offsetY,
53                            width: viewport.width,
54                            height: viewport.height
55                        }
56                    };
57                    pdf.pages.push(pag);
58                    const normalizeWhitespace = !!(options && options.normalizeWhitespace === true);
59                    const disableCombineTextItems = !!(options && options.disableCombineTextItems === true);
60                    return Promise.all([
61                        page.getAnnotations().then((annotations) => {
62                            pag.links = annotations.filter((annot) => annot.subtype === "Link" && !!annot.url)
63                            .map((link) => link.url);
64                        }),
65                        page.getTextContent({normalizeWhitespace, disableCombineTextItems}).then((content) => {
66                            // Content contains lots of information about the text layout and styles, but we need only strings at the moment
67                            pag.content = content.items.map(item => {
68                                const tm = item.transform;
69                                let x = tm[4];
70                                let y = pag.pageInfo.height - tm[5];
71                                if (viewport.rotation === 90) {
72                                    x = tm[5];
73                                    y = tm[4];
74                                }
75                                // see https://github.com/mozilla/pdf.js/issues/8276
76                                const height = Math.sqrt(tm[2] * tm[2] + tm[3] * tm[3]);
77                                return {
78                                    x: x,
79                                    y: y,
80                                    str: item.str,
81                                    dir: item.dir,
82                                    width: item.width,
83                                    height: height,
84                                    fontName: item.fontName
85                                };
86                            });
87                        })
88                    ]).then(() => {
89                        // console.log("done page content parsing");
90                    }, (err) => {
91                        cb(err);
92                    });
93                });
94                for (let i = firstPage; i <= lastPage; i++) {
95                    promises.push(loadPage(i));
96                }
97                return Promise.all(promises);
98            }).then(() => {
99                pdf.pages.sort((a, b) => a.pageInfo.num - b.pageInfo.num);
100                cb(null, pdf);
101            }, (err) => {
102                cb(err)
103            });
104        }
105
106

hoshi-takanori

2023/12/20 20:58

ブラウザで動かすには修正が必要みたいですね。 https://github.com/ffalt/pdf.js-extract/issues/11

SugiuraY

2023/12/21 00:49

コメントありがとうございます。内容をみるとextractBuffer()メソッドをfunctionのように書き換えなさいということで、その前提としてimport pdfjsLib from 'pdfjs-dist'とありました。 1. 直接HTMLファイルのscriptタグでimportを記載すると「Cannot use import statement outside a module」とブラウザのDeveloper Tool側でエラーが出力されたため、その後調べて<script module="parse.js">として、外部のparse.jsに以下のコードを置くことにいたしました。 import pdfjsLib from 'pdfjs-dist' function extractBuffer(buffer, options = {}, cb) { } 2.すると今度は、「file:///C:/Users/XXX/Desktop/PJT_PP/parse.js' from origin 'null' has been blocked by CORS policy:XXXX」とDeveloper Toolでエラーが出力され、これも調べるとローカルにあるファイルをコードから見に行くことはセキュリティ上できないため、いったん最速の方法としてローカルホスティング環境において、これを見に行くことにしました。 3-1. phpを書くこともありMAMP環境もあるため、htdocsにこのファイルを格納してhttp://localhostXXX.parse.jsをサーバーを起動したうえで参照しにいっても、上記と同じエラーが解消されませんでした。 3-2.以下の記事にあったようなhttp-serverを利用した簡単環境構築で<script module="http://localhost:8080/parse.js">で参照しに行ってみても、「'http://localhost:8080/parse.js' from origin 'null' has been blocked by CORS policy: 」のエラーが出力されここで行き詰ってしまいました。。。 https://qiita.com/terufumi1122/items/39b2a3659bc585c07f64

Lhankor_Mhy

2023/12/21 03:40

ブラウザでの使用でいいんですよね？ <script module="http://localhost:8080/parse.js"> とのことですが、標準的なHTMLですと script 要素に module という属性はないのですが、何かのフレームワークを使っていますか？　コンパイルされると別の記述になるとかそういうことですか？ https://developer.mozilla.org/ja/docs/Web/HTML/Element/script

SugiuraY

2023/12/21 06:58

コメントありがとうございます。しかしながら、以下のいずれの方法でも「SyntaxError: Cannot use import statement outside a module」で正しく読み込むことができませんでした。  <script>import pdfjsLib from 'pdfjs-dist'</script>  <script src="parse.js"></script> なお、package.jsonの状況については、本文に追記させていただきます。

Lhankor_Mhy

2023/12/21 07:07

とりあえず、import文はモジュールの中でしか使えないので、そのエラーが出ています。私が指摘しているのは「moduleという属性はない」であって、「モジュールを使うな」ではありません。 https://developer.mozilla.org/ja/docs/Web/JavaScript/Reference/Statements/import なお、動的インポートであれば、モジュールの外でも使えます。

SugiuraY

2023/12/21 08:04

コメントを頂きありがとうございます。 node.jsやimportにおける「モジュール」の考え方がわからずに、これもずっと調べています。エラーメッセージ等から以下のようなところへたどり着き、やはり属性として、フレームワーク等の断りなくmodule属性を使用していました。 https://web-engineer-wiki.com/javascript/error-import-statement/ 恐らく調べる方向性として間違っている気がしていて、そもそもモジュールを理解すべきとも探しているのですが、この文脈でのモジュールとはnode.jsにおけるモジュールの概念で調べるべきか、Javascriptにおける概念で調べるべきか、もしくはブラウザの仕様等より一般的広義なものとして調べるべきなのか、道標を頂けますでしょうか？

Lhankor_Mhy

2023/12/21 08:15

そのリンク先のとおりで合っていると思いますよ。type="module" と書くので合っていると思います。

SugiuraY

2023/12/21 09:32

ありがとうございます。上記コードの追加の通りに至りました。 Access to script at 'http://localhost:8080/parse.js' from origin 'null' has been blocked by CORS policy: No 'Access-Control-Allow-Origin' header is present on the requested resource. でまた阻まれ、http-serverでローカルホスト化した（上記はその結果のエラーコードで、その前は普通にparse.jsです）のですが、またここに戻ってきてしまいました・・

Lhankor_Mhy

2023/12/22 02:08

そのエラーメッセージを再現させることができないでいますが、もしかして、parse.jsだけをサーバ上で動かして、HTMLファイルをサーバ上で実行するのではなくて file:// で実行してるとかそういうことはないですか？

SugiuraY

2023/12/22 05:41

ありがとうございます。ご指摘の点はおっしゃる通りでした。長くお付き合いいただいてしまい申し訳ないのですが、importができないようです。 Failed to resolve module specifier "pdfjs-dist". Relative references must start with either "/", "./", or "../". importをするときは絶対的なアドレス指定に置き換えなくてはいけないというものを読んだ記憶はあるのですが、もともとhoshi-takanori様にご指摘いただいた下記のissuesの中では、特にそうせずにimportを使っているように見えます。 https://github.com/ffalt/pdf.js-extract/issues/11 node_modulesの中にあるpdf.js-extractに「pdfjs-dist」というフォルダやファイルは見当たらず、npmで導入したパッケージがimportでどのようにファイルを参照しているかの仕組みを存じあげず稚拙な質問で申し訳ございません。

Lhankor_Mhy

2023/12/22 08:15

webpack とか使ってバンドルしてるのかもですね。 pdfjs-dist は、pdf.js-extract ではなくて、PDF.js をビルドしたものだと思います。 https://github.com/mozilla/pdfjs-dist

Lhankor_Mhy

2023/12/22 10:28

ちょこっと試しましたが、pdfjs-dist を読み込むまでは普通にできました。ただ、extractBuffer の中にも node.js 依存のコードがあるので、frandiox が言うようなやり方では上手くいきませんでした。もしかすると、Issue 当時とその部分のコードが変化しているのかもしれないです。

SugiuraY

2023/12/22 10:43

有難うございます、ずっと粘っていますが、まだ読み込みがうまくいかず頑張ります。それは学びとして必要としてですが、その先が難しい場合、フロントサイドのJSでpdfのテキストを抽出する他のすべも探さなくてはですね。検索すると、node.jsでサーバーサイドのケースばかりでなかなか見つからず。

Lhankor_Mhy

2023/12/23 01:04 編集

https://stackoverflow.com/questions/40635979/how-to-correctly-extract-text-from-a-pdf-using-pdf-js 当方で試したところ、こちらでテキストを抜き出すことができました（使える抜き出し方だったかどうかは別として）。ただ、バージョンの違いで多少の修正が必要でした（当方では2.6.347をダウンロード済みだったのでそちらを試しました）。たぶん JavaScript の基礎的な知識は必要だと思いますが、最新バージョンでも、https://mozilla.github.io/pdf.js/examples/index.html#interactive-examples あたりのコードと比べて修正をかけていく感じでいけると思います。

Lhankor_Mhy

2023/12/23 04:04

extractBuffer の方でも、 pdfjs-dist をローカルに置いて動作しました。

Lhankor_Mhy

2023/12/23 04:15

cdn でも extractBuffer から戻り値を得たので、回答しておきます。

行動規範の内容に同意します

回答1件

ベストアンサー

戻り値にどういう意味があるのかはよくわからないですが……

html
1<!DOCTYPE html>
2<html lang="ja">
3
4<head>
5    <meta charset="UTF-8">
6    <meta name="viewport" content="width=device-width, initial-scale=1.0">
7    <title>Document</title>
8</head>
9
10<body>
11    <script type="module">
12        import pdfjsDist from 'https://cdn.jsdelivr.net/npm/pdfjs-dist@4.0.269/+esm';
13        pdfjsLib.GlobalWorkerOptions.workerSrc = "https://cdn.jsdelivr.net/npm/pdfjs-dist@4.0.269/build/pdf.worker.min.mjs";
14
15        const pdfPath = "https://raw.githubusercontent.com/mozilla/pdf.js/ba2edeae/examples/learning/helloworld.pdf";
16
17        function extractBuffer(buffer, options = {}, cb) {
18            if (!cb) {
19                return new Promise((resolve, reject) => {
20                    /*this.*/extractBuffer(buffer, options, (err, data) => {
21                    if (err) {
22                        reject(err);
23                    } else {
24                        resolve(data);
25                    }
26                })
27                });
28            }
29            // Loading file from file system into typed array
30            if (options.verbosity === undefined) {
31                // get rid of all warnings in nodejs usage
32                options.verbosity = -1;
33            }
34            // if (options.cMapUrl === undefined) {
35            //     options.cMapUrl = path.join(__dirname, "./cmaps/"); // trailing path delimiter is important
36            // }
37            // if (options.cMapPacked === undefined) {
38            //     options.cMapPacked = true;
39            // }
40            // if (options.CMapReaderFactory === undefined) {
41            //     options.CMapReaderFactory = LocalCMapReaderFactory;
42            // }
43            options.data = new Uint8Array(buffer);
44            const pdf = {
45                meta: {},
46                pages: []
47            };
48            // Will be using promises to load document, pages and misc data instead of callback.
49            pdfjsLib.getDocument(options).promise.then(doc => {
50                const firstPage = (options && options.firstPage) ? options.firstPage : 1;
51                const lastPage = Math.min((options && options.lastPage) ? options.lastPage : doc.numPages, doc.numPages);
52                pdf.pdfInfo = doc.pdfInfo;
53                const promises = [
54                    doc.getMetadata().then(data => {
55                        pdf.meta = { info: data.info, metadata: data.metadata ? data.metadata.getAll() || null : null };
56                    })
57                ];
58                const loadPage = pageNum => doc.getPage(pageNum).then(page => {
59                    const viewport = page.getViewport({ scale: 1.0 });
60                    const pag = {
61                        pageInfo: {
62                            num: pageNum,
63                            scale: viewport.scale,
64                            rotation: viewport.rotation,
65                            offsetX: viewport.offsetX,
66                            offsetY: viewport.offsetY,
67                            width: viewport.width,
68                            height: viewport.height
69                        }
70                    };
71                    pdf.pages.push(pag);
72                    const normalizeWhitespace = !!(options && options.normalizeWhitespace === true);
73                    const disableCombineTextItems = !!(options && options.disableCombineTextItems === true);
74                    return Promise.all([
75                        page.getAnnotations().then((annotations) => {
76                            pag.links = annotations.filter((annot) => annot.subtype === "Link" && !!annot.url)
77                                .map((link) => link.url);
78                        }),
79                        page.getTextContent({ normalizeWhitespace, disableCombineTextItems }).then((content) => {
80                            // Content contains lots of information about the text layout and styles, but we need only strings at the moment
81                            pag.content = content.items.map(item => {
82                                const tm = item.transform;
83                                let x = tm[4];
84                                let y = pag.pageInfo.height - tm[5];
85                                if (viewport.rotation === 90) {
86                                    x = tm[5];
87                                    y = tm[4];
88                                }
89                                // see https://github.com/mozilla/pdf.js/issues/8276
90                                const height = Math.sqrt(tm[2] * tm[2] + tm[3] * tm[3]);
91                                return {
92                                    x: x,
93                                    y: y,
94                                    str: item.str,
95                                    dir: item.dir,
96                                    width: item.width,
97                                    height: height,
98                                    fontName: item.fontName
99                                };
100                            });
101                        })
102                    ]).then(() => {
103                        // console.log("done page content parsing");
104                    }, (err) => {
105                        cb(err);
106                    });
107                });
108                for (let i = firstPage; i <= lastPage; i++) {
109                    promises.push(loadPage(i));
110                }
111                return Promise.all(promises);
112            }).then(() => {
113                pdf.pages.sort((a, b) => a.pageInfo.num - b.pageInfo.num);
114                cb(null, pdf);
115            }, (err) => {
116                cb(err)
117            });
118        }
119
120        const buffer = await (await fetch(pdfPath)).arrayBuffer();
121        const options = {}; /* see below */
122        extractBuffer(buffer, options, (err, data) => {
123            if (err) return console.log(err);
124            console.log(data);
125        });
126    </script>
127</body>
128
129</html>