質問編集履歴

2

コード追加

2023/12/21 09:29

投稿

SugiuraY
SugiuraY

スコア318

test CHANGED
File without changes
test CHANGED
@@ -72,3 +72,132 @@
72
72
  ```
73
73
  よろしくお願い申し上げます。
74
74
 
75
+ 【追記】
76
+ 修正コード
77
+ ```html
78
+ <!--index.html-->
79
+ <!DOCTYPE html>
80
+ <html lang="ja">
81
+
82
+ <head>
83
+ <meta charset="UTF-8">
84
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
85
+ <title>Document</title>
86
+ <script src="https://cdn.jsdelivr.net/npm/pdfjs-dist@2.13.216/legacy/build/pdf.min.js"></script>
87
+ </head>
88
+
89
+ <body>
90
+ <script type="module" src="http://localhost:8080/parse.js"></script>
91
+ </body>
92
+
93
+ </html>
94
+ ```
95
+
96
+ ```JavaScript
97
+ import pdfjsLib from 'pdfjs-dist'
98
+
99
+ function extractBuffer(buffer, options = {}, cb) {
100
+ if (!cb) {
101
+ return new Promise((resolve, reject) => {
102
+ this.extractBuffer(buffer, options, (err, data) => {
103
+ if (err) {
104
+ reject(err);
105
+ } else {
106
+ resolve(data);
107
+ }
108
+ })
109
+ });
110
+ }
111
+ // Loading file from file system into typed array
112
+ if (options.verbosity === undefined) {
113
+ // get rid of all warnings in nodejs usage
114
+ options.verbosity = -1;
115
+ }
116
+ if (options.cMapUrl === undefined) {
117
+ options.cMapUrl = path.join(__dirname, "./cmaps/"); // trailing path delimiter is important
118
+ }
119
+ if (options.cMapPacked === undefined) {
120
+ options.cMapPacked = true;
121
+ }
122
+ if (options.CMapReaderFactory === undefined) {
123
+ options.CMapReaderFactory = LocalCMapReaderFactory;
124
+ }
125
+ options.data = new Uint8Array(buffer);
126
+ const pdf = {
127
+ meta: {},
128
+ pages: []
129
+ };
130
+ // Will be using promises to load document, pages and misc data instead of callback.
131
+ pdfjsLib.getDocument(options).promise.then(doc => {
132
+ const firstPage = (options && options.firstPage) ? options.firstPage : 1;
133
+ const lastPage = Math.min((options && options.lastPage) ? options.lastPage : doc.numPages, doc.numPages);
134
+ pdf.pdfInfo = doc.pdfInfo;
135
+ const promises = [
136
+ doc.getMetadata().then(data => {
137
+ pdf.meta = {info: data.info, metadata: data.metadata ? data.metadata.getAll() || null : null};
138
+ })
139
+ ];
140
+ const loadPage = pageNum => doc.getPage(pageNum).then(page => {
141
+ const viewport = page.getViewport({scale: 1.0});
142
+ const pag = {
143
+ pageInfo: {
144
+ num: pageNum,
145
+ scale: viewport.scale,
146
+ rotation: viewport.rotation,
147
+ offsetX: viewport.offsetX,
148
+ offsetY: viewport.offsetY,
149
+ width: viewport.width,
150
+ height: viewport.height
151
+ }
152
+ };
153
+ pdf.pages.push(pag);
154
+ const normalizeWhitespace = !!(options && options.normalizeWhitespace === true);
155
+ const disableCombineTextItems = !!(options && options.disableCombineTextItems === true);
156
+ return Promise.all([
157
+ page.getAnnotations().then((annotations) => {
158
+ pag.links = annotations.filter((annot) => annot.subtype === "Link" && !!annot.url)
159
+ .map((link) => link.url);
160
+ }),
161
+ page.getTextContent({normalizeWhitespace, disableCombineTextItems}).then((content) => {
162
+ // Content contains lots of information about the text layout and styles, but we need only strings at the moment
163
+ pag.content = content.items.map(item => {
164
+ const tm = item.transform;
165
+ let x = tm[4];
166
+ let y = pag.pageInfo.height - tm[5];
167
+ if (viewport.rotation === 90) {
168
+ x = tm[5];
169
+ y = tm[4];
170
+ }
171
+ // see https://github.com/mozilla/pdf.js/issues/8276
172
+ const height = Math.sqrt(tm[2] * tm[2] + tm[3] * tm[3]);
173
+ return {
174
+ x: x,
175
+ y: y,
176
+ str: item.str,
177
+ dir: item.dir,
178
+ width: item.width,
179
+ height: height,
180
+ fontName: item.fontName
181
+ };
182
+ });
183
+ })
184
+ ]).then(() => {
185
+ // console.log("done page content parsing");
186
+ }, (err) => {
187
+ cb(err);
188
+ });
189
+ });
190
+ for (let i = firstPage; i <= lastPage; i++) {
191
+ promises.push(loadPage(i));
192
+ }
193
+ return Promise.all(promises);
194
+ }).then(() => {
195
+ pdf.pages.sort((a, b) => a.pageInfo.num - b.pageInfo.num);
196
+ cb(null, pdf);
197
+ }, (err) => {
198
+ cb(err)
199
+ });
200
+ }
201
+
202
+
203
+ ```

1

package.jsonについて加筆しています

2023/12/21 06:59

投稿

SugiuraY
SugiuraY

スコア318

test CHANGED
File without changes
test CHANGED
@@ -59,5 +59,16 @@
59
59
  ```
60
60
  本質的にはJavaScriptでPDFのテキストを抽出したいというところではあるのですが、他にあまり良い代替案も見つけることができず、この`browserify`をうまく突破するためのアドバイスをいただけると幸いです。
61
61
 
62
+ ### npmの状況
63
+ ```package.json
64
+ {
65
+ "dependencies": {
66
+ "browserify": "^17.0.0",
67
+ "http-server": "^14.1.1",
68
+ "pdf.js-extract": "^0.2.1",
69
+ "uniq": "^1.0.1"
70
+ }
71
+ }
72
+ ```
62
73
  よろしくお願い申し上げます。
63
74