質問編集履歴
2
コード追加
test
CHANGED
File without changes
|
test
CHANGED
@@ -72,3 +72,132 @@
|
|
72
72
|
```
|
73
73
|
よろしくお願い申し上げます。
|
74
74
|
|
75
|
+
【追記】
|
76
|
+
修正コード
|
77
|
+
```html
|
78
|
+
<!--index.html-->
|
79
|
+
<!DOCTYPE html>
|
80
|
+
<html lang="ja">
|
81
|
+
|
82
|
+
<head>
|
83
|
+
<meta charset="UTF-8">
|
84
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
85
|
+
<title>Document</title>
|
86
|
+
<script src="https://cdn.jsdelivr.net/npm/pdfjs-dist@2.13.216/legacy/build/pdf.min.js"></script>
|
87
|
+
</head>
|
88
|
+
|
89
|
+
<body>
|
90
|
+
<script type="module" src="http://localhost:8080/parse.js"></script>
|
91
|
+
</body>
|
92
|
+
|
93
|
+
</html>
|
94
|
+
```
|
95
|
+
|
96
|
+
```JavaScript
|
97
|
+
import pdfjsLib from 'pdfjs-dist'
|
98
|
+
|
99
|
+
/**
 * Builds the option object handed to `pdfjsLib.getDocument` without touching
 * the caller's object.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer - Raw PDF bytes.
 * @param {object|null|undefined} options - Caller-supplied options.
 * @returns {object} A fresh options object with defaults filled in and
 *   `data` set to a `Uint8Array` view/copy of `buffer`.
 */
function buildPdfOptions(buffer, options) {
  // Copy first: the defaults below must not leak into the caller's object.
  // Object.assign tolerates a null/undefined source.
  const opts = Object.assign({}, options);
  if (opts.verbosity === undefined) {
    // get rid of all warnings in nodejs usage
    opts.verbosity = -1;
  }
  // `path` and `__dirname` are not defined by this function; they only exist
  // when the surrounding module provides them (e.g. a CommonJS build).
  // Guard with typeof so their absence does not raise a ReferenceError.
  if (
    opts.cMapUrl === undefined &&
    typeof path !== "undefined" &&
    typeof __dirname !== "undefined"
  ) {
    opts.cMapUrl = path.join(__dirname, "./cmaps/"); // trailing path delimiter is important
  }
  if (opts.cMapPacked === undefined) {
    opts.cMapPacked = true;
  }
  // Same reasoning as above: only default to LocalCMapReaderFactory when the
  // surrounding module actually defines it.
  if (
    opts.CMapReaderFactory === undefined &&
    typeof LocalCMapReaderFactory !== "undefined"
  ) {
    opts.CMapReaderFactory = LocalCMapReaderFactory;
  }
  opts.data = new Uint8Array(buffer);
  return opts;
}

/**
 * Loads one page and reduces it to plain data: viewport info, link URLs and
 * positioned text items.
 *
 * @param {object} doc - Loaded document exposing `getPage(pageNum)`.
 * @param {number} pageNum - 1-based page number.
 * @param {object} opts - Options; `normalizeWhitespace` and
 *   `disableCombineTextItems` are forwarded to `getTextContent` when `true`.
 * @returns {Promise<object>} Resolves with `{pageInfo, links, content}`;
 *   rejects if any page call rejects.
 */
function loadPdfPage(doc, pageNum, opts) {
  return doc.getPage(pageNum).then((page) => {
    const viewport = page.getViewport({ scale: 1.0 });
    const pag = {
      pageInfo: {
        num: pageNum,
        scale: viewport.scale,
        rotation: viewport.rotation,
        offsetX: viewport.offsetX,
        offsetY: viewport.offsetY,
        width: viewport.width,
        height: viewport.height
      }
    };
    const normalizeWhitespace = opts.normalizeWhitespace === true;
    const disableCombineTextItems = opts.disableCombineTextItems === true;
    return Promise.all([
      page.getAnnotations().then((annotations) => {
        // Keep only link annotations that actually carry a URL.
        pag.links = annotations
          .filter((annot) => annot.subtype === "Link" && !!annot.url)
          .map((link) => link.url);
      }),
      page.getTextContent({ normalizeWhitespace, disableCombineTextItems }).then((content) => {
        // Content contains lots of information about the text layout and
        // styles, but we need only strings at the moment
        pag.content = content.items.map((item) => {
          const tm = item.transform;
          let x = tm[4];
          // Flip the vertical coordinate against the page height.
          let y = pag.pageInfo.height - tm[5];
          if (viewport.rotation === 90) {
            x = tm[5];
            y = tm[4];
          }
          // see https://github.com/mozilla/pdf.js/issues/8276
          const height = Math.sqrt(tm[2] * tm[2] + tm[3] * tm[3]);
          return {
            x: x,
            y: y,
            str: item.str,
            dir: item.dir,
            width: item.width,
            height: height,
            fontName: item.fontName
          };
        });
      })
      // No rejection handler here: a page failure must propagate so the
      // caller is notified exactly once, with the error.
    ]).then(() => pag);
  });
}

/**
 * Collects metadata and the requested page range from a loaded document.
 *
 * @param {object} doc - Loaded document (`numPages`, `getMetadata`, `getPage`).
 * @param {object} opts - Options; `firstPage` / `lastPage` bound the range.
 * @returns {Promise<object>} Resolves with `{meta, pages, pdfInfo}`, pages in
 *   ascending page-number order.
 */
function collectPdfData(doc, opts) {
  const firstPage = opts.firstPage ? opts.firstPage : 1;
  // Never read past the end of the document.
  const lastPage = Math.min(opts.lastPage ? opts.lastPage : doc.numPages, doc.numPages);
  const pdf = {
    meta: {},
    pages: [],
    pdfInfo: doc.pdfInfo
  };
  const metaLoaded = doc.getMetadata().then((data) => {
    pdf.meta = {
      info: data.info,
      metadata: data.metadata ? data.metadata.getAll() || null : null
    };
  });
  const pageLoads = [];
  for (let i = firstPage; i <= lastPage; i++) {
    pageLoads.push(loadPdfPage(doc, i, opts));
  }
  // Promise.all keeps input order, so pages come back sorted by page number.
  return Promise.all([metaLoaded, Promise.all(pageLoads)]).then((results) => {
    pdf.pages = results[1];
    return pdf;
  });
}

/**
 * Extracts metadata, link URLs and positioned text items from a PDF buffer
 * using `pdfjsLib.getDocument`.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer - Raw PDF bytes.
 * @param {object} [options={}] - Options forwarded to `pdfjsLib.getDocument`,
 *   plus `firstPage`, `lastPage`, `normalizeWhitespace` and
 *   `disableCombineTextItems`. The object is not modified.
 * @param {function(Error|null, object=): void} [cb] - Node-style callback,
 *   invoked exactly once. When omitted, a promise is returned instead.
 * @returns {Promise<object>|undefined} A promise for `{meta, pages, pdfInfo}`
 *   when no callback is given, otherwise `undefined`.
 */
function extractBuffer(buffer, options = {}, cb) {
  // Run inside a promise executor so synchronous failures (bad buffer,
  // getDocument throwing) surface as a rejection / callback error.
  const parsed = new Promise((resolve) => {
    const opts = buildPdfOptions(buffer, options);
    // Will be using promises to load document, pages and misc data instead of callback.
    resolve(pdfjsLib.getDocument(opts).promise.then((doc) => collectPdfData(doc, opts)));
  });
  if (!cb) {
    return parsed;
  }
  // Two-argument then(): an exception thrown by cb on success is not routed
  // back into cb as an error.
  parsed.then((pdf) => {
    cb(null, pdf);
  }, (err) => {
    cb(err);
  });
}
|
201
|
+
|
202
|
+
|
203
|
+
```
|
1
package.jsonについて加筆しています
test
CHANGED
File without changes
|
test
CHANGED
@@ -59,5 +59,16 @@
|
|
59
59
|
```
|
60
60
|
本質的にはJavaScriptでPDFのテキストを抽出したいというところではあるのですが、他にあまり良い代替案も見つけることができず、この`browserify`をうまく突破するためのアドバイスをいただけると幸いです。
|
61
61
|
|
62
|
+
### npmの状況
|
63
|
+
```package.json
|
64
|
+
{
|
65
|
+
"dependencies": {
|
66
|
+
"browserify": "^17.0.0",
|
67
|
+
"http-server": "^14.1.1",
|
68
|
+
"pdf.js-extract": "^0.2.1",
|
69
|
+
"uniq": "^1.0.1"
|
70
|
+
}
|
71
|
+
}
|
72
|
+
```
|
62
73
|
よろしくお願い申し上げます。
|
63
74
|
|