質問編集履歴
1
pythonコードの詳細追記
test
CHANGED
File without changes
|
test
CHANGED
@@ -95,3 +95,209 @@
|
|
95
95
|
もし何かアドバイスや良い案などをご教示いただければ幸いです。
|
96
96
|
|
97
97
|
よろしくお願い致します。
|
98
|
+
|
99
|
+
|
100
|
+
|
101
|
+
### pdf2txt.py
|
102
|
+
|
103
|
+
```python
|
104
|
+
|
105
|
+
#!C:\ProgramData\Anaconda3\python.exe
|
106
|
+
|
107
|
+
import sys
|
108
|
+
|
109
|
+
import io
|
110
|
+
|
111
|
+
import getopt
|
112
|
+
|
113
|
+
|
114
|
+
|
115
|
+
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
|
116
|
+
|
117
|
+
from pdfminer.pdfdevice import TagExtractor
|
118
|
+
|
119
|
+
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
|
120
|
+
|
121
|
+
from pdfminer.layout import LAParams
|
122
|
+
|
123
|
+
from pdfminer.utils import set_debug_logging
|
124
|
+
|
125
|
+
|
126
|
+
|
127
|
+
def main(argv):
    """Command-line entry point: convert the given PDF files to text/HTML/XML/tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name (only used
            in the usage message) and the rest are options followed by one or
            more input PDF paths.

    Returns:
        100 when the usage message was printed (bad option, bad option value,
        unknown output type, or no input file); otherwise ``None``.
    """
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # Layout settings are collected first and applied to a LAParams object
    # afterwards, so that "-n" combined with -A/-V/-M/-L/-W/-F works in any
    # order instead of failing with AttributeError on a None laparams.
    use_layout = True
    layout_overrides = {}
    float_layout_opts = {
        '-M': 'char_margin',
        '-L': 'line_margin',
        '-W': 'word_margin',
        '-F': 'boxes_flow',
    }

    try:
        for (k, v) in opts:
            if k == '-d':
                debug = True
            elif k == '-p':
                # Page numbers are 1-based on the command line, 0-based internally.
                pagenos.update(int(x) - 1 for x in v.split(','))
            elif k == '-m':
                maxpages = int(v)
            elif k == '-P':
                password = v
            elif k == '-o':
                outfile = v
            elif k == '-C':
                caching = False
            elif k == '-n':
                use_layout = False
            elif k == '-A':
                layout_overrides['all_texts'] = True
            elif k == '-V':
                layout_overrides['detect_vertical'] = True
            elif k in float_layout_opts:
                layout_overrides[float_layout_opts[k]] = float(v)
            elif k == '-Y':
                layoutmode = v
            elif k == '-O':
                outdir = v
            elif k == '-t':
                outtype = v
            elif k == '-c':
                codec = v
            elif k == '-s':
                scale = float(v)
    except ValueError:
        # A non-numeric value was given to a numeric option.
        return usage()

    # Resolve the output type: explicit -t wins, otherwise infer it from the
    # output file extension, falling back to plain text.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is opened, so a bad -t
    # neither truncates the target file nor leaks an open handle.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)

    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_overrides.items():
            setattr(laparams, attr, value)
    else:
        laparams = None

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                                   laparams=laparams, outdir=outdir, debug=debug)
        else:  # 'tag' -- the only remaining validated value
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                # The context manager closes each input even if processing fails.
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Never close sys.stdout; only a file this function opened itself.
        if close_outfp:
            outfp.close()
|
294
|
+
|
295
|
+
|
296
|
+
|
297
|
+
if __name__ == '__main__':
    # Run the converter and hand main()'s return value to the interpreter
    # as the process exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)
|
300
|
+
|
301
|
+
|
302
|
+
|
303
|
+
```
|