質問編集履歴

1

pythonコードの詳細追記

2018/10/28 09:26

投稿

dendenmushi
dendenmushi

スコア98

test CHANGED
File without changes
test CHANGED
@@ -95,3 +95,209 @@
95
95
  もし何かアドバイスや良い案などをご教示頂ければ幸いです。
96
96
 
97
97
  よろしくお願い致します。
98
+
99
+
100
+
101
+ ### pdf2txt.py
102
+
103
+ ```python
104
+
105
+ #!C:\ProgramData\Anaconda3\python.exe
106
+
107
+ import sys
108
+
109
+ import io
110
+
111
+ import getopt
112
+
113
+
114
+
115
+ from pdfminer.pdfinterp import PDFResourceManager, process_pdf
116
+
117
+ from pdfminer.pdfdevice import TagExtractor
118
+
119
+ from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
120
+
121
+ from pdfminer.layout import LAParams
122
+
123
+ from pdfminer.utils import set_debug_logging
124
+
125
+
126
+
127
def main(argv):
    """Convert the PDF files named on the command line to text/HTML/XML/tag output.

    Args:
        argv: Full argument vector; argv[0] is the program name (used only in
            the usage message) and argv[1:] holds the options and input files.

    Returns:
        100 when the usage message was printed (unknown option, no input
        file, malformed numeric option value, or unknown output type);
        None after a successful conversion.
    """
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # Layout settings are collected here and applied to an LAParams object
    # only after all options are read, so "-n" combined with a layout flag
    # (in any order) can no longer dereference None.
    use_layout = True
    layout_overrides = {}
    layout_flags = {'-A': 'all_texts', '-V': 'detect_vertical'}
    layout_floats = {
        '-M': 'char_margin',
        '-L': 'line_margin',
        '-W': 'word_margin',
        '-F': 'boxes_flow',
    }

    try:
        for (k, v) in opts:
            if k == '-d':
                debug = True
            elif k == '-p':
                # Page numbers are 1-based on the command line, 0-based internally.
                pagenos.update(int(x) - 1 for x in v.split(','))
            elif k == '-m':
                maxpages = int(v)
            elif k == '-P':
                password = v
            elif k == '-o':
                outfile = v
            elif k == '-C':
                caching = False
            elif k == '-n':
                use_layout = False
            elif k in layout_flags:
                layout_overrides[layout_flags[k]] = True
            elif k in layout_floats:
                layout_overrides[layout_floats[k]] = float(v)
            elif k == '-Y':
                layoutmode = v
            elif k == '-O':
                outdir = v
            elif k == '-t':
                outtype = v
            elif k == '-c':
                codec = v
            elif k == '-s':
                scale = float(v)
    except ValueError:
        # A numeric option carried a non-numeric value: report usage
        # instead of crashing with a traceback.
        return usage()

    # Without an explicit -t, infer the output type from the output file's
    # extension, defaulting to plain text.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Reject an unknown output type BEFORE opening the output file, so a bad
    # -t value neither creates/truncates the file nor leaks an open handle.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    if debug:
        set_debug_logging()

    laparams = None
    if use_layout:
        laparams = LAParams()
        for attr_name, attr_value in layout_overrides.items():
            setattr(laparams, attr_name, attr_value)

    rsrcmgr = PDFResourceManager(caching=caching)

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                                   laparams=laparams, outdir=outdir, debug=debug)
        else:  # 'tag' -- the only remaining validated value
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                # "with" guarantees the input is closed even if processing fails.
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        # Never close sys.stdout; only close a file this function opened.
        if close_outfp:
            outfp.close()
294
+
295
+
296
+
297
# Script entry point: run the converter only when this file is executed
# directly, and hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)
300
+
301
+
302
+
303
+ ```