質問編集履歴
2
テンプレートを使用し、必要な情報を追加しました
test
CHANGED
File without changes
|
test
CHANGED
@@ -2,44 +2,576 @@
|
|
2
2
|
|
3
3
|
|
4
4
|
|
5
|
+
### 前提・実現したいこと
|
6
|
+
|
7
|
+
|
8
|
+
|
5
|
-
スキャンされた書類をpythonを使用して文字を読み取りたいと考えています
|
9
|
+
スキャンされた書類(PDF)から、Pythonを使用して文字を読み取りたいと考えています
|
6
|
-
|
7
|
-
|
8
|
-
|
10
|
+
|
9
|
-
https://blog.mudatobunka.org/entry/2016/11/19/134726
|
11
|
+
[参考にしたブログ記事](https://blog.mudatobunka.org/entry/2016/11/19/134726)
|
12
|
+
|
13
|
+
|
14
|
+
|
10
|
-
|
15
|
+
自分のしたいことのほとんどがこちらのコードで実現できそうなので
|
11
|
-
|
12
|
-
|
16
|
+
|
13
|
-
|
17
|
+
勉強させていただこうと思いましたが、エラーが出てしまいます
|
18
|
+
|
19
|
+
|
20
|
+
|
21
|
+
このエラーはどのように解決すればよいのでしょうか
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
### 発生している問題・エラーメッセージ
|
28
|
+
|
29
|
+
|
30
|
+
|
31
|
+
```
|
32
|
+
|
33
|
+
Traceback (most recent call last):
|
34
|
+
|
35
|
+
File "c:/Users/Master/Desktop/renamerobot-master/main.py", line 38, in <module>
|
36
|
+
|
37
|
+
images = extract_images(document)
|
38
|
+
|
39
|
+
File "c:\Users\Master\Desktop\renamerobot-master\renamerobot\pdf.py", line 34, in extract_images
|
40
|
+
|
41
|
+
return [to_pil_image(ltImage) for ltImage in contents]
|
42
|
+
|
43
|
+
File "c:\Users\Master\Desktop\renamerobot-master\renamerobot\pdf.py", line 53, in to_pil_image
|
44
|
+
|
45
|
+
return Image.open(buffer)
|
46
|
+
|
47
|
+
File "C:\Python27\lib\site-packages\PIL\Image.py", line 2822, in open
|
48
|
+
|
49
|
+
raise IOError("cannot identify image file %r" % (filename if filename else fp))
|
50
|
+
|
51
|
+
IOError: cannot identify image file <StringIO.StringIO instance at 0x000000000316F708>
|
52
|
+
|
53
|
+
```
|
54
|
+
|
55
|
+
|
56
|
+
|
57
|
+
### 該当のソースコード
|
58
|
+
|
59
|
+
|
14
60
|
|
15
61
|
```python
|
16
62
|
|
63
|
+
#main.py
|
64
|
+
|
65
|
+
# -*- coding: utf-8 -*-
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
from __future__ import print_function
|
70
|
+
|
71
|
+
|
72
|
+
|
73
|
+
from glob import glob
|
74
|
+
|
75
|
+
import re
|
76
|
+
|
77
|
+
import os
|
78
|
+
|
79
|
+
import shutil
|
80
|
+
|
81
|
+
from datetime import datetime
|
82
|
+
|
83
|
+
|
84
|
+
|
85
|
+
from renamerobot.util import load_pdf
|
86
|
+
|
87
|
+
from renamerobot.pdf import extract_images
|
88
|
+
|
89
|
+
from renamerobot.ocr import read_ordernum, read_date
|
90
|
+
|
91
|
+
|
92
|
+
|
93
|
+
|
94
|
+
|
95
|
+
|
96
|
+
|
97
|
+
if not os.path.isdir('BEFORE'):
|
98
|
+
|
99
|
+
os.mkdir('BEFORE')
|
100
|
+
|
101
|
+
if not os.path.isdir('AFTER'):
|
102
|
+
|
103
|
+
os.mkdir('AFTER')
|
104
|
+
|
105
|
+
if not os.path.isdir(u'読み取り失敗'):
|
106
|
+
|
107
|
+
os.mkdir(u'読み取り失敗')
|
108
|
+
|
109
|
+
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
pdfs = glob('BEFORE/*.pdf')
|
114
|
+
|
115
|
+
pdfs_count = len(pdfs)
|
116
|
+
|
117
|
+
# print(pdfs)
|
118
|
+
|
119
|
+
|
120
|
+
|
17
|
-
|
121
|
+
re_pdfname = re.compile(r'(\|/)(?P<name>.+)$')
|
122
|
+
|
123
|
+
|
124
|
+
|
18
|
-
|
125
|
+
unrenamed = []
|
126
|
+
|
127
|
+
|
128
|
+
|
129
|
+
for i, pdf in enumerate(pdfs):
|
130
|
+
|
131
|
+
print(u'{0}/{1} 読み取り中 ...'.format(i+1, pdfs_count))
|
132
|
+
|
133
|
+
|
134
|
+
|
135
|
+
document = load_pdf(pdf)
|
136
|
+
|
137
|
+
|
138
|
+
|
139
|
+
images = extract_images(document)
|
140
|
+
|
141
|
+
ordernum = read_ordernum(images)
|
142
|
+
|
143
|
+
date = read_date(images)
|
144
|
+
|
145
|
+
|
146
|
+
|
147
|
+
m = re_pdfname.search(pdf)
|
148
|
+
|
149
|
+
before = m and m.group('name')
|
150
|
+
|
151
|
+
print(u'リネーム前:', before)
|
152
|
+
|
153
|
+
|
154
|
+
|
155
|
+
if ordernum is None:
|
156
|
+
|
157
|
+
print(u' !!受注番号の読み取りに失敗', end='\n\n')
|
158
|
+
|
159
|
+
unrenamed.append(before)
|
160
|
+
|
161
|
+
shutil.copy(pdf, u'読み取り失敗/')
|
162
|
+
|
163
|
+
|
164
|
+
|
165
|
+
elif date is None:
|
166
|
+
|
167
|
+
print(u' !!日付の読み取りに失敗', end='\n\n')
|
168
|
+
|
169
|
+
unrenamed.append(before)
|
170
|
+
|
171
|
+
shutil.copy(pdf, u'読み取り失敗/')
|
172
|
+
|
173
|
+
|
174
|
+
|
175
|
+
else:
|
176
|
+
|
177
|
+
after = '{0[ordernum]}_{1[year]:0>4}{1[month]:0>2}{1[day]:0>2}.pdf'.format(ordernum, date)
|
178
|
+
|
179
|
+
print(u'リネーム後:', after, end='\n\n')
|
180
|
+
|
181
|
+
|
182
|
+
|
183
|
+
shutil.copyfile(pdf, 'AFTER/'+after)
|
184
|
+
|
185
|
+
|
186
|
+
|
187
|
+
if 0 != len(unrenamed):
|
188
|
+
|
189
|
+
nowstr = datetime.now().strftime('%Y%m%d_%H%M%S')
|
190
|
+
|
19
|
-
|
191
|
+
with open(u'{1}_【{0}件のリネームできなかったファイル】.txt'.format(len(unrenamed), nowstr), 'w') as f:
|
20
|
-
|
192
|
+
|
21
|
-
|
193
|
+
f.write('\n'.join(unrenamed))
|
194
|
+
|
195
|
+
|
22
196
|
|
23
197
|
```
|
24
198
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
がでて終了します
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
恐らく
|
34
|
-
|
35
199
|
```python
|
36
200
|
|
201
|
+
#ocr.py
|
202
|
+
|
203
|
+
# -*- coding: utf-8 -*-
|
204
|
+
|
205
|
+
|
206
|
+
|
207
|
+
from __future__ import print_function
|
208
|
+
|
209
|
+
|
210
|
+
|
211
|
+
import sys
|
212
|
+
|
213
|
+
import itertools
|
214
|
+
|
215
|
+
import re
|
216
|
+
|
217
|
+
|
218
|
+
|
219
|
+
import pyocr
|
220
|
+
|
221
|
+
import pyocr.builders
|
222
|
+
|
223
|
+
from renamerobot.util import crop
|
224
|
+
|
225
|
+
|
226
|
+
|
227
|
+
from PIL import ImageOps
|
228
|
+
|
229
|
+
|
230
|
+
|
231
|
+
|
232
|
+
|
233
|
+
tools = pyocr.get_available_tools()
|
234
|
+
|
235
|
+
if len(tools) == 0:
|
236
|
+
|
237
|
+
print("No OCR tool found")
|
238
|
+
|
239
|
+
sys.exit(1)
|
240
|
+
|
241
|
+
# The tools are returned in the recommended order of usage
|
242
|
+
|
243
|
+
tool = tools[0]
|
244
|
+
|
245
|
+
|
246
|
+
|
247
|
+
|
248
|
+
|
249
|
+
ORDERNUM_BOX = (
|
250
|
+
|
251
|
+
(0.8, 0.11, 0.95, 0.135),
|
252
|
+
|
253
|
+
(0.8, 0.08, 0.95, 0.105),
|
254
|
+
|
255
|
+
)
|
256
|
+
|
257
|
+
|
258
|
+
|
259
|
+
re_ordernum = re.compile(r'(?P<num>\d{4,})$', re.MULTILINE)
|
260
|
+
|
261
|
+
|
262
|
+
|
263
|
+
REPLACE_PAIR_1 = (
|
264
|
+
|
265
|
+
(u']', u'1'),
|
266
|
+
|
267
|
+
(u'}', u'1'),
|
268
|
+
|
269
|
+
(u'ー', u'1'),
|
270
|
+
|
271
|
+
(u'Z', u'2'),
|
272
|
+
|
273
|
+
(u'O', u'0'),
|
274
|
+
|
275
|
+
(u'〇', u'0'),
|
276
|
+
|
277
|
+
(u'I', u'1'),
|
278
|
+
|
279
|
+
(u'l', u'1'),
|
280
|
+
|
281
|
+
)
|
282
|
+
|
283
|
+
|
284
|
+
|
285
|
+
def read_ordernum(images):
|
286
|
+
|
287
|
+
for image, box in itertools.product(images, ORDERNUM_BOX):
|
288
|
+
|
289
|
+
image = crop(image, box)
|
290
|
+
|
291
|
+
image = ImageOps.grayscale(image)
|
292
|
+
|
293
|
+
# image = resize(image, height=80)
|
294
|
+
|
295
|
+
# image = erode(image)
|
296
|
+
|
297
|
+
|
298
|
+
|
299
|
+
txt = tool.image_to_string(
|
300
|
+
|
301
|
+
image,
|
302
|
+
|
303
|
+
lang='eng',
|
304
|
+
|
305
|
+
builder=pyocr.builders.TextBuilder(tesseract_layout=7)
|
306
|
+
|
307
|
+
)
|
308
|
+
|
309
|
+
|
310
|
+
|
311
|
+
for before, after in REPLACE_PAIR_1:
|
312
|
+
|
313
|
+
txt = txt.replace(before, after)
|
314
|
+
|
315
|
+
txt = re.sub(r'\s+', '', txt)
|
316
|
+
|
317
|
+
|
318
|
+
|
319
|
+
# try:
|
320
|
+
|
321
|
+
# print('OCR:')
|
322
|
+
|
323
|
+
# print(txt)
|
324
|
+
|
325
|
+
# except Exception as e:
|
326
|
+
|
327
|
+
# print(e)
|
328
|
+
|
329
|
+
|
330
|
+
|
331
|
+
result = re_ordernum.search(txt)
|
332
|
+
|
333
|
+
|
334
|
+
|
335
|
+
if result is not None:
|
336
|
+
|
337
|
+
return {
|
338
|
+
|
339
|
+
'ordernum': result.group('num'),
|
340
|
+
|
341
|
+
}
|
342
|
+
|
343
|
+
|
344
|
+
|
345
|
+
return None
|
346
|
+
|
347
|
+
|
348
|
+
|
349
|
+
|
350
|
+
|
351
|
+
DATE_BOX = (
|
352
|
+
|
353
|
+
(0.8, 0.078, 0.95, 0.1),
|
354
|
+
|
355
|
+
(0.8, 0.06, 0.95, 0.08),
|
356
|
+
|
357
|
+
)
|
358
|
+
|
359
|
+
|
360
|
+
|
361
|
+
re_date = re.compile(u'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日', re.MULTILINE)
|
362
|
+
|
363
|
+
|
364
|
+
|
365
|
+
REPLACE_PAIR_2 = (
|
366
|
+
|
367
|
+
(u']', u'1'),
|
368
|
+
|
369
|
+
(u'}', u'1'),
|
370
|
+
|
371
|
+
(u'ー', u'1'),
|
372
|
+
|
373
|
+
(u'仔', u'年'),
|
374
|
+
|
375
|
+
(u'El', u'日'),
|
376
|
+
|
377
|
+
(u'E|', u'日'),
|
378
|
+
|
379
|
+
(u'E', u'日'),
|
380
|
+
|
381
|
+
(u'□', u'日'),
|
382
|
+
|
383
|
+
(u'口', u'日'),
|
384
|
+
|
385
|
+
(u'曰', u'日'),
|
386
|
+
|
387
|
+
(u'Z', u'2'),
|
388
|
+
|
389
|
+
(u'O', u'0'),
|
390
|
+
|
391
|
+
(u'〇', u'0'),
|
392
|
+
|
393
|
+
(u'I', u'1'),
|
394
|
+
|
395
|
+
(u'l', u'1'),
|
396
|
+
|
397
|
+
)
|
398
|
+
|
399
|
+
|
400
|
+
|
401
|
+
def read_date(images):
|
402
|
+
|
403
|
+
for image, box in itertools.product(images, DATE_BOX):
|
404
|
+
|
405
|
+
image = crop(image, box)
|
406
|
+
|
407
|
+
image = ImageOps.grayscale(image)
|
408
|
+
|
409
|
+
# image = resize(image, height=80)
|
410
|
+
|
411
|
+
# image = erode(image)
|
412
|
+
|
413
|
+
|
414
|
+
|
415
|
+
txt = tool.image_to_string(
|
416
|
+
|
417
|
+
image,
|
418
|
+
|
419
|
+
lang='jpn+eng',
|
420
|
+
|
421
|
+
builder=pyocr.builders.TextBuilder(tesseract_layout=6)
|
422
|
+
|
423
|
+
)
|
424
|
+
|
425
|
+
|
426
|
+
|
427
|
+
for before, after in REPLACE_PAIR_2:
|
428
|
+
|
429
|
+
txt = txt.replace(before, after)
|
430
|
+
|
431
|
+
txt = re.sub(r'\s+', '', txt)
|
432
|
+
|
433
|
+
|
434
|
+
|
435
|
+
# try:
|
436
|
+
|
437
|
+
# print('OCR:')
|
438
|
+
|
439
|
+
# print(txt)
|
440
|
+
|
441
|
+
# except Exception as e:
|
442
|
+
|
443
|
+
# print(e)
|
444
|
+
|
445
|
+
|
446
|
+
|
447
|
+
result = re_date.search(txt)
|
448
|
+
|
449
|
+
|
450
|
+
|
451
|
+
if result is not None:
|
452
|
+
|
453
|
+
return {
|
454
|
+
|
455
|
+
'year': result.group('year'),
|
456
|
+
|
457
|
+
'month': result.group('month'),
|
458
|
+
|
459
|
+
'day': result.group('day'),
|
460
|
+
|
461
|
+
}
|
462
|
+
|
463
|
+
|
464
|
+
|
465
|
+
return None
|
466
|
+
|
467
|
+
|
468
|
+
|
469
|
+
|
470
|
+
|
471
|
+
```
|
472
|
+
|
473
|
+
```python
|
474
|
+
|
475
|
+
#pdf.py
|
476
|
+
|
477
|
+
# -*- coding: utf-8 -*-
|
478
|
+
|
479
|
+
|
480
|
+
|
481
|
+
from __future__ import print_function
|
482
|
+
|
483
|
+
|
484
|
+
|
485
|
+
import StringIO
|
486
|
+
|
487
|
+
|
488
|
+
|
489
|
+
from pdfminer.pdfpage import PDFPage
|
490
|
+
|
491
|
+
from pdfminer.pdfinterp import PDFResourceManager
|
492
|
+
|
493
|
+
from pdfminer.pdfinterp import PDFPageInterpreter
|
494
|
+
|
495
|
+
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTImage, LTFigure
|
496
|
+
|
497
|
+
from pdfminer.converter import PDFPageAggregator
|
498
|
+
|
499
|
+
|
500
|
+
|
501
|
+
from PIL import Image
|
502
|
+
|
503
|
+
|
504
|
+
|
505
|
+
|
506
|
+
|
507
|
+
|
508
|
+
|
509
|
+
def extract_images(document):
|
510
|
+
|
511
|
+
# Create a PDF resource manager object that stores shared resources.
|
512
|
+
|
513
|
+
rsrcmgr = PDFResourceManager()
|
514
|
+
|
515
|
+
# Create a PDF device object.
|
516
|
+
|
517
|
+
device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
|
518
|
+
|
519
|
+
# Create a PDF interpreter object.
|
520
|
+
|
521
|
+
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
522
|
+
|
523
|
+
|
524
|
+
|
525
|
+
contents = []
|
526
|
+
|
527
|
+
|
528
|
+
|
529
|
+
for page in PDFPage.create_pages(document):
|
530
|
+
|
531
|
+
interpreter.process_page(page)
|
532
|
+
|
533
|
+
layout = device.get_result()
|
534
|
+
|
535
|
+
# print(layout)
|
536
|
+
|
537
|
+
|
538
|
+
|
539
|
+
contents.extend(travarse(layout))
|
540
|
+
|
541
|
+
|
542
|
+
|
543
|
+
return [to_pil_image(ltImage) for ltImage in contents]
|
544
|
+
|
545
|
+
|
546
|
+
|
547
|
+
|
548
|
+
|
549
|
+
def travarse(layout):
|
550
|
+
|
551
|
+
images = []
|
552
|
+
|
553
|
+
|
554
|
+
|
555
|
+
for obj in layout:
|
556
|
+
|
557
|
+
if isinstance(obj, LTTextBox) or isinstance(obj, LTTextLine) or isinstance(obj, LTFigure):
|
558
|
+
|
559
|
+
images.extend(travarse(obj))
|
560
|
+
|
561
|
+
|
562
|
+
|
563
|
+
elif isinstance(obj, LTImage):
|
564
|
+
|
565
|
+
images.append(obj)
|
566
|
+
|
567
|
+
|
568
|
+
|
569
|
+
return images
|
570
|
+
|
571
|
+
|
572
|
+
|
37
573
|
def to_pil_image(ltImage):
|
38
574
|
|
39
|
-
"""Raw Binary を Image オブジェクトに変換"""
|
40
|
-
|
41
|
-
|
42
|
-
|
43
575
|
buffer = StringIO.StringIO()
|
44
576
|
|
45
577
|
buffer.write(ltImage.stream.get_rawdata())
|
@@ -48,26 +580,138 @@
|
|
48
580
|
|
49
581
|
return Image.open(buffer)
|
50
582
|
|
583
|
+
|
584
|
+
|
585
|
+
|
586
|
+
|
51
587
|
```
|
52
588
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
p
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
589
|
+
```python
|
590
|
+
|
591
|
+
#util.py
|
592
|
+
|
593
|
+
# -*- coding: utf-8 -*-
|
594
|
+
|
595
|
+
|
596
|
+
|
597
|
+
from __future__ import print_function
|
598
|
+
|
599
|
+
|
600
|
+
|
601
|
+
from pdfminer.pdfparser import PDFParser
|
602
|
+
|
603
|
+
from pdfminer.pdfdocument import PDFDocument
|
604
|
+
|
605
|
+
from pdfminer.pdfdocument import PDFTextExtractionNotAllowed
|
606
|
+
|
607
|
+
|
608
|
+
|
609
|
+
# import cv2
|
610
|
+
|
611
|
+
# import numpy as np
|
612
|
+
|
613
|
+
from PIL import Image
|
614
|
+
|
615
|
+
|
616
|
+
|
617
|
+
|
618
|
+
|
619
|
+
|
620
|
+
|
621
|
+
def load_pdf(filename, password=None):
|
622
|
+
|
623
|
+
"""Open a PDF file."""
|
624
|
+
|
625
|
+
fp = open(filename, 'rb')
|
626
|
+
|
627
|
+
|
628
|
+
|
629
|
+
# Create a PDF parser object associated with the file object.
|
630
|
+
|
631
|
+
parser = PDFParser(fp)
|
632
|
+
|
633
|
+
|
634
|
+
|
635
|
+
# Create a PDF document object that stores the document structure.
|
636
|
+
|
637
|
+
# Supply the password for initialization.
|
638
|
+
|
639
|
+
document = PDFDocument(parser, password)
|
640
|
+
|
641
|
+
|
642
|
+
|
643
|
+
# Check if the document allows text extraction. If not, abort.
|
644
|
+
|
645
|
+
if not document.is_extractable:
|
646
|
+
|
647
|
+
raise PDFTextExtractionNotAllowed
|
648
|
+
|
649
|
+
|
650
|
+
|
651
|
+
return document
|
652
|
+
|
653
|
+
|
654
|
+
|
655
|
+
|
656
|
+
|
657
|
+
def crop(image, ratio_box):
|
658
|
+
|
659
|
+
width, height = image.size
|
660
|
+
|
661
|
+
left, upper, right, lower = ratio_box
|
662
|
+
|
663
|
+
|
664
|
+
|
665
|
+
return image.crop((
|
666
|
+
|
667
|
+
left * width,
|
668
|
+
|
669
|
+
upper * height,
|
670
|
+
|
671
|
+
right * width,
|
672
|
+
|
673
|
+
lower * height,
|
674
|
+
|
675
|
+
))
|
676
|
+
|
677
|
+
|
678
|
+
|
679
|
+
```
|
680
|
+
|
681
|
+
|
682
|
+
|
683
|
+
### 試したこと
|
684
|
+
|
685
|
+
|
686
|
+
|
687
|
+
VS Codeを使用して、Python 2.7.15で上記のコードを実行しようとしました
|
688
|
+
|
689
|
+
|
690
|
+
|
691
|
+
### 補足情報(FW/ツールのバージョンなど)
|
692
|
+
|
693
|
+
Windows 10
|
694
|
+
|
695
|
+
ライブラリ:
|
696
|
+
|
697
|
+
future==0.16.0
|
698
|
+
|
699
|
+
pdfminer==20140328
|
700
|
+
|
701
|
+
pefile==2016.3.28
|
702
|
+
|
703
|
+
Pillow==3.4.2
|
704
|
+
|
705
|
+
PyInstaller==3.2
|
706
|
+
|
707
|
+
pyocr==0.4.2
|
708
|
+
|
709
|
+
pypiwin32==219
|
710
|
+
|
711
|
+
six==1.10.0
|
712
|
+
|
713
|
+
|
714
|
+
|
715
|
+
OCR:
|
716
|
+
|
717
|
+
Tesseract
|
1
誤字がありましたので訂正しました
test
CHANGED
File without changes
|
test
CHANGED
@@ -52,7 +52,7 @@
|
|
52
52
|
|
53
53
|
|
54
54
|
|
55
|
-
の部分
|
55
|
+
の部分に問題があると思うのですが、Pythonを使うのは初めてのため、改善方法がわかりません
|
56
56
|
|
57
57
|
|
58
58
|
|
@@ -62,6 +62,8 @@
|
|
62
62
|
|
63
63
|
環境:
|
64
64
|
|
65
|
+
Windows 10
|
66
|
+
|
65
67
|
python 2.7.15
|
66
68
|
|
67
69
|
vs code
|