質問編集履歴

2

テンプレートを使用し、必要な情報を追加しました

2019/09/23 12:33

投稿

bing
bing

スコア13

test CHANGED
File without changes
test CHANGED
@@ -2,44 +2,576 @@
2
2
 
3
3
 
4
4
 
5
+ ### 前提・実現したいこと
6
+
7
+
8
+
5
- スキャンされた書類をpythonを使用して文字を読み取りたいと考えています
9
+ スキャンされた書類(PDF)をpythonを使用して文字を読み取りたいと考えています
6
-
7
-
8
-
10
+
9
- https://blog.mudatobunka.org/entry/2016/11/19/134726
11
+ [リンク内容](https://blog.mudatobunka.org/entry/2016/11/19/134726)
12
+
13
+
14
+
10
-
15
+ 自分のしたいことのほとんどがこちらのコードで実現できそうなので
11
-
12
-
16
+
13
- を実行させていただましたが、
17
+ 勉強させていただこうと思いましたが、エラーが出てしまいます
18
+
19
+
20
+
21
+ エラーの解決策はどのようになるのでしょうか
22
+
23
+
24
+
25
+
26
+
27
+ ### 発生している問題・エラーメッセージ
28
+
29
+
30
+
31
+ ```
32
+
33
+ Traceback (most recent call last):
34
+
35
+ File "c:/Users/Master/Desktop/renamerobot-master/main.py", line 38, in <module>
36
+
37
+ images = extract_images(document)
38
+
39
+ File "c:\Users\Master\Desktop\renamerobot-master\renamerobot\pdf.py", line 34, in extract_images
40
+
41
+ return [to_pil_image(ltImage) for ltImage in contents]
42
+
43
+ File "c:\Users\Master\Desktop\renamerobot-master\renamerobot\pdf.py", line 53, in to_pil_image
44
+
45
+ return Image.open(buffer)
46
+
47
+ File "C:\Python27\lib\site-packages\PIL\Image.py", line 2822, in open
48
+
49
+ raise IOError("cannot identify image file %r" % (filename if filename else fp))
50
+
51
+ IOError: cannot identify image file <StringIO.StringIO instance at 0x000000000316F708>
52
+
53
+ ```
54
+
55
+
56
+
57
+ ### 該当のソースコード
58
+
59
+
14
60
 
15
61
  ```python
16
62
 
63
+ #main.py
64
+
65
+ # -*- coding: utf-8 -*-
66
+
67
+
68
+
69
+ from __future__ import print_function
70
+
71
+
72
+
73
+ from glob import glob
74
+
75
+ import re
76
+
77
+ import os
78
+
79
+ import shutil
80
+
81
+ from datetime import datetime
82
+
83
+
84
+
85
+ from renamerobot.util import load_pdf
86
+
87
+ from renamerobot.pdf import extract_images
88
+
89
+ from renamerobot.ocr import read_ordernum, read_date
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+ if not os.path.isdir('BEFORE'):
98
+
99
+ os.mkdir('BEFORE')
100
+
101
+ if not os.path.isdir('AFTER'):
102
+
103
+ os.mkdir('AFTER')
104
+
105
+ if not os.path.isdir(u'読み取り失敗'):
106
+
107
+ os.mkdir(u'読み取り失敗')
108
+
109
+
110
+
111
+
112
+
113
+ pdfs = glob('BEFORE/*.pdf')
114
+
115
+ pdfs_count = len(pdfs)
116
+
117
+ # print(pdfs)
118
+
119
+
120
+
17
- File "C:\Python27\lib\site-packages\PIL\Image.py", line 2822, in open
121
+ re_pdfname = re.compile(r'(\|/)(?P<name>.+)$')
122
+
123
+
124
+
18
-
125
+ unrenamed = []
126
+
127
+
128
+
129
+ for i, pdf in enumerate(pdfs):
130
+
131
+ print(u'{0}/{1} 読み取り中 ...'.format(i+1, pdfs_count))
132
+
133
+
134
+
135
+ document = load_pdf(pdf)
136
+
137
+
138
+
139
+ images = extract_images(document)
140
+
141
+ ordernum = read_ordernum(images)
142
+
143
+ date = read_date(images)
144
+
145
+
146
+
147
+ m = re_pdfname.search(pdf)
148
+
149
+ before = m and m.group('name')
150
+
151
+ print(u'リネーム前:', before)
152
+
153
+
154
+
155
+ if ordernum is None:
156
+
157
+ print(u' !!受注番号の読み取りに失敗', end='\n\n')
158
+
159
+ unrenamed.append(before)
160
+
161
+ shutil.copy(pdf, u'読み取り失敗/')
162
+
163
+
164
+
165
+ elif date is None:
166
+
167
+ print(u' !!日付の読み取りに失敗', end='\n\n')
168
+
169
+ unrenamed.append(before)
170
+
171
+ shutil.copy(pdf, u'読み取り失敗/')
172
+
173
+
174
+
175
+ else:
176
+
177
+ after = '{0[ordernum]}_{1[year]:0>4}{1[month]:0>2}{1[day]:0>2}.pdf'.format(ordernum, date)
178
+
179
+ print(u'リネーム後:', after, end='\n\n')
180
+
181
+
182
+
183
+ shutil.copyfile(pdf, 'AFTER/'+after)
184
+
185
+
186
+
187
+ if 0 != len(unrenamed):
188
+
189
+ nowstr = datetime.now().strftime('%Y%m%d_%H%M%S')
190
+
19
- raise IOError("cannot identify image file %r" % (filename if filename else fp))
191
+ with open(u'{1}_【{0}件のリネームできなかったファイル】.txt'.format(len(unrenamed), nowstr), 'w') as f:
20
-
192
+
21
- IOError: cannot identify image file <StringIO.StringIO instance at 0x000000000308D608>
193
+ f.write('\n'.join(unrenamed))
194
+
195
+
22
196
 
23
197
  ```
24
198
 
25
-
26
-
27
-
28
-
29
- がでて終了します
30
-
31
-
32
-
33
- 恐らく
34
-
35
199
  ```python
36
200
 
201
+ #ocr.py
202
+
203
+ # -*- coding: utf-8 -*-
204
+
205
+
206
+
207
+ from __future__ import print_function
208
+
209
+
210
+
211
+ import sys
212
+
213
+ import itertools
214
+
215
+ import re
216
+
217
+
218
+
219
+ import pyocr
220
+
221
+ import pyocr.builders
222
+
223
+ from renamerobot.util import crop
224
+
225
+
226
+
227
+ from PIL import ImageOps
228
+
229
+
230
+
231
+
232
+
233
+ tools = pyocr.get_available_tools()
234
+
235
+ if len(tools) == 0:
236
+
237
+ print("No OCR tool found")
238
+
239
+ sys.exit(1)
240
+
241
+ # The tools are returned in the recommended order of usage
242
+
243
+ tool = tools[0]
244
+
245
+
246
+
247
+
248
+
249
+ ORDERNUM_BOX = (
250
+
251
+ (0.8, 0.11, 0.95, 0.135),
252
+
253
+ (0.8, 0.08, 0.95, 0.105),
254
+
255
+ )
256
+
257
+
258
+
259
+ re_ordernum = re.compile(r'(?P<num>\d{4,})$', re.MULTILINE)
260
+
261
+
262
+
263
+ REPLACE_PAIR_1 = (
264
+
265
+ (u']', u'1'),
266
+
267
+ (u'}', u'1'),
268
+
269
+ (u'ー', u'1'),
270
+
271
+ (u'Z', u'2'),
272
+
273
+ (u'O', u'0'),
274
+
275
+ (u'〇', u'0'),
276
+
277
+ (u'I', u'1'),
278
+
279
+ (u'l', u'1'),
280
+
281
+ )
282
+
283
+
284
+
285
+ def read_ordernum(images):
286
+
287
+ for image, box in itertools.product(images, ORDERNUM_BOX):
288
+
289
+ image = crop(image, box)
290
+
291
+ image = ImageOps.grayscale(image)
292
+
293
+ # image = resize(image, height=80)
294
+
295
+ # image = erode(image)
296
+
297
+
298
+
299
+ txt = tool.image_to_string(
300
+
301
+ image,
302
+
303
+ lang='eng',
304
+
305
+ builder=pyocr.builders.TextBuilder(tesseract_layout=7)
306
+
307
+ )
308
+
309
+
310
+
311
+ for before, after in REPLACE_PAIR_1:
312
+
313
+ txt = txt.replace(before, after)
314
+
315
+ txt = re.sub(r'\s+', '', txt)
316
+
317
+
318
+
319
+ # try:
320
+
321
+ # print('OCR:')
322
+
323
+ # print(txt)
324
+
325
+ # except Exception as e:
326
+
327
+ # print(e)
328
+
329
+
330
+
331
+ result = re_ordernum.search(txt)
332
+
333
+
334
+
335
+ if result is not None:
336
+
337
+ return {
338
+
339
+ 'ordernum': result.group('num'),
340
+
341
+ }
342
+
343
+
344
+
345
+ return None
346
+
347
+
348
+
349
+
350
+
351
+ DATE_BOX = (
352
+
353
+ (0.8, 0.078, 0.95, 0.1),
354
+
355
+ (0.8, 0.06, 0.95, 0.08),
356
+
357
+ )
358
+
359
+
360
+
361
+ re_date = re.compile(u'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日', re.MULTILINE)
362
+
363
+
364
+
365
+ REPLACE_PAIR_2 = (
366
+
367
+ (u']', u'1'),
368
+
369
+ (u'}', u'1'),
370
+
371
+ (u'ー', u'1'),
372
+
373
+ (u'仔', u'年'),
374
+
375
+ (u'El', u'日'),
376
+
377
+ (u'E|', u'日'),
378
+
379
+ (u'E', u'日'),
380
+
381
+ (u'□', u'日'),
382
+
383
+ (u'口', u'日'),
384
+
385
+ (u'曰', u'日'),
386
+
387
+ (u'Z', u'2'),
388
+
389
+ (u'O', u'0'),
390
+
391
+ (u'〇', u'0'),
392
+
393
+ (u'I', u'1'),
394
+
395
+ (u'l', u'1'),
396
+
397
+ )
398
+
399
+
400
+
401
+ def read_date(images):
402
+
403
+ for image, box in itertools.product(images, DATE_BOX):
404
+
405
+ image = crop(image, box)
406
+
407
+ image = ImageOps.grayscale(image)
408
+
409
+ # image = resize(image, height=80)
410
+
411
+ # image = erode(image)
412
+
413
+
414
+
415
+ txt = tool.image_to_string(
416
+
417
+ image,
418
+
419
+ lang='jpn+eng',
420
+
421
+ builder=pyocr.builders.TextBuilder(tesseract_layout=6)
422
+
423
+ )
424
+
425
+
426
+
427
+ for before, after in REPLACE_PAIR_2:
428
+
429
+ txt = txt.replace(before, after)
430
+
431
+ txt = re.sub(r'\s+', '', txt)
432
+
433
+
434
+
435
+ # try:
436
+
437
+ # print('OCR:')
438
+
439
+ # print(txt)
440
+
441
+ # except Exception as e:
442
+
443
+ # print(e)
444
+
445
+
446
+
447
+ result = re_date.search(txt)
448
+
449
+
450
+
451
+ if result is not None:
452
+
453
+ return {
454
+
455
+ 'year': result.group('year'),
456
+
457
+ 'month': result.group('month'),
458
+
459
+ 'day': result.group('day'),
460
+
461
+ }
462
+
463
+
464
+
465
+ return None
466
+
467
+
468
+
469
+
470
+
471
+ ```
472
+
473
+ ```python
474
+
475
+ #pdf.py
476
+
477
+ # -*- coding: utf-8 -*-
478
+
479
+
480
+
481
+ from __future__ import print_function
482
+
483
+
484
+
485
+ import StringIO
486
+
487
+
488
+
489
+ from pdfminer.pdfpage import PDFPage
490
+
491
+ from pdfminer.pdfinterp import PDFResourceManager
492
+
493
+ from pdfminer.pdfinterp import PDFPageInterpreter
494
+
495
+ from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTImage, LTFigure
496
+
497
+ from pdfminer.converter import PDFPageAggregator
498
+
499
+
500
+
501
+ from PIL import Image
502
+
503
+
504
+
505
+
506
+
507
+
508
+
509
+ def extract_images(document):
510
+
511
+ # Create a PDF resource manager object that stores shared resources.
512
+
513
+ rsrcmgr = PDFResourceManager()
514
+
515
+ # Create a PDF device object.
516
+
517
+ device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
518
+
519
+ # Create a PDF interpreter object.
520
+
521
+ interpreter = PDFPageInterpreter(rsrcmgr, device)
522
+
523
+
524
+
525
+ contents = []
526
+
527
+
528
+
529
+ for page in PDFPage.create_pages(document):
530
+
531
+ interpreter.process_page(page)
532
+
533
+ layout = device.get_result()
534
+
535
+ # print(layout)
536
+
537
+
538
+
539
+ contents.extend(travarse(layout))
540
+
541
+
542
+
543
+ return [to_pil_image(ltImage) for ltImage in contents]
544
+
545
+
546
+
547
+
548
+
549
+ def travarse(layout):
550
+
551
+ images = []
552
+
553
+
554
+
555
+ for obj in layout:
556
+
557
+ if isinstance(obj, LTTextBox) or isinstance(obj, LTTextLine) or isinstance(obj, LTFigure):
558
+
559
+ images.extend(travarse(obj))
560
+
561
+
562
+
563
+ elif isinstance(obj, LTImage):
564
+
565
+ images.append(obj)
566
+
567
+
568
+
569
+ return images
570
+
571
+
572
+
37
573
  def to_pil_image(ltImage):
38
574
 
39
- """Raw Binary を Image オブジェクトに変換"""
40
-
41
-
42
-
43
575
  buffer = StringIO.StringIO()
44
576
 
45
577
  buffer.write(ltImage.stream.get_rawdata())
@@ -48,26 +580,138 @@
48
580
 
49
581
  return Image.open(buffer)
50
582
 
583
+
584
+
585
+
586
+
51
587
  ```
52
588
 
53
-
54
-
55
- の部分に問題があると思うのですが、pythonを初めて使用するため改善の方法がわかりません
56
-
57
-
58
-
59
- アドバイスをお願いします
60
-
61
-
62
-
63
- 環境;
64
-
65
- win10
66
-
67
- python 2.7.15
68
-
69
- vs code
70
-
71
-
72
-
73
- よろしくお願いします
589
+ ```python
590
+
591
+ #util.py
592
+
593
+ # -*- coding: utf-8 -*-
594
+
595
+
596
+
597
+ from __future__ import print_function
598
+
599
+
600
+
601
+ from pdfminer.pdfparser import PDFParser
602
+
603
+ from pdfminer.pdfdocument import PDFDocument
604
+
605
+ from pdfminer.pdfdocument import PDFTextExtractionNotAllowed
606
+
607
+
608
+
609
+ # import cv2
610
+
611
+ # import numpy as np
612
+
613
+ from PIL import Image
614
+
615
+
616
+
617
+
618
+
619
+
620
+
621
+ def load_pdf(filename, password=None):
622
+
623
+ """Open a PDF file."""
624
+
625
+ fp = open(filename, 'rb')
626
+
627
+
628
+
629
+ # Create a PDF parser object associated with the file object.
630
+
631
+ parser = PDFParser(fp)
632
+
633
+
634
+
635
+ # Create a PDF document object that stores the document structure.
636
+
637
+ # Supply the password for initialization.
638
+
639
+ document = PDFDocument(parser, password)
640
+
641
+
642
+
643
+ # Check if the document allows text extraction. If not, abort.
644
+
645
+ if not document.is_extractable:
646
+
647
+ raise PDFTextExtractionNotAllowed
648
+
649
+
650
+
651
+ return document
652
+
653
+
654
+
655
+
656
+
657
+ def crop(image, ratio_box):
658
+
659
+ width, height = image.size
660
+
661
+ left, upper, right, lower = ratio_box
662
+
663
+
664
+
665
+ return image.crop((
666
+
667
+ left * width,
668
+
669
+ upper * height,
670
+
671
+ right * width,
672
+
673
+ lower * height,
674
+
675
+ ))
676
+
677
+
678
+
679
+ ```
680
+
681
+
682
+
683
+ ### 試したこと
684
+
685
+
686
+
687
+ VS CODEを使用してPYTHON 2.7.15で上記を実行しようとしました
688
+
689
+
690
+
691
+ ### 補足情報(FW/ツールのバージョンなど)
692
+
693
+ WIN 10
694
+
695
+ ライブラリ:
696
+
697
+ future==0.16.0
698
+
699
+ pdfminer==20140328
700
+
701
+ pefile==2016.3.28
702
+
703
+ Pillow==3.4.2
704
+
705
+ PyInstaller==3.2
706
+
707
+ pyocr==0.4.2
708
+
709
+ pypiwin32==219
710
+
711
+ six==1.10.0
712
+
713
+
714
+
715
+ OCR:
716
+
717
+ Tesseract

1

誤字がありましたので訂正しました

2019/09/23 12:33

投稿

bing
bing

スコア13

test CHANGED
File without changes
test CHANGED
@@ -52,7 +52,7 @@
52
52
 
53
53
 
54
54
 
55
- の部分問題がると思うのですが、pythonを初めて使用するため改善の方法がわかりません
55
+ の部分に問題があると思うのですが、pythonを初めて使用するため改善の方法がわかりません
56
56
 
57
57
 
58
58
 
@@ -62,6 +62,8 @@
62
62
 
63
63
  環境;
64
64
 
65
+ win10
66
+
65
67
  python 2.7.15
66
68
 
67
69
  vs code