Pythonでpdfからの文字の読み取り

お世話になります

前提・実現したいこと

スキャンされた書類(PDF)をpythonを使用して文字を読み取りたいと考えています
リンク内容

自分のしたいことのほとんどがこちらのコードで実現できそうなので
勉強させていただこうと思いましたが、エラーが出てしまいます

エラーの解決策はどのようになるのでしょうか

発生している問題・エラーメッセージ

Traceback (most recent call last):
  File "c:/Users/Master/Desktop/renamerobot-master/main.py", line 38, in <module>
    images = extract_images(document)
  File "c:\Users\Master\Desktop\renamerobot-master\renamerobot\pdf.py", line 34, in extract_images
    return [to_pil_image(ltImage) for ltImage in contents]
  File "c:\Users\Master\Desktop\renamerobot-master\renamerobot\pdf.py", line 53, in to_pil_image
    return Image.open(buffer)
  File "C:\Python27\lib\site-packages\PIL\Image.py", line 2822, in open
    raise IOError("cannot identify image file %r" % (filename if filename else fp))
IOError: cannot identify image file <StringIO.StringIO instance at 0x000000000316F708>

該当のソースコード

python
1#main.py
2# -*- coding: utf-8 -*-
3
4from __future__ import print_function
5
6from glob import glob
7import re
8import os
9import shutil
10from datetime import datetime
11
12from renamerobot.util import load_pdf
13from renamerobot.pdf import extract_images
14from renamerobot.ocr import read_ordernum, read_date
15
16
17
18if not os.path.isdir('BEFORE'):
19    os.mkdir('BEFORE')
20if not os.path.isdir('AFTER'):
21    os.mkdir('AFTER')
22if not os.path.isdir(u'読み取り失敗'):
23    os.mkdir(u'読み取り失敗')
24
25
26pdfs = glob('BEFORE/*.pdf')
27pdfs_count = len(pdfs)
28# print(pdfs)
29
30re_pdfname = re.compile(r'(\|/)(?P<name>.+)$')
31
32unrenamed = []
33
34for i, pdf in enumerate(pdfs):
35    print(u'{0}/{1} 読み取り中 ...'.format(i+1, pdfs_count))
36
37    document = load_pdf(pdf)
38
39    images = extract_images(document)
40    ordernum = read_ordernum(images)
41    date = read_date(images)
42
43    m = re_pdfname.search(pdf)
44    before = m and m.group('name')
45    print(u'リネーム前:', before)
46
47    if ordernum is None:
48        print(u'  !!受注番号の読み取りに失敗', end='\n\n')
49        unrenamed.append(before)
50        shutil.copy(pdf, u'読み取り失敗/')
51
52    elif date is None:
53        print(u'  !!日付の読み取りに失敗', end='\n\n')
54        unrenamed.append(before)
55        shutil.copy(pdf, u'読み取り失敗/')
56
57    else:
58        after = '{0[ordernum]}_{1[year]:0>4}{1[month]:0>2}{1[day]:0>2}.pdf'.format(ordernum, date)
59        print(u'リネーム後:', after, end='\n\n')
60
61        shutil.copyfile(pdf, 'AFTER/'+after)
62
63if 0 != len(unrenamed):
64    nowstr = datetime.now().strftime('%Y%m%d_%H%M%S')
65    with open(u'{1}_【{0}件のリネームできなかったファイル】.txt'.format(len(unrenamed), nowstr), 'w') as f:
66        f.write('\n'.join(unrenamed))
67

python
1#ocr.py
2# -*- coding: utf-8 -*-
3
4from __future__ import print_function
5
6import sys
7import itertools
8import re
9
10import pyocr
11import pyocr.builders
12from renamerobot.util import crop
13
14from PIL import ImageOps
15
16
17tools = pyocr.get_available_tools()
18if len(tools) == 0:
19    print("No OCR tool found")
20    sys.exit(1)
21# The tools are returned in the recommended order of usage
22tool = tools[0]
23
24
25ORDERNUM_BOX = (
26    (0.8, 0.11, 0.95, 0.135),
27    (0.8, 0.08, 0.95, 0.105),
28)
29
30re_ordernum = re.compile(r'(?P<num>\d{4,})$', re.MULTILINE)
31
32REPLACE_PAIR_1 = (
33    (u']', u'1'),
34    (u'}', u'1'),
35    (u'ー', u'1'),
36    (u'Z', u'2'),
37    (u'O', u'0'),
38    (u'〇', u'0'),
39    (u'I', u'1'),
40    (u'l', u'1'),
41)
42
43def read_ordernum(images):
44    for image, box in itertools.product(images, ORDERNUM_BOX):
45        image = crop(image, box)
46        image = ImageOps.grayscale(image)
47        # image = resize(image, height=80)
48        # image = erode(image)
49
50        txt = tool.image_to_string(
51            image,
52            lang='eng',
53            builder=pyocr.builders.TextBuilder(tesseract_layout=7)
54        )
55
56        for before, after in REPLACE_PAIR_1:
57            txt = txt.replace(before, after)
58        txt = re.sub(r'\s+', '', txt)
59
60        # try:
61        #     print('OCR:')
62        #     print(txt)
63        # except Exception as e:
64        #     print(e)
65
66        result = re_ordernum.search(txt)
67
68        if result is not None:
69            return {
70                'ordernum': result.group('num'),
71            }
72
73    return None
74
75
76DATE_BOX = (
77    (0.8, 0.078, 0.95, 0.1),
78    (0.8, 0.06, 0.95, 0.08),
79)
80
81re_date = re.compile(u'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日', re.MULTILINE)
82
83REPLACE_PAIR_2 = (
84    (u']', u'1'),
85    (u'}', u'1'),
86    (u'ー', u'1'),
87    (u'仔', u'年'),
88    (u'El', u'日'),
89    (u'E|', u'日'),
90    (u'E', u'日'),
91    (u'□', u'日'),
92    (u'口', u'日'),
93    (u'曰', u'日'),
94    (u'Z', u'2'),
95    (u'O', u'0'),
96    (u'〇', u'0'),
97    (u'I', u'1'),
98    (u'l', u'1'),
99)
100
101def read_date(images):
102    for image, box in itertools.product(images, DATE_BOX):
103        image = crop(image, box)
104        image = ImageOps.grayscale(image)
105        # image = resize(image, height=80)
106        # image = erode(image)
107
108        txt = tool.image_to_string(
109            image,
110            lang='jpn+eng',
111            builder=pyocr.builders.TextBuilder(tesseract_layout=6)
112        )
113
114        for before, after in REPLACE_PAIR_2:
115            txt = txt.replace(before, after)
116        txt = re.sub(r'\s+', '', txt)
117
118        # try:
119        #     print('OCR:')
120        #     print(txt)
121        # except Exception as e:
122        #     print(e)
123
124        result = re_date.search(txt)
125
126        if result is not None:
127            return {
128                'year': result.group('year'),
129                'month': result.group('month'),
130                'day': result.group('day'),
131            }
132
133    return None
134
135

python
1#pdf.py
2# -*- coding: utf-8 -*-
3
4from __future__ import print_function
5
6import StringIO
7
8from pdfminer.pdfpage import PDFPage
9from pdfminer.pdfinterp import PDFResourceManager
10from pdfminer.pdfinterp import PDFPageInterpreter
11from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTImage,  LTFigure
12from pdfminer.converter import PDFPageAggregator
13
14from PIL import Image
15
16
17
18def extract_images(document):
19    # Create a PDF resource manager object that stores shared resources.
20    rsrcmgr = PDFResourceManager()
21    # Create a PDF device object.
22    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
23    # Create a PDF interpreter object.
24    interpreter = PDFPageInterpreter(rsrcmgr, device)
25
26    contents = []
27
28    for page in PDFPage.create_pages(document):
29        interpreter.process_page(page)
30        layout = device.get_result()
31        # print(layout)
32
33        contents.extend(travarse(layout))
34
35    return [to_pil_image(ltImage) for ltImage in contents]
36
37
38def travarse(layout):
39    images = []
40
41    for obj in layout:
42        if isinstance(obj, LTTextBox) or isinstance(obj, LTTextLine) or isinstance(obj, LTFigure):
43            images.extend(travarse(obj))
44
45        elif isinstance(obj, LTImage):
46            images.append(obj)
47
48    return images
49
50def to_pil_image(ltImage):
51    buffer = StringIO.StringIO()
52    buffer.write(ltImage.stream.get_rawdata())
53    buffer.seek(0)
54    return Image.open(buffer)
55
56

python
1#util.py
2# -*- coding: utf-8 -*-
3
4from __future__ import print_function
5
6from pdfminer.pdfparser import PDFParser
7from pdfminer.pdfdocument import PDFDocument
8from pdfminer.pdfdocument import  PDFTextExtractionNotAllowed
9
10# import cv2
11# import numpy as np
12from PIL import Image
13
14
15
16def load_pdf(filename, password=None):
17    """Open a PDF file."""
18    fp = open(filename, 'rb')
19
20    # Create a PDF parser object associated with the file object.
21    parser = PDFParser(fp)
22
23    # Create a PDF document object that stores the document structure.
24    # Supply the password for initialization.
25    document = PDFDocument(parser, password)
26
27    # Check if the document allows text extraction. If not, abort.
28    if not document.is_extractable:
29        raise PDFTextExtractionNotAllowed
30
31    return document
32
33
34def crop(image, ratio_box):
35    width, height = image.size
36    left, upper, right, lower = ratio_box
37
38    return image.crop((
39        left * width,
40        upper * height,
41        right * width,
42        lower * height,
43    ))
44

試したこと

VS CODEを使用してPYTHON 2.7.15で上記を実行しようとしました

補足情報（FW/ツールのバージョンなど）

WIN 10
ライブラリ：
future==0.16.0
pdfminer==20140328
pefile==2016.3.28
Pillow==3.4.2
PyInstaller==3.2
pyocr==0.4.2
pypiwin32==219
six==1.10.0

OCR:
Tesseract

meg_

2019/09/22 13:01

「line 2822」は上記コードのどこに当たりますか？ ※Python3系を使用された方が良いかと思いますがまた、Pythonが初めてとのことですが、参考にされたサイト等ありましたら情報を追記すると回答がつきやすいかと思います。

bing

2019/09/22 19:17

参考にしているサイトは https://blog.mudatobunka.org/entry/2016/11/19/134726 になります line 2822は恐らく読み込んだpdfの行数(?)になると思われますが違いますでしょうか python3系を使用したいのですが書かれているのがpython2系と思うので2系で実行しています