お世話になります
前提・実現したいこと
スキャンされた書類(PDF)から、Pythonを使用して文字を読み取りたいと考えています
リンク内容
自分のしたいことのほとんどがこちらのコードで実現できそうなので
勉強させていただこうと思いましたが、エラーが出てしまいます
エラーの解決策はどのようになるのでしょうか
発生している問題・エラーメッセージ
Traceback (most recent call last):
  File "c:/Users/Master/Desktop/renamerobot-master/main.py", line 38, in <module>
    images = extract_images(document)
  File "c:\Users\Master\Desktop\renamerobot-master\renamerobot\pdf.py", line 34, in extract_images
    return [to_pil_image(ltImage) for ltImage in contents]
  File "c:\Users\Master\Desktop\renamerobot-master\renamerobot\pdf.py", line 53, in to_pil_image
    return Image.open(buffer)
  File "C:\Python27\lib\site-packages\PIL\Image.py", line 2822, in open
    raise IOError("cannot identify image file %r" % (filename if filename else fp))
IOError: cannot identify image file <StringIO.StringIO instance at 0x000000000316F708>
該当のソースコード
python
1#main.py 2# -*- coding: utf-8 -*- 3 4from __future__ import print_function 5 6from glob import glob 7import re 8import os 9import shutil 10from datetime import datetime 11 12from renamerobot.util import load_pdf 13from renamerobot.pdf import extract_images 14from renamerobot.ocr import read_ordernum, read_date 15 16 17 18if not os.path.isdir('BEFORE'): 19 os.mkdir('BEFORE') 20if not os.path.isdir('AFTER'): 21 os.mkdir('AFTER') 22if not os.path.isdir(u'読み取り失敗'): 23 os.mkdir(u'読み取り失敗') 24 25 26pdfs = glob('BEFORE/*.pdf') 27pdfs_count = len(pdfs) 28# print(pdfs) 29 30re_pdfname = re.compile(r'(\|/)(?P<name>.+)$') 31 32unrenamed = [] 33 34for i, pdf in enumerate(pdfs): 35 print(u'{0}/{1} 読み取り中 ...'.format(i+1, pdfs_count)) 36 37 document = load_pdf(pdf) 38 39 images = extract_images(document) 40 ordernum = read_ordernum(images) 41 date = read_date(images) 42 43 m = re_pdfname.search(pdf) 44 before = m and m.group('name') 45 print(u'リネーム前:', before) 46 47 if ordernum is None: 48 print(u' !!受注番号の読み取りに失敗', end='\n\n') 49 unrenamed.append(before) 50 shutil.copy(pdf, u'読み取り失敗/') 51 52 elif date is None: 53 print(u' !!日付の読み取りに失敗', end='\n\n') 54 unrenamed.append(before) 55 shutil.copy(pdf, u'読み取り失敗/') 56 57 else: 58 after = '{0[ordernum]}_{1[year]:0>4}{1[month]:0>2}{1[day]:0>2}.pdf'.format(ordernum, date) 59 print(u'リネーム後:', after, end='\n\n') 60 61 shutil.copyfile(pdf, 'AFTER/'+after) 62 63if 0 != len(unrenamed): 64 nowstr = datetime.now().strftime('%Y%m%d_%H%M%S') 65 with open(u'{1}_【{0}件のリネームできなかったファイル】.txt'.format(len(unrenamed), nowstr), 'w') as f: 66 f.write('\n'.join(unrenamed)) 67
python
1#ocr.py 2# -*- coding: utf-8 -*- 3 4from __future__ import print_function 5 6import sys 7import itertools 8import re 9 10import pyocr 11import pyocr.builders 12from renamerobot.util import crop 13 14from PIL import ImageOps 15 16 17tools = pyocr.get_available_tools() 18if len(tools) == 0: 19 print("No OCR tool found") 20 sys.exit(1) 21# The tools are returned in the recommended order of usage 22tool = tools[0] 23 24 25ORDERNUM_BOX = ( 26 (0.8, 0.11, 0.95, 0.135), 27 (0.8, 0.08, 0.95, 0.105), 28) 29 30re_ordernum = re.compile(r'(?P<num>\d{4,})$', re.MULTILINE) 31 32REPLACE_PAIR_1 = ( 33 (u']', u'1'), 34 (u'}', u'1'), 35 (u'ー', u'1'), 36 (u'Z', u'2'), 37 (u'O', u'0'), 38 (u'〇', u'0'), 39 (u'I', u'1'), 40 (u'l', u'1'), 41) 42 43def read_ordernum(images): 44 for image, box in itertools.product(images, ORDERNUM_BOX): 45 image = crop(image, box) 46 image = ImageOps.grayscale(image) 47 # image = resize(image, height=80) 48 # image = erode(image) 49 50 txt = tool.image_to_string( 51 image, 52 lang='eng', 53 builder=pyocr.builders.TextBuilder(tesseract_layout=7) 54 ) 55 56 for before, after in REPLACE_PAIR_1: 57 txt = txt.replace(before, after) 58 txt = re.sub(r'\s+', '', txt) 59 60 # try: 61 # print('OCR:') 62 # print(txt) 63 # except Exception as e: 64 # print(e) 65 66 result = re_ordernum.search(txt) 67 68 if result is not None: 69 return { 70 'ordernum': result.group('num'), 71 } 72 73 return None 74 75 76DATE_BOX = ( 77 (0.8, 0.078, 0.95, 0.1), 78 (0.8, 0.06, 0.95, 0.08), 79) 80 81re_date = re.compile(u'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日', re.MULTILINE) 82 83REPLACE_PAIR_2 = ( 84 (u']', u'1'), 85 (u'}', u'1'), 86 (u'ー', u'1'), 87 (u'仔', u'年'), 88 (u'El', u'日'), 89 (u'E|', u'日'), 90 (u'E', u'日'), 91 (u'□', u'日'), 92 (u'口', u'日'), 93 (u'曰', u'日'), 94 (u'Z', u'2'), 95 (u'O', u'0'), 96 (u'〇', u'0'), 97 (u'I', u'1'), 98 (u'l', u'1'), 99) 100 101def read_date(images): 102 for image, box in itertools.product(images, DATE_BOX): 103 image = crop(image, box) 
104 image = ImageOps.grayscale(image) 105 # image = resize(image, height=80) 106 # image = erode(image) 107 108 txt = tool.image_to_string( 109 image, 110 lang='jpn+eng', 111 builder=pyocr.builders.TextBuilder(tesseract_layout=6) 112 ) 113 114 for before, after in REPLACE_PAIR_2: 115 txt = txt.replace(before, after) 116 txt = re.sub(r'\s+', '', txt) 117 118 # try: 119 # print('OCR:') 120 # print(txt) 121 # except Exception as e: 122 # print(e) 123 124 result = re_date.search(txt) 125 126 if result is not None: 127 return { 128 'year': result.group('year'), 129 'month': result.group('month'), 130 'day': result.group('day'), 131 } 132 133 return None 134 135
python
1#pdf.py 2# -*- coding: utf-8 -*- 3 4from __future__ import print_function 5 6import StringIO 7 8from pdfminer.pdfpage import PDFPage 9from pdfminer.pdfinterp import PDFResourceManager 10from pdfminer.pdfinterp import PDFPageInterpreter 11from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTImage, LTFigure 12from pdfminer.converter import PDFPageAggregator 13 14from PIL import Image 15 16 17 18def extract_images(document): 19 # Create a PDF resource manager object that stores shared resources. 20 rsrcmgr = PDFResourceManager() 21 # Create a PDF device object. 22 device = PDFPageAggregator(rsrcmgr, laparams=LAParams()) 23 # Create a PDF interpreter object. 24 interpreter = PDFPageInterpreter(rsrcmgr, device) 25 26 contents = [] 27 28 for page in PDFPage.create_pages(document): 29 interpreter.process_page(page) 30 layout = device.get_result() 31 # print(layout) 32 33 contents.extend(travarse(layout)) 34 35 return [to_pil_image(ltImage) for ltImage in contents] 36 37 38def travarse(layout): 39 images = [] 40 41 for obj in layout: 42 if isinstance(obj, LTTextBox) or isinstance(obj, LTTextLine) or isinstance(obj, LTFigure): 43 images.extend(travarse(obj)) 44 45 elif isinstance(obj, LTImage): 46 images.append(obj) 47 48 return images 49 50def to_pil_image(ltImage): 51 buffer = StringIO.StringIO() 52 buffer.write(ltImage.stream.get_rawdata()) 53 buffer.seek(0) 54 return Image.open(buffer) 55 56
python
# util.py
# -*- coding: utf-8 -*-
"""Helpers for opening PDF files and cropping images by ratio."""

from __future__ import print_function

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfdocument import PDFTextExtractionNotAllowed

from PIL import Image


def load_pdf(filename, password=None):
    """Open a PDF file and return its PDFDocument.

    The file is left open on success because the returned document keeps
    using the parser that reads from it.  On any failure the file is
    closed before the exception propagates.

    Raises PDFTextExtractionNotAllowed when the document does not allow
    text extraction.
    """
    fp = open(filename, 'rb')
    try:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)

        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser, password)

        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
    except Exception:
        # Do not leak the handle when the document cannot be used.
        fp.close()
        raise

    return document


def crop(image, ratio_box):
    """Crop ``image`` to a box given as fractions of its size.

    ``ratio_box`` is ``(left, upper, right, lower)``; the horizontal
    values are multiplied by the image width and the vertical ones by the
    image height before being passed to ``image.crop``.
    """
    width, height = image.size
    left, upper, right, lower = ratio_box

    pixel_box = (
        left * width,
        upper * height,
        right * width,
        lower * height,
    )
    return image.crop(pixel_box)
試したこと
VS Codeを使用して、Python 2.7.15で上記を実行しようとしました
補足情報(FW/ツールのバージョンなど)
WIN 10
ライブラリ:
future==0.16.0
pdfminer==20140328
pefile==2016.3.28
Pillow==3.4.2
PyInstaller==3.2
pyocr==0.4.2
pypiwin32==219
six==1.10.0
OCR:
Tesseract
あなたの回答
tips
プレビュー