編集履歴

質問編集履歴

テンプレートを使用し、必要な情報を追加しました

2019/09/23 12:33

投稿

bing

スコア13

test CHANGED Viewed

File without changes

test CHANGED Viewed

@@ -2,44 +2,576 @@
+### 前提・実現したいこと
-スキャンされた書類をpythonを使用して文字を読み取りたいと考えています
+スキャンされた書類(PDF)から、Pythonを使用して文字を読み取りたいと考えています
-https://blog.mudatobunka.org/entry/2016/11/19/134726
+[参考にしたブログ記事](https://blog.mudatobunka.org/entry/2016/11/19/134726)
+自分のしたいことのほとんどがこちらのコードで実現できそうなので
-を実行させていただきましたが、
+勉強させていただこうと思いましたが、実行すると下記のエラーが出てしまいます
+このエラーの原因と解決方法を教えていただけないでしょうか
+### 発生している問題・エラーメッセージ
+```
+Traceback (most recent call last):
+  File "c:/Users/Master/Desktop/renamerobot-master/main.py", line 38, in <module>
+    images = extract_images(document)
+  File "c:\Users\Master\Desktop\renamerobot-master\renamerobot\pdf.py", line 34, in extract_images
+    return [to_pil_image(ltImage) for ltImage in contents]
+  File "c:\Users\Master\Desktop\renamerobot-master\renamerobot\pdf.py", line 53, in to_pil_image
+    return Image.open(buffer)
+  File "C:\Python27\lib\site-packages\PIL\Image.py", line 2822, in open
+    raise IOError("cannot identify image file %r" % (filename if filename else fp))
+IOError: cannot identify image file <StringIO.StringIO instance at 0x000000000316F708>
+```
+### 該当のソースコード
 ```python
+#main.py
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+from glob import glob
+import re
+import os
+import shutil
+from datetime import datetime
+from renamerobot.util import load_pdf
+from renamerobot.pdf import extract_images
+from renamerobot.ocr import read_ordernum, read_date
+if not os.path.isdir('BEFORE'):
+    os.mkdir('BEFORE')
+if not os.path.isdir('AFTER'):
+    os.mkdir('AFTER')
+if not os.path.isdir(u'読み取り失敗'):
+    os.mkdir(u'読み取り失敗')
+pdfs = glob('BEFORE/*.pdf')
+pdfs_count = len(pdfs)
+# print(pdfs)
-File "C:\Python27\lib\site-packages\PIL\Image.py", line 2822, in open
+re_pdfname = re.compile(r'(\|/)(?P<name>.+)$')
+unrenamed = []
+for i, pdf in enumerate(pdfs):
+    print(u'{0}/{1} 読み取り中 ...'.format(i+1, pdfs_count))
+    document = load_pdf(pdf)
+    images = extract_images(document)
+    ordernum = read_ordernum(images)
+    date = read_date(images)
+    m = re_pdfname.search(pdf)
+    before = m and m.group('name')
+    print(u'リネーム前:', before)
+    if ordernum is None:
+        print(u'  !!受注番号の読み取りに失敗', end='\n\n')
+        unrenamed.append(before)
+        shutil.copy(pdf, u'読み取り失敗/')
+    elif date is None:
+        print(u'  !!日付の読み取りに失敗', end='\n\n')
+        unrenamed.append(before)
+        shutil.copy(pdf, u'読み取り失敗/')
+    else:
+        after = '{0[ordernum]}_{1[year]:0>4}{1[month]:0>2}{1[day]:0>2}.pdf'.format(ordernum, date)
+        print(u'リネーム後:', after, end='\n\n')
+        shutil.copyfile(pdf, 'AFTER/'+after)
+if 0 != len(unrenamed):
+    nowstr = datetime.now().strftime('%Y%m%d_%H%M%S')
-    raise IOError("cannot identify image file %r" % (filename if filename else fp))
+    with open(u'{1}_【{0}件のリネームできなかったファイル】.txt'.format(len(unrenamed), nowstr), 'w') as f:
-IOError: cannot identify image file <StringIO.StringIO instance at 0x000000000308D608>
+        f.write('\n'.join(unrenamed))
 ```
-がでて終了します
-恐らく
 ```python
+#ocr.py
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+import sys
+import itertools
+import re
+import pyocr
+import pyocr.builders
+from renamerobot.util import crop
+from PIL import ImageOps
+tools = pyocr.get_available_tools()
+if len(tools) == 0:
+    print("No OCR tool found")
+    sys.exit(1)
+# The tools are returned in the recommended order of usage
+tool = tools[0]
+ORDERNUM_BOX = (
+    (0.8, 0.11, 0.95, 0.135),
+    (0.8, 0.08, 0.95, 0.105),
+)
+re_ordernum = re.compile(r'(?P<num>\d{4,})$', re.MULTILINE)
+REPLACE_PAIR_1 = (
+    (u']', u'1'),
+    (u'}', u'1'),
+    (u'ー', u'1'),
+    (u'Z', u'2'),
+    (u'O', u'0'),
+    (u'〇', u'0'),
+    (u'I', u'1'),
+    (u'l', u'1'),
+)
+def read_ordernum(images):
+    for image, box in itertools.product(images, ORDERNUM_BOX):
+        image = crop(image, box)
+        image = ImageOps.grayscale(image)
+        # image = resize(image, height=80)
+        # image = erode(image)
+        txt = tool.image_to_string(
+            image,
+            lang='eng',
+            builder=pyocr.builders.TextBuilder(tesseract_layout=7)
+        )
+        for before, after in REPLACE_PAIR_1:
+            txt = txt.replace(before, after)
+        txt = re.sub(r'\s+', '', txt)
+        # try:
+        #     print('OCR:')
+        #     print(txt)
+        # except Exception as e:
+        #     print(e)
+        result = re_ordernum.search(txt)
+        if result is not None:
+            return {
+                'ordernum': result.group('num'),
+            }
+    return None
+DATE_BOX = (
+    (0.8, 0.078, 0.95, 0.1),
+    (0.8, 0.06, 0.95, 0.08),
+)
+re_date = re.compile(u'(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日', re.MULTILINE)
+REPLACE_PAIR_2 = (
+    (u']', u'1'),
+    (u'}', u'1'),
+    (u'ー', u'1'),
+    (u'仔', u'年'),
+    (u'El', u'日'),
+    (u'E|', u'日'),
+    (u'E', u'日'),
+    (u'□', u'日'),
+    (u'口', u'日'),
+    (u'曰', u'日'),
+    (u'Z', u'2'),
+    (u'O', u'0'),
+    (u'〇', u'0'),
+    (u'I', u'1'),
+    (u'l', u'1'),
+)
+def read_date(images):
+    for image, box in itertools.product(images, DATE_BOX):
+        image = crop(image, box)
+        image = ImageOps.grayscale(image)
+        # image = resize(image, height=80)
+        # image = erode(image)
+        txt = tool.image_to_string(
+            image,
+            lang='jpn+eng',
+            builder=pyocr.builders.TextBuilder(tesseract_layout=6)
+        )
+        for before, after in REPLACE_PAIR_2:
+            txt = txt.replace(before, after)
+        txt = re.sub(r'\s+', '', txt)
+        # try:
+        #     print('OCR:')
+        #     print(txt)
+        # except Exception as e:
+        #     print(e)
+        result = re_date.search(txt)
+        if result is not None:
+            return {
+                'year': result.group('year'),
+                'month': result.group('month'),
+                'day': result.group('day'),
+            }
+    return None
+```
+```python
+#pdf.py
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+import StringIO
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfinterp import PDFResourceManager
+from pdfminer.pdfinterp import PDFPageInterpreter
+from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTImage,  LTFigure
+from pdfminer.converter import PDFPageAggregator
+from PIL import Image
+def extract_images(document):
+    # Create a PDF resource manager object that stores shared resources.
+    rsrcmgr = PDFResourceManager()
+    # Create a PDF device object.
+    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
+    # Create a PDF interpreter object.
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+    contents = []
+    for page in PDFPage.create_pages(document):
+        interpreter.process_page(page)
+        layout = device.get_result()
+        # print(layout)
+        contents.extend(travarse(layout))
+    return [to_pil_image(ltImage) for ltImage in contents]
+def travarse(layout):
+    images = []
+    for obj in layout:
+        if isinstance(obj, LTTextBox) or isinstance(obj, LTTextLine) or isinstance(obj, LTFigure):
+            images.extend(travarse(obj))
+        elif isinstance(obj, LTImage):
+            images.append(obj)
+    return images
 def to_pil_image(ltImage):
-    """Raw Binary を Image オブジェクトに変換"""
     buffer = StringIO.StringIO()
     buffer.write(ltImage.stream.get_rawdata())
@@ -48,26 +580,138 @@
     return Image.open(buffer)
 ```
-の部分に問題があると思うのですが、pythonを初めて使用するため改善の方法がわかりません
-アドバイスをお願いします
-環境；
-win10
-python 2.7.15
-vs code
-よろしくお願いします
+```python
+#util.py
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfdocument import  PDFTextExtractionNotAllowed
+# import cv2
+# import numpy as np
+from PIL import Image
+def load_pdf(filename, password=None):
+    """Open a PDF file."""
+    fp = open(filename, 'rb')
+    # Create a PDF parser object associated with the file object.
+    parser = PDFParser(fp)
+    # Create a PDF document object that stores the document structure.
+    # Supply the password for initialization.
+    document = PDFDocument(parser, password)
+    # Check if the document allows text extraction. If not, abort.
+    if not document.is_extractable:
+        raise PDFTextExtractionNotAllowed
+    return document
+def crop(image, ratio_box):
+    width, height = image.size
+    left, upper, right, lower = ratio_box
+    return image.crop((
+        left * width,
+        upper * height,
+        right * width,
+        lower * height,
+    ))
+```
+### 試したこと
+VS CODEを使用してPYTHON 2.7.15で上記を実行しようとしました
+### 補足情報（FW/ツールのバージョンなど）
+WIN 10
+ライブラリ：
+future==0.16.0
+pdfminer==20140328
+pefile==2016.3.28
+Pillow==3.4.2
+PyInstaller==3.2
+pyocr==0.4.2
+pypiwin32==219
+six==1.10.0
+OCR:
+Tesseract

誤字がありましたので訂正しました

2019/09/23 12:33

投稿

bing

スコア13

test CHANGED Viewed

File without changes

test CHANGED Viewed

@@ -52,7 +52,7 @@
-の部分が問題がると思うのですが、pythonを初めて使用するため改善の方法がわかりません
+の部分に問題があると思うのですが、pythonを初めて使用するため改善の方法がわかりません
@@ -62,6 +62,8 @@
 環境；
+win10
 python 2.7.15
 vs code