Pythonで画像ファイルからテキストを抽出したい

画像PDFファイルをPythonでJPEGに変換し、OCRを使ってそこからテキストを抽出しました。
しかし文字化けが多いため、「いきなりPDF」というソフトでテキスト抽出しましたが、こちらも文字化けするので、「いきなりPDF」でPDF⇒JPEG変換したものをPythonでテキスト抽出しようとしました。
「いきなりPDF」でJPEG変換すると、JPEGファイルが1枚ずつになっていて、横向きに画像保存されます。
それを1枚1枚縦向きにして読み込み、テキスト抽出してテキストを追加しようと思っているのですが、
元のコードから修正しましたが、うまくいきません。

元ソース(pdf⇒jpeg⇒テキスト)

python
1import os
2from PIL import Image
3from pathlib import Path
4from pdf2image import convert_from_path
5
6import glob
7import pyocr
8import pyocr.builders
9import cv2
10from PIL import Image
11import sys
12 
13# インストール済みのTesseractのパスを通す
14path_tesseract =  r";C:\Program Files\Tesseract-OCR"
15if path_tesseract not in os.environ["PATH"].split(os.pathsep):
16    os.environ["PATH"] += path_tesseract
17    tools = pyocr.get_available_tools()
18    tool = tools[0]
19
20# PDFファイルのパス
21    pdf_path = glob.glob(r"C:\temp\PdfToImage\pdf_file\*.pdf")
22
23# PDFファイルのパスを取得し順番に捌いていく
24    for x in pdf_path:
25        pdf_path = Path(x)
26    print("PDFを画像ファイルに変換処理中です")    
27# PDF -> Image に変換（150dpi）
28    pages = convert_from_path(str(pdf_path), dpi=150)
29    
30# 画像ファイルを１ページずつ保存
31image_dir = r"C:\temp\PdfToImage\image_file"
32
33files = []
34for i, page in enumerate(pages):
35        img_rgb = page.convert("RGB")
36        pixels = img_rgb.load()
37
38  # 原稿画像加工（黒っぽい色以外は白=255,255,255にする）
39        c_max = 168
40        for j in range(img_rgb.size[1]):
41            for k in range(img_rgb.size[0]):
42                if (pixels[k, j][0] > c_max and pixels[k, j][1] > c_max and pixels[k, j][2] > c_max):
43                    img_rgb.putpixel((k,j), (255, 255, 255))
44                file_name = f'{pdf_path.stem}_{i + 1}.jpeg'
45
46        files.append(fr'{image_dir}/{file_name}')
47  # jpegで保存
48        img_rgb.save(str(files[i]), "jpeg")
49
50path = r"C:\temp\PdfToImage\image_file"
51files= glob.glob(fr'{path}/*jpeg')
52builder = pyocr.builders.TextBuilder(tesseract_layout=6)
53
54text_path = r"C:\temp\PdfToImage\text_file"
55
56for f in files:
57 
58    img = Image.open(f)
59
60    fp = open(text_path+r'\TextOut.txt', 'a', encoding='utf-8')
61    text_path = r"C:\temp\PdfToImage\text_file"
62    builder = pyocr.builders.TextBuilder(tesseract_layout=6)   
63    
64    #text = tool.image_to_string(img, lang="jpn", builder=builder)
65    if f.endswith('_1.jpeg'):
66        text1 = tool.image_to_string(img.crop((121,368,411,407)), lang="jpn+eng", builder=builder)
67        text2 = tool.image_to_string(img.crop((132,544,386,656)), lang="jpn+eng", builder=builder)
68        text3 = tool.image_to_string(img.crop((138,863,389,893)), lang="jpn+eng", builder=builder)
69        text4 = tool.image_to_string(img.crop((139,970,396,1006)), lang="jpn+eng", builder=builder)
70        text5 = tool.image_to_string(img.crop((138,1007,402,1107)), lang="jpn", builder=builder)
71        text6 = tool.image_to_string(img.crop((133,1175,400,1237)), lang="jpn+eng", builder=builder)
72    
73        text7 = tool.image_to_string(img.crop((440,373, 1150,408)), lang="jpn+eng", builder=builder)
74        text8 = tool.image_to_string(img.crop((440,412, 1150,438)), lang="jpn+eng", builder=builder)
75        text9 = tool.image_to_string(img.crop((440,515, 1150,806)), lang="jpn+eng", builder=builder)
76        text10 = tool.image_to_string(img.crop((440,849, 1150,933)), lang="jpn+eng", builder=builder)
77        text11 = tool.image_to_string(img.crop((440,977, 1150,1038)), lang="jpn+eng", builder=builder)
78        text12 = tool.image_to_string(img.crop((440,1082, 1150,1148)), lang="jpn+eng", builder=builder)
79        text13 = tool.image_to_string(img.crop((440,1193, 1150,1233)), lang="jpn+eng", builder=builder)      
80 
81    if f.endswith('_1.jpeg'):
82        text1 = tool.image_to_string(img.crop((128,377,416,410)), lang="jpn", builder=builder)
83        text2 = tool.image_to_string(img.crop((126,544,391,656)), lang="jpn", builder=builder)
84        text3 = tool.image_to_string(img.crop((133,861,389,893)), lang="jpn", builder=builder)
85        text4 = tool.image_to_string(img.crop((139,970,396,1006)), lang="jpn", builder=builder)
86        text5 = tool.image_to_string(img.crop((138,1078,402,1107)), lang="jpn", builder=builder)
87        text6 = tool.image_to_string(img.crop((138,1177,400,1235)), lang="jpn", builder=builder)
88    
89        text7 = tool.image_to_string(img.crop((440,412, 1150,438)), lang="jpn", builder=builder)
90        text8 = tool.image_to_string(img.crop((440,515, 1150,806)), lang="jpn", builder=builder)
91        text9 = tool.image_to_string(img.crop((440,849, 1150,933)), lang="jpn", builder=builder)
92        text10 = tool.image_to_string(img.crop((440,977, 1150,1038)), lang="jpn", builder=builder)
93        text11 = tool.image_to_string(img.crop((440,1082, 1150,1148)), lang="jpn", builder=builder)
94        text12 = tool.image_to_string(img.crop((440,1193, 635,1229)), lang="jpn", builder=builder)      
95 
96        text1 = text1.replace('　','')
97        text1 = text1.replace(' ','')
98        text1 = text1.replace('\n','')
99　　　　　　：
100 
101        text12 = text12.replace('　','')
102        text12 = text12.replace(' ','')
103        text12 = text12.replace('\n','')
104        
105        text1 += ';'
106        fp.write(text1)
107        text2 += ';'
108        fp.write(text2)
109        text3 += ';'
110        fp.write(text3)
111        text4 += ';'
112        fp.write(text4)
113        text5 += ';'
114        fp.write(text5)
115        text6 += '\n'
116        fp.write(text6)
117        text7 += ';'
118        fp.write(text7)
119        text8 += ';'
120        fp.write(text8)
121        text9+= ';'
122        fp.write(text9)
123        text10 += ';'
124        fp.write(text10)
125        text11 += ';'
126        fp.write(text11)
127        text12 += '\n'
128        fp.write(text12)
129
130        print(text1,text2,text3,text4,text5,text6)
131        print(text7,text8,text9,text10,text11,text12)
132    else:
133        text7 = tool.image_to_string(img.crop((440,412, 1150,438)), lang="jpn", builder=builder)
134        text8 = tool.image_to_string(img.crop((440,515, 1150,806)), lang="jpn", builder=builder)
135        text9 = tool.image_to_string(img.crop((440,849, 1150,933)), lang="jpn", builder=builder)
136        text10 = tool.image_to_string(img.crop((440,977, 1150,1038)), lang="jpn", builder=builder)
137        text11 = tool.image_to_string(img.crop((440,1082, 1150,1148)), lang="jpn", builder=builder)
138        text12 = tool.image_to_string(img.crop((440,1193, 635,1229)), lang="jpn", builder=builder)      
139      
140        text7 = text7.replace('　','')        
141        text7 = text7.replace(' ','')
142        text7 = text7.replace('\n','')
143       
144　　　　　　　：
145 
146        text12 = text12.replace('　','')
147        text12 = text12.replace(' ','')
148        text12 = text12.replace('\n','')
149       
150        text7 += ';'
151        fp.write(text7)
152        text8 += ';'
153        fp.write(text8)
154        text9+= ';'
155        fp.write(text9)
156        text10 += ';'
157        fp.write(text10)
158        text11 += ';'
159        fp.write(text11)
160        text12 += '\n'
161        fp.write(text12)
162
163        print(text7,text8,text9,text10,text11,text12)  
164                 
165    fp.close()
166
167print("処理を終了")

=========修正したコード(jpeg⇒テキスト)========

python
1import os
2from PIL import Image
3from pathlib import Path
4from pdf2image import convert_from_path
5
6import glob
7import pyocr
8import pyocr.builders
9import cv2
10from PIL import Image
11import sys
12 
13# インストール済みのTesseractのパスを通す
14path_tesseract =  r";C:\Program Files\Tesseract-OCR"
15if path_tesseract not in os.environ["PATH"].split(os.pathsep):
16    os.environ["PATH"] += path_tesseract
17    tools = pyocr.get_available_tools()
18    tool = tools[0]
19
20
21    print("画像ファイルを処理中です")
22    
23
24# 画像ファイルのパス
25    image_path = glob.glob(r"C:\temp\PdfToImage\image_file\*.jpg")
26    gazo = glob.glob(r"C:\temp\PdfToImage\gazo_file\*.jpg")
27
28files = []
29for i, page in enumerate(pages):
30        img_rgb = page.convert("RGB")
31        pixels = img_rgb.load()
32   
33    for i, file in enumerate(files):
34        img_rgb = files.convert("RGB")
35  　　pixels = img_rgb.load()
36
37  # 原稿画像加工（黒っぽい色以外は白=255,255,255にする）
38　　　c_max = 168
39　　　for j in range(image_path.size[1]):
40        　　　for k in range(image_path.size[0]):
41           　　　 if (pixels[k, j][0] > c_max and pixels[k, j][1] > c_max and pixels[k, j][2] > c_max):
42               　　　 image_pathimage_path.putpixel((k,j), (255, 255, 255))
43           　　　 file_name = f'{image_path.stem}_{i + 1}.jpg'
44
45        　　　files.append(fr'{image_dir}/{file_name}')
46  # jpegで保存
47        image_path.save(str(files[i]), "jpg")
48path = r"C:\temp\PdfToImage\image_file"
49files= glob.glob(r'{path}/*jpeg')
50builder = pyocr.builders.TextBuilder(tesseract_layout=6)
51
52text_path = r"C:\temp\PdfToImage\text_file"
53
54for f in files:
55 
56    img = (Image.open(f), cv2.ROTATE_90_COUNTERCLOCKWISE)
57
58    fp = open(text_path+r'\TextOut.txt', 'a', encoding='utf-8')
59    text_path = r"C:\temp\PdfToImage\text_file"
60    builder = pyocr.builders.TextBuilder(tesseract_layout=6)   
61    　　：
62
63以下元コードと同じ

エラーは以下のものです

AttributeError                            Traceback (most recent call last)
Cell In [1], line 40
     32     #for i, file in enumerate(files):
     33     #    img_rgb = files.convert("RGB")
     34     #   print("ここまでは動いている",files)
   (...)
     37     #   print(img_rgb)
     38   # 原稿画像加工（黒っぽい色以外は白=255,255,255にする）
     39 c_max = 168
---> 40 for j in range(image_path.size[1]):
     41         for k in range(image_path.size[0]):
     42             if (pixels[k, j][0] > c_max and pixels[k, j][1] > c_max and pixels[k, j][2] > c_max):

AttributeError: 'list' object has no attribute 'size'

image_file フォルダにある jpg ファイルをすべて RGB にコンバートしたい場合、
for 文でどのようにコードを書けばよいでしょうか。
よろしくお願いします。

jbpb0

2022/12/15 00:05

pythonのコードの一番最初の行のすぐ上に ```python だけの行を追加してください。また、pythonのコードの一番最後の行のすぐ下に ``` だけの行を追加してください。または、 https://teratail.storage.googleapis.com/uploads/contributed_images/56957fe805d9d7befa7dba6a98676d2b.gif を見て、そのようにしてみてください。現状、コードがとても読み辛いです。質問にコードを載せる際に上記をやってくれたら、他人がコードを読みやすくなり、コードの実行による現象確認もやりやすくなるので、回答されやすくなります。