Question edit history
2
Added the code
https://cloud.google.com/vision/docs/pdf#vision_text_detection_pdf_gcs-python

```python
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS"""
    import json
    import re
    from google.cloud import vision
    from google.cloud import storage

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)  # <- The error occurs here.
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json.loads(json_string)

    # The actual response for the first page of the input file.
    first_page_response = response['responses'][0]
    annotation = first_page_response['fullTextAnnotation']

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print('Full text:\n')
    print(annotation['text'])
```

(Error screen)
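
A likely cause of the error marked at `bucket_name = match.group(1)` is that `re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)` returned `None`, which happens when `gcs_destination_uri` is not of the form `gs://bucket/prefix`; calling `.group(1)` on `None` then raises `AttributeError: 'NoneType' object has no attribute 'group'`. This is an assumption, since only an error screen is referenced above. A minimal sketch of a defensive check (the helper `parse_gcs_uri` is hypothetical, not part of the original code):

```python
import re


def parse_gcs_uri(gcs_uri):
    """Split a gs://bucket/prefix URI into (bucket_name, prefix).

    Hypothetical helper for illustration only: re.match() returns None
    when the URI does not look like gs://bucket/prefix, and .group(1)
    on None raises AttributeError.
    """
    match = re.match(r'gs://([^/]+)/(.+)', gcs_uri)
    if match is None:
        raise ValueError(
            f'Expected a URI like gs://bucket/prefix, got: {gcs_uri!r}')
    return match.group(1), match.group(2)


# Example: a destination URI without an object prefix does not match.
# parse_gcs_uri('gs://my-bucket')          -> ValueError
# parse_gcs_uri('gs://my-bucket/output/')  -> ('my-bucket', 'output/')
```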
1
Added tags