Skip to content
This repository was archived by the owner on Sep 20, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions samples/snippets/process_document_form_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# [START documentai_process_form_document]

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
# file_path = '/path/to/local/pdf'

def process_document_form_sample(
    project_id: str, location: str, processor_id: str, file_path: str
) -> None:
    """Process a local PDF with a Document AI form processor and print results.

    Prints the full document text and the page count, then for every page the
    detected tables (column and row counts plus a short preview) and the
    detected form fields as name/value pairs.

    Args:
        project_id: Google Cloud project ID used in the processor name.
        location: Processor location, 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        file_path: Path to a local PDF file.
    """
    from google.cloud import documentai_v1beta3 as documentai

    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)

    print("Document processing complete.")

    # Read the table and form fields output from the processor
    # The form processor also contains OCR data. For more information
    # on how to parse OCR data please see the OCR sample.
    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
    document = result.document
    text = document.text
    print(f"Full document text: {repr(text)}\n")
    print(f"There are {len(document.pages)} page(s) in this document.")

    # Read the text recognition output from the processor
    for page in document.pages:
        print(f"\n\n**** Page {page.page_number} ****")

        print(f"Found {len(page.tables)} table(s):")
        for table in page.tables:
            # A table without a header row is reported as having zero
            # columns instead of raising IndexError on header_rows[0].
            num_columns = (
                len(table.header_rows[0].cells) if table.header_rows else 0
            )
            num_rows = len(table.body_rows)
            print(f'Table with {num_columns} columns and {num_rows} rows:')
            print_table_info(table, text)
        print(f'Found {len(page.form_fields)} form fields:')
        for field in page.form_fields:
            # Distinct local names so the processor resource name held in
            # `name` is not overwritten by the loop.
            field_name = layout_to_text(field.field_name, text)
            field_value = layout_to_text(field.field_value, text)
            print(f" * {repr(field_name.strip())}: {repr(field_value.strip())}")


def print_table_info(table: dict, text: str) -> None:
    """Print the header row and the first body row of a detected table.

    Cell texts are stripped, shown with ``repr`` and separated by ``" | "``.

    Args:
        table: Table object exposing ``header_rows`` and ``body_rows``; each
            row has ``cells`` and each cell has a ``layout``. It is annotated
            as ``dict`` but is read through attributes.
        text: Full document text that the cell layouts index into.
    """
    # A table with no header row or no body rows is printed as an empty row
    # instead of raising IndexError on the [0] lookup.
    header_cells = table.header_rows[0].cells if table.header_rows else []
    body_cells = table.body_rows[0].cells if table.body_rows else []

    # Print header row
    header_row_text = " | ".join(
        repr(layout_to_text(header_cell.layout, text).strip())
        for header_cell in header_cells
    )
    print(f'Columns: {header_row_text}')

    # Print first body row
    body_row_text = " | ".join(
        repr(layout_to_text(body_cell.layout, text).strip())
        for body_cell in body_cells
    )
    print(f'First row data: {body_row_text}\n')


def layout_to_text(layout: dict, text: str) -> str:
    """
    Document AI identifies form fields by their offsets in the entirety of the
    document's text. This function converts offsets to a string.

    Args:
        layout: Object whose ``text_anchor.text_segments`` entries carry
            ``start_index`` and ``end_index``. It is annotated as ``dict``
            but is read through attributes.
        text: Full document text that the offsets refer to.

    Returns:
        The concatenation of ``text[start_index:end_index]`` for every
        segment, in order; an empty string when there are no segments.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    # Each segment is sliced directly: a membership test of the segment
    # against the list being iterated is always true and adds nothing.
    return "".join(
        text[int(segment.start_index):int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


# [END documentai_process_form_document]
43 changes: 43 additions & 0 deletions samples/snippets/process_document_form_sample_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

from samples.snippets import process_document_form_sample


location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
file_path = "resources/invoice.pdf"


def test_process_documents(capsys):
    """Run the form sample with the module-level settings and check stdout."""
    process_document_form_sample.process_document_form_sample(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
    )
    captured = capsys.readouterr()
    stdout = captured.out

    # Each snippet must appear somewhere in what the sample printed.
    for snippet in (
        "There are 1 page(s) in this document.",
        "Table with 4 columns and 6 rows",
        "Found 13 form fields",
        "'BALANCE DUE': '$2140.00'",
    ):
        assert snippet in stdout
141 changes: 141 additions & 0 deletions samples/snippets/process_document_ocr_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# [START documentai_process_ocr_document]

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
# file_path = '/path/to/local/pdf'

def process_document_ocr_sample(
    project_id: str, location: str, processor_id: str, file_path: str
) -> None:
    """Process a local PDF with a Document AI processor and print OCR output.

    Prints the full document text and the page count, then for every page its
    dimensions, detected languages, paragraphs, blocks, lines and tokens.

    Args:
        project_id: Google Cloud project ID used in the processor name.
        location: Processor location, 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        file_path: Path to a local PDF file.
    """
    from google.cloud import documentai_v1beta3 as documentai

    # Only 'eu' gets an explicit api_endpoint; anything else uses the default.
    client_options = (
        {"api_endpoint": "eu-documentai.googleapis.com"} if location == "eu" else {}
    )
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # Full processor resource name, in the form
    # projects/project-id/locations/location/processors/processor-id
    # (the processor itself must already exist in the Cloud Console).
    processor_name = (
        f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    )

    # Load the whole PDF into memory.
    with open(file_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # Build the request around the raw document and send it.
    raw_document = {"content": pdf_bytes, "mime_type": "application/pdf"}
    result = client.process_document(
        request={"name": processor_name, "raw_document": raw_document}
    )

    print("Document processing complete.")

    # Walk the text recognition output returned by the processor.
    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
    processed = result.document
    full_text = processed.text
    print(f"Full document text: {repr(full_text)}\n")
    print(f"There are {len(processed.pages)} page(s) in this document.\n")

    for page in processed.pages:
        print(f"Page {page.page_number}:")
        print_page_dimensions(page.dimension)
        print_detected_langauges(page.detected_languages)
        print_paragraphs(page.paragraphs, full_text)
        print_blocks(page.blocks, full_text)
        print_lines(page.lines, full_text)
        print_tokens(page.tokens, full_text)


def print_page_dimensions(dimension: dict) -> None:
    """Print the ``width`` and ``height`` attributes of a page dimension."""
    width_text = str(dimension.width)
    print(" Width: " + width_text)
    height_text = str(dimension.height)
    print(" Height: " + height_text)


def print_detected_langauges(detected_languages: dict) -> None:
    """Print each detected language code with its confidence as a percentage.

    Args:
        detected_languages: Iterable of objects exposing ``language_code``
            and ``confidence``.
    """
    print(" Detected languages:")
    for language in detected_languages:
        # Confidence is rendered as a percentage with one decimal place.
        print(f" {language.language_code} ({language.confidence:.1%} confidence)")


def print_paragraphs(paragraphs: dict, text: str) -> None:
    """Print the paragraph count and the text of the first and last paragraph.

    Args:
        paragraphs: Sequence of objects exposing a ``layout``.
        text: Full document text that the layouts index into.
    """
    print(f" {len(paragraphs)} paragraphs detected:")
    # With no paragraphs there is no first or last entry to index,
    # so only the count is printed.
    if not paragraphs:
        return
    first_paragraph_text = layout_to_text(paragraphs[0].layout, text)
    print(f" First paragraph text: {repr(first_paragraph_text)}")
    last_paragraph_text = layout_to_text(paragraphs[-1].layout, text)
    print(f" Last paragraph text: {repr(last_paragraph_text)}")


def print_blocks(blocks: dict, text: str) -> None:
    """Print the block count and the text of the first and last block.

    Args:
        blocks: Sequence of objects exposing a ``layout``.
        text: Full document text that the layouts index into.
    """
    print(f" {len(blocks)} blocks detected:")
    # With no blocks there is no first or last entry to index,
    # so only the count is printed.
    if not blocks:
        return
    first_block_text = layout_to_text(blocks[0].layout, text)
    print(f" First text block: {repr(first_block_text)}")
    last_block_text = layout_to_text(blocks[-1].layout, text)
    print(f" Last text block: {repr(last_block_text)}")


def print_lines(lines: dict, text: str) -> None:
    """Print the line count and the text of the first and last line.

    Args:
        lines: Sequence of objects exposing a ``layout``.
        text: Full document text that the layouts index into.
    """
    print(f" {len(lines)} lines detected:")
    # With no lines there is no first or last entry to index,
    # so only the count is printed.
    if not lines:
        return
    first_line_text = layout_to_text(lines[0].layout, text)
    print(f" First line text: {repr(first_line_text)}")
    last_line_text = layout_to_text(lines[-1].layout, text)
    print(f" Last line text: {repr(last_line_text)}")


def print_tokens(tokens: dict, text: str) -> None:
    """Print the token count plus text and break type of the first and last token.

    Args:
        tokens: Sequence of objects exposing a ``layout`` and a
            ``detected_break`` whose ``type_`` has a ``name``.
        text: Full document text that the layouts index into.
    """
    print(f" {len(tokens)} tokens detected:")
    # With no tokens there is no first or last entry to index,
    # so only the count is printed.
    if not tokens:
        return
    first_token_text = layout_to_text(tokens[0].layout, text)
    first_token_break_type = tokens[0].detected_break.type_.name
    print(f" First token text: {repr(first_token_text)}")
    print(f" First token break type: {repr(first_token_break_type)}")
    last_token_text = layout_to_text(tokens[-1].layout, text)
    last_token_break_type = tokens[-1].detected_break.type_.name
    print(f" Last token text: {repr(last_token_text)}")
    print(f" Last token break type: {repr(last_token_break_type)}")


def layout_to_text(layout: dict, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document's text. This function converts
    offsets to a string.

    Args:
        layout: Object whose ``text_anchor.text_segments`` entries carry
            ``start_index`` and ``end_index``. It is annotated as ``dict``
            but is read through attributes.
        text: Full document text that the offsets refer to.

    Returns:
        The concatenation of ``text[start_index:end_index]`` for every
        segment, in order; an empty string when there are no segments.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    # Each segment is sliced directly: a membership test of the segment
    # against the list being iterated is always true and adds nothing.
    return "".join(
        text[int(segment.start_index):int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


# [END documentai_process_ocr_document]
37 changes: 37 additions & 0 deletions samples/snippets/process_document_ocr_sample_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

from samples.snippets import process_document_ocr_sample

location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "91e072f8626a76b7"
file_path = "resources/handwritten_form.pdf"


def test_process_documents(capsys):
    """Run the OCR sample with the module-level settings and check stdout."""
    process_document_ocr_sample.process_document_ocr_sample(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
    )
    out, _ = capsys.readouterr()

    assert "Page 1" in out
    # A bare "en" always matches, because the sample prints
    # "Document processing complete." which contains it. Match the start of
    # the detected-language line ("<code> (<pct> confidence)") instead.
    # NOTE(review): assumes the detected code is exactly "en"; confirm
    # against a live run.
    assert "en (" in out
    assert "FakeDoc" in out
Loading