docs(samples): Updated Doc AI Quickstart sample based on user feedback (GoogleCloudPlatform#10421)

holtskinner · web-flow · commit e2d7865695dd · 2023-07-17T15:13:25.000+02:00
* docs(samples): Updated Doc AI Quickstart sample based on user feedback

* fix: Add a specific processor version to the Form Parser test and remove the check on the number of form fields extracted

* fix: Update Form Parser response test to check the returned document instead of printed output
diff --git a/documentai/snippets/handle_response_sample.py b/documentai/snippets/handle_response_sample.py
@@ -142,7 +142,7 @@ def process_document_form_sample(
     processor_version: str,
     file_path: str,
     mime_type: str,
-) -> None:
+) -> documentai.Document:
     # Online processing request to Document AI
     document = process_document(
         project_id, location, processor_id, processor_version, file_path, mime_type
@@ -162,9 +162,9 @@ def process_document_form_sample(
 
         print(f"\nFound {len(page.tables)} table(s):")
         for table in page.tables:
-            num_collumns = len(table.header_rows[0].cells)
+            num_columns = len(table.header_rows[0].cells)
             num_rows = len(table.body_rows)
-            print(f"Table with {num_collumns} columns and {num_rows} rows:")
+            print(f"Table with {num_columns} columns and {num_rows} rows:")
 
             # Print header rows
             print("Columns:")
@@ -179,6 +179,8 @@ def process_document_form_sample(
             value = layout_to_text(field.field_value, text)
             print(f"    * {repr(name.strip())}: {repr(value.strip())}")
 
+    return document
+
 
 def print_table_rows(
     table_rows: Sequence[documentai.Document.Page.Table.TableRow], text: str
diff --git a/documentai/snippets/handle_response_sample_test.py b/documentai/snippets/handle_response_sample_test.py
@@ -43,32 +43,26 @@ def test_process_document_ocr(capsys):
     assert "FakeDoc" in out
 
 
-def test_process_document_form(capsys):
+def test_process_document_form():
     location = "us"
     project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
     processor_id = "90484cfdedb024f6"
-    processor_version = "stable"
+    processor_version = "pretrained-form-parser-v2.0-2022-11-10"
     file_path = "resources/invoice.pdf"
     mime_type = "application/pdf"
 
-    handle_response_sample.process_document_form_sample(
+    document = handle_response_sample.process_document_form_sample(
         project_id=project_id,
         location=location,
         processor_id=processor_id,
         processor_version=processor_version,
         file_path=file_path,
         mime_type=mime_type,
     )
-    out, _ = capsys.readouterr()
 
-    expected_strings = [
-        "There are 1 page(s) in this document.",
-        "Table with 4 columns and 6 rows",
-        "Found 13 form field(s)",
-        "'BALANCE DUE': '$2140.00'",
-    ]
-    for expected_string in expected_strings:
-        assert expected_string in out
+    assert len(document.pages) == 1
+    assert len(document.pages[0].tables[0].header_rows[0].cells) == 4
+    assert len(document.pages[0].tables[0].body_rows) == 6
 
 
 def test_process_document_quality(capsys):
diff --git a/documentai/snippets/quickstart_sample.py b/documentai/snippets/quickstart_sample.py
@@ -23,19 +23,15 @@
 # TODO(developer): Uncomment these variables before running the sample.
 # project_id = "YOUR_PROJECT_ID"
 # location = "YOUR_PROCESSOR_LOCATION"  # Format is "us" or "eu"
-# processor_display_name = "YOUR_PROCESSOR_DISPLAY_NAME" # Must be unique per project, e.g.: "My Processor"
-# processor_type = "YOUR_PROCESSOR_TYPE" # Use `fetch_processor_types()` to get available processor types
 # file_path = "/path/to/local/pdf"
-# mime_type = "application/pdf"  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
+# processor_display_name = "YOUR_PROCESSOR_DISPLAY_NAME" # Must be unique per project, e.g.: "My Processor"
 
 
 def quickstart(
     project_id: str,
     location: str,
-    processor_display_name: str,
-    processor_type: str,
     file_path: str,
-    mime_type: str,
+    processor_display_name: str = "My Processor",
 ):
     # You must set the `api_endpoint`if you use a location other than "us".
     opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
@@ -50,7 +46,8 @@ def quickstart(
     processor = client.create_processor(
         parent=parent,
         processor=documentai.Processor(
-            display_name=processor_display_name, type_=processor_type
+            type_="OCR_PROCESSOR",  # Refer to https://cloud.google.com/document-ai/docs/create-processor for how to get available processor types
+            display_name=processor_display_name,
         ),
     )
 
@@ -62,7 +59,10 @@ def quickstart(
         image_content = image.read()
 
     # Load binary data
-    raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)
+    raw_document = documentai.RawDocument(
+        content=image_content,
+        mime_type="application/pdf",  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
+    )
 
     # Configure the process request
     # `processor.name` is the full resource name of the processor, e.g.:
diff --git a/documentai/snippets/quickstart_sample_test.py b/documentai/snippets/quickstart_sample_test.py
@@ -26,19 +26,15 @@
 location = "us"
 project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
 processor_display_name = f"test-processor-{uuid4()}"
-processor_type = "OCR_PROCESSOR"
 file_path = "resources/invoice.pdf"
-mime_type = "application/pdf"
 
 
 def test_quickstart(capsys):
     processor = quickstart_sample.quickstart(
         project_id=project_id,
         location=location,
         processor_display_name=processor_display_name,
-        processor_type=processor_type,
         file_path=file_path,
-        mime_type=mime_type,
     )
     out, _ = capsys.readouterr()