2323python beta_snippets.py object-localization-uri gs://...
2424python beta_snippets.py handwritten-ocr INPUT_IMAGE
2525python beta_snippets.py handwritten-ocr-uri gs://...
26+ python beta_snippets.py doc-features INPUT_PDF
27+ python beta_snippets.py doc-features-uri gs://...
2628
2729
2830For more information, the documentation at
@@ -174,6 +176,105 @@ def detect_handwritten_ocr_uri(uri):
174176# [END vision_handwritten_ocr_gcs_beta]
175177
176178
# [START vision_fulltext_detection_pdf_beta]
def detect_document_features(path):
    """Detects document features (full text) in a PDF/TIFF/GIF file.

    While your PDF file may have several pages,
    this API can process up to 5 pages only.

    Args:
        path: The path to the local file.
    """
    from google.cloud import vision_v1p4beta1 as vision
    client = vision.ImageAnnotatorClient()

    with open(path, 'rb') as pdf_file:
        content = pdf_file.read()

    # Other supported mime_types: 'image/tiff' or 'image/gif'
    mime_type = 'application/pdf'
    input_config = vision.types.InputConfig(
        content=content, mime_type=mime_type)

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)
    # Annotate the first two pages and the last one (max 5 pages)
    # First page starts at 1, and not 0. Last page is -1.
    pages = [1, 2, -1]

    request = vision.types.AnnotateFileRequest(
        input_config=input_config,
        features=[feature],
        pages=pages)

    response = client.batch_annotate_files(requests=[request])

    # One AnnotateImageResponse per requested page; walk the text
    # hierarchy: page -> block -> paragraph -> word -> symbol.
    for image_response in response.responses[0].responses:
        for page in image_response.full_text_annotation.pages:
            for block in page.blocks:
                print('\nBlock confidence: {}\n'.format(block.confidence))
                for par in block.paragraphs:
                    print('\tParagraph confidence: {}'.format(par.confidence))
                    for word in par.words:
                        symbol_texts = [symbol.text for symbol in word.symbols]
                        word_text = ''.join(symbol_texts)
                        print('\t\tWord text: {} (confidence: {})'.format(
                            word_text, word.confidence))
                        for symbol in word.symbols:
                            print('\t\t\tSymbol: {} (confidence: {})'.format(
                                symbol.text, symbol.confidence))
# [END vision_fulltext_detection_pdf_beta]
228+
229+
# [START vision_fulltext_detection_pdf_gcs_beta]
def detect_document_features_uri(gcs_uri):
    """Detects document features (full text) in a PDF/TIFF/GIF file.

    While your PDF file may have several pages,
    this API can process up to 5 pages only.

    Args:
        gcs_uri: The path to the file in Google Cloud Storage (gs://...)
    """
    from google.cloud import vision_v1p4beta1 as vision
    client = vision.ImageAnnotatorClient()

    # Other supported mime_types: 'image/tiff' or 'image/gif'
    mime_type = 'application/pdf'
    input_config = vision.types.InputConfig(
        gcs_source=vision.types.GcsSource(uri=gcs_uri), mime_type=mime_type)

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)
    # Annotate the first two pages and the last one (max 5 pages)
    # First page starts at 1, and not 0. Last page is -1.
    pages = [1, 2, -1]

    request = vision.types.AnnotateFileRequest(
        input_config=input_config,
        features=[feature],
        pages=pages)

    response = client.batch_annotate_files(requests=[request])

    # One AnnotateImageResponse per requested page; walk the text
    # hierarchy: page -> block -> paragraph -> word -> symbol.
    for image_response in response.responses[0].responses:
        for page in image_response.full_text_annotation.pages:
            for block in page.blocks:
                print('\nBlock confidence: {}\n'.format(block.confidence))
                for par in block.paragraphs:
                    print('\tParagraph confidence: {}'.format(par.confidence))
                    for word in par.words:
                        symbol_texts = [symbol.text for symbol in word.symbols]
                        word_text = ''.join(symbol_texts)
                        print('\t\tWord text: {} (confidence: {})'.format(
                            word_text, word.confidence))
                        for symbol in word.symbols:
                            print('\t\t\tSymbol: {} (confidence: {})'.format(
                                symbol.text, symbol.confidence))
# [END vision_fulltext_detection_pdf_gcs_beta]
276+
277+
177278if __name__ == '__main__' :
178279 parser = argparse .ArgumentParser (
179280 description = __doc__ ,
@@ -196,15 +297,27 @@ def detect_handwritten_ocr_uri(uri):
196297 'handwritten-ocr-uri' , help = detect_handwritten_ocr_uri .__doc__ )
197298 handwritten_uri_parser .add_argument ('uri' )
198299
300+ doc_features_parser = subparsers .add_parser (
301+ 'doc-features' , help = detect_document_features .__doc__ )
302+ doc_features_parser .add_argument ('path' )
303+
304+ doc_features_uri_parser = subparsers .add_parser (
305+ 'doc-features-uri' , help = detect_document_features_uri .__doc__ )
306+ doc_features_uri_parser .add_argument ('uri' )
307+
199308 args = parser .parse_args ()
200309
201310 if 'uri' in args .command :
202311 if 'object-localization-uri' in args .command :
203312 localize_objects_uri (args .uri )
204313 elif 'handwritten-ocr-uri' in args .command :
205314 detect_handwritten_ocr_uri (args .uri )
315+ elif 'doc-features' in args .command :
316+ detect_handwritten_ocr_uri (args .uri )
206317 else :
207318 if 'object-localization' in args .command :
208319 localize_objects (args .path )
209320 elif 'handwritten-ocr' in args .command :
210321 detect_handwritten_ocr (args .path )
322+ elif 'doc-features' in args .command :
323+ detect_handwritten_ocr (args .path )
0 commit comments