Skip to content

Commit 81582c4

Browse files
autoerr, holtskinner, dandhlee, kweinmeister
authored
Add Document AI as another BigQuery remote function example (GoogleCloudPlatform#9094)
* Add Document AI as another BigQuery remote function example * Add Document AI as another BigQuery remote function example * Apply suggestions from code review Co-authored-by: Dan Lee <71398022+dandhlee@users.noreply.github.com> * Review updates * Apply suggestions from code review --------- Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com> Co-authored-by: Dan Lee <71398022+dandhlee@users.noreply.github.com> Co-authored-by: Karl Weinmeister <11586922+kweinmeister@users.noreply.github.com>
1 parent 755e8bb commit 81582c4

6 files changed

Lines changed: 189 additions & 0 deletions

File tree

bigquery/remote-function/README.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,11 @@ Samples
6767
-------------------------------------------------------------------------------
6868

6969
- `Vision`_: this sample can detect and extract objects from input images.
70+
- `Document`_: this sample can extract text from input documents.
7071

7172

7273
.. _Vision: vision/
74+
.. _Document: document/
7375

7476
The client library
7577
-------------------------------------------------------------------------------
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# [START bigquery_remote_function_document]
16+
import urllib.request
17+
18+
import flask
19+
import functions_framework
20+
from google.api_core.client_options import ClientOptions
21+
from google.cloud import documentai
22+
23+
_PROJECT_ID = "YOUR_PROJECT_ID"
24+
_LOCATION = "us"  # Processor location, also used to build the API endpoint; change to "eu" if your processor is located there.
25+
_PROCESSOR_ID = "YOUR_PROCESSOR_ID"
26+
27+
28+
@functions_framework.http
def document_ocr(request: flask.Request) -> flask.Response:
    """BigQuery remote function to process documents using Document AI OCR.

    For complete Document AI use cases:
    https://cloud.google.com/document-ai/docs/samples/documentai-process-ocr-document

    Args:
        request: HTTP request from BigQuery. Its JSON body must contain a
            'calls' list; each call is read as [document URL, MIME type].
            https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#input_format

    Returns:
        HTTP response to BigQuery. On success the JSON body is
        {'replies': [{'text': ...}, ...]} with one reply per call, in order.
        If any step raises, the JSON body is {'errorMessage': ...} and the
        status code is 400.
        https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#output_format
    """
    try:
        client = documentai.DocumentProcessorServiceClient(
            client_options=ClientOptions(
                api_endpoint=f"{_LOCATION}-documentai.googleapis.com"))
        processor_name = client.processor_path(
            _PROJECT_ID, _LOCATION, _PROCESSOR_ID)
        calls = request.get_json()['calls']
        replies = []
        for call in calls:
            # NOTE(review): the URL is taken from the request as-is and
            # urlopen also accepts non-HTTP schemes (e.g. file:). Validate it
            # here if callers of the remote function are not fully trusted.
            response = urllib.request.urlopen(call[0])
            try:
                content = response.read()
            finally:
                # Always release the connection, even if the read fails, so
                # a large batch does not leak one open socket per document.
                response.close()
            content_type = call[1]
            results = client.process_document(
                {'name': processor_name, 'raw_document': {
                    'content': content, 'mime_type': content_type}})
            replies.append({'text': results.document.text})
        return flask.make_response(flask.jsonify({'replies': replies}))
    except Exception as e:  # Check error message if GoogleAPIException
        # Top-level boundary: any failure in the batch is reported back to
        # BigQuery as a single error message instead of an unhandled crash.
        return flask.make_response(flask.jsonify({'errorMessage': str(e)}), 400)
61+
# [END bigquery_remote_function_document]
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from unittest import mock
16+
17+
import flask
18+
from google.cloud import documentai
19+
import pytest
20+
21+
import document_function
22+
23+
# Request body passed to the function under test: a 'calls' list where each
# call is [document URL, MIME type].
_BIGQUERY_REQUEST_JSON = {
    'calls': [
        ['https://storage.googleapis.com/bucket/apple', 'application/pdf'],
        ['https://storage.googleapis.com/bucket/banana', 'application/pdf'],
    ],
}
# Response body expected back: one reply per call, in the same order.
_BIGQUERY_RESPONSE_JSON = {
    'replies': [
        {'text': 'apple'},
        {'text': 'banana'},
    ],
}
39+
40+
41+
# A throwaway Flask application; the tests only need it to generate request
# contexts.
@pytest.fixture(scope="module")
def app() -> flask.Flask:
    """Return a bare Flask app used to build test request contexts."""
    fake_app = flask.Flask(__name__)
    return fake_app
45+
46+
47+
@mock.patch('document_function.urllib.request')
@mock.patch('document_function.documentai')
def test_document_function(
    mock_documentai: mock.MagicMock,
    mock_request: mock.MagicMock,
    app: flask.Flask,
) -> None:
    """Happy path: every call is downloaded, sent to Document AI and answered."""
    # read() has to be configured on the object urlopen *returns*; setting it
    # on the urlopen mock itself would leave urlopen(url).read() returning a
    # bare Mock instead of the payload. MagicMock is used so the fake response
    # works both for a direct .read() and as a context manager.
    fake_response = mock.MagicMock()
    fake_response.read.return_value = b'filedata'
    fake_response.__enter__.return_value = fake_response
    mock_request.urlopen = mock.Mock(return_value=fake_response)

    process_document_mock = mock.Mock(side_effect=[
        documentai.ProcessResponse(
            {'document': {'text': 'apple'}}),
        documentai.ProcessResponse(
            {'document': {'text': 'banana'}})])
    mock_documentai.DocumentProcessorServiceClient = mock.Mock(
        return_value=mock.Mock(process_document=process_document_mock))

    with app.test_request_context(json=_BIGQUERY_REQUEST_JSON):
        response = document_function.document_ocr(flask.request)

    assert response.status_code == 200
    assert response.get_json() == _BIGQUERY_RESPONSE_JSON
    # Every URL in the request must have been fetched, in order.
    assert mock_request.urlopen.call_args_list == [
        mock.call(url) for url, _ in _BIGQUERY_REQUEST_JSON['calls']]
    # The downloaded bytes and MIME type must be what reaches Document AI.
    sent_documents = [
        args[0]['raw_document']
        for args, _ in process_document_mock.call_args_list]
    assert sent_documents == [
        {'content': b'filedata', 'mime_type': 'application/pdf'},
        {'content': b'filedata', 'mime_type': 'application/pdf'},
    ]
66+
67+
68+
@mock.patch('document_function.urllib.request')
@mock.patch('document_function.documentai')
def test_document_function_error(
    mock_documentai: mock.MagicMock,
    mock_request: mock.MagicMock,
    app: flask.Flask,
) -> None:
    """Error path: a Document AI failure is returned as a 400 with its message."""
    # read() has to be configured on the object urlopen *returns*; setting it
    # on the urlopen mock itself would leave urlopen(url).read() returning a
    # bare Mock instead of the payload. MagicMock is used so the fake response
    # works both for a direct .read() and as a context manager.
    fake_response = mock.MagicMock()
    fake_response.read.return_value = b'filedata'
    fake_response.__enter__.return_value = fake_response
    mock_request.urlopen = mock.Mock(return_value=fake_response)

    process_document_mock = mock.Mock(side_effect=Exception('API error'))
    mock_documentai.DocumentProcessorServiceClient = mock.Mock(
        return_value=mock.Mock(process_document=process_document_mock))

    with app.test_request_context(json=_BIGQUERY_REQUEST_JSON):
        response = document_function.document_ocr(flask.request)

    assert response.status_code == 400
    assert 'API error' in str(response.get_data())
    # The failure must come from the Document AI call, not from the download.
    assert process_document_mock.call_count == 1
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# TEST_CONFIG_OVERRIDE copied from the source of truth:
16+
# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/main/noxfile_config.py
17+
18+
TEST_CONFIG_OVERRIDE = {
    # Python versions for which the tests are opted out.
    "ignored_versions": ["2.7", "3.6"],
    # Type hints are enforced here; only old samples are opted out of this,
    # and all new samples should feature them.
    "enforce_type_hints": True,
    # Name of the environment variable that determines the project id.
    # Use 'BUILD_SPECIFIC_GCLOUD_PROJECT' to opt in to a build specific
    # Cloud project, or any other string to point at your own Cloud project.
    "gcloud_project_env": "GOOGLE_CLOUD_PROJECT",
    # To use a specific version of pip, set this to the string
    # representation of the version number, for example "20.2.4".
    "pip_version_override": None,
    # Values to inject into the test environment; they override predefined
    # values. Don't put any secrets here.
    "envs": {},
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Flask==2.2.2
2+
functions-framework==3.3.0
3+
google-cloud-documentai==2.11.0
4+
pytest==7.2.0
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Flask==2.2.2
2+
functions-framework==3.3.0
3+
google-cloud-documentai==2.11.0

0 commit comments

Comments
 (0)