forked from googleapis/python-bigquery-dataframes
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsql.py
More file actions
160 lines (131 loc) · 5.17 KB
/
sql.py
File metadata and controls
160 lines (131 loc) · 5.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
"""
Utility functions for SQL construction.
"""
import datetime
import math
from typing import Iterable, Mapping, TYPE_CHECKING, Union
import bigframes.core.compile.googlesql as googlesql
if TYPE_CHECKING:
import google.cloud.bigquery as bigquery
import bigframes.core.ordering
### Writing SQL Values (literals, column references, table references, etc.)
def simple_literal(value: str | int | bool | float | datetime.datetime):
"""Return quoted input string."""
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals
if isinstance(value, str):
# Single quoting seems to work nicer with ibis than double quoting
return f"'{googlesql._escape_chars(value)}'"
elif isinstance(value, (bool, int)):
return str(value)
elif isinstance(value, float):
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#floating_point_literals
if math.isnan(value):
return 'CAST("nan" as FLOAT)'
if value == math.inf:
return 'CAST("+inf" as FLOAT)'
if value == -math.inf:
return 'CAST("-inf" as FLOAT)'
return str(value)
if isinstance(value, datetime.datetime):
return f"TIMESTAMP('{value.isoformat()}')"
else:
raise ValueError(f"Cannot produce literal for {value}")
def multi_literal(*values: str):
literal_strings = [simple_literal(i) for i in values]
return "(" + ", ".join(literal_strings) + ")"
def cast_as_string(column_name: str) -> str:
"""Return a string representing string casting of a column."""
return googlesql.Cast(
googlesql.ColumnExpression(column_name), googlesql.DataType.STRING
).sql()
def to_json_string(column_name: str) -> str:
"""Return a string representing JSON version of a column."""
return f"TO_JSON_STRING({googlesql.identifier(column_name)})"
def csv(values: Iterable[str]) -> str:
"""Return a string of comma separated values."""
return ", ".join(values)
def infix_op(opname: str, left_arg: str, right_arg: str):
# Maybe should add parentheses??
return f"{left_arg} {opname} {right_arg}"
def is_distinct_sql(columns: Iterable[str], table_ref: bigquery.TableReference) -> str:
is_unique_sql = f"""WITH full_table AS (
{googlesql.Select().from_(table_ref).select(columns).sql()}
),
distinct_table AS (
{googlesql.Select().from_(table_ref).select(columns, distinct=True).sql()}
)
SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`,
(SELECT COUNT(*) FROM distinct_table) AS `distinct_count`
"""
return is_unique_sql
def ordering_clause(
ordering: Iterable[bigframes.core.ordering.OrderingExpression],
) -> str:
import bigframes.core.expression
parts = []
for col_ref in ordering:
asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC"
null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST"
ordering_expr = col_ref.scalar_expression
# We don't know how to compile scalar expressions in isolation
if ordering_expr.is_const:
# Probably shouldn't have constants in ordering definition, but best to ignore if somehow they end up here.
continue
assert isinstance(ordering_expr, bigframes.core.expression.DerefOp)
part = f"`{ordering_expr.id.sql}` {asc_desc} {null_clause}"
parts.append(part)
return f"ORDER BY {' ,'.join(parts)}"
def create_vector_search_sql(
sql_string: str,
options: Mapping[str, Union[str | int | bool | float]] = {},
) -> str:
"""Encode the VECTOR SEARCH statement for BigQuery Vector Search."""
base_table = options["base_table"]
column_to_search = options["column_to_search"]
distance_type = options["distance_type"]
top_k = options["top_k"]
query_column_to_search = options.get("query_column_to_search", None)
if query_column_to_search is not None:
query_str = f"""
SELECT
query.*,
base.*,
distance,
FROM VECTOR_SEARCH(
TABLE `{base_table}`,
{simple_literal(column_to_search)},
({sql_string}),
{simple_literal(query_column_to_search)},
distance_type => {simple_literal(distance_type)},
top_k => {simple_literal(top_k)}
)
"""
else:
query_str = f"""
SELECT
query.*,
base.*,
distance,
FROM VECTOR_SEARCH(
TABLE `{base_table}`,
{simple_literal(column_to_search)},
({sql_string}),
distance_type => {simple_literal(distance_type)},
top_k => {simple_literal(top_k)}
)
"""
return query_str