From 9b4e5770ab21ad54f86e8c7dd5be62f82a5cf8a3 Mon Sep 17 00:00:00 2001 From: Madhavendra Rathore Date: Fri, 2 Jan 2026 01:18:14 +0530 Subject: [PATCH 01/24] Fixed the exception handler close() on _TelemetryClientHolder (#723) Fixed the exception handler calls close() on _TelemetryClientHolder objects instead of accessing the client inside them. --- src/databricks/sql/telemetry/telemetry_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/sql/telemetry/telemetry_client.py b/src/databricks/sql/telemetry/telemetry_client.py index 77d1a2f9c..9a38776b1 100644 --- a/src/databricks/sql/telemetry/telemetry_client.py +++ b/src/databricks/sql/telemetry/telemetry_client.py @@ -542,8 +542,8 @@ def _handle_unhandled_exception(cls, exc_type, exc_value, exc_traceback): logger.debug("Handling unhandled exception: %s", exc_type.__name__) clients_to_close = list(cls._clients.values()) - for client in clients_to_close: - client.close() + for holder in clients_to_close: + holder.client.close() # Call the original exception handler to maintain normal behavior if cls._original_excepthook: From 946a265a3d9a1a86690a795b9d4e388e36be3bff Mon Sep 17 00:00:00 2001 From: nikhilsuri-db Date: Mon, 5 Jan 2026 13:17:03 +0530 Subject: [PATCH 02/24] created util method to normalise http protocol in http path (#724) * created util method to normalise http protocol in http path Signed-off-by: Nikhil Suri * Added impacted files using util method Signed-off-by: Nikhil Suri * Fixed linting issues Signed-off-by: Nikhil Suri * fixed broken test with mock host string Signed-off-by: Nikhil Suri * mocked http client Signed-off-by: Nikhil Suri * made case sensitive check in url utils Signed-off-by: Nikhil Suri * linting issue resolved Signed-off-by: Nikhil Suri * removed unnecessary md files Signed-off-by: Nikhil Suri * made test readbale Signed-off-by: Nikhil Suri * changes done in auth util as well as sea http Signed-off-by: Nikhil Suri --------- Signed-off-by: Nikhil Suri --- src/databricks/sql/auth/auth_utils.py | 17 ---- src/databricks/sql/auth/token_federation.py | 6 +- .../sql/backend/sea/utils/http_client.py | 6 +- src/databricks/sql/common/feature_flag.py | 4 +- src/databricks/sql/common/url_utils.py | 45 +++++++++++ .../sql/telemetry/telemetry_client.py | 3 +- tests/e2e/test_circuit_breaker.py | 79 ++++++++++++++----- tests/unit/test_client.py | 36 ++++++++- tests/unit/test_sea_http_client.py | 34 ++++++++ tests/unit/test_token_federation.py | 24 +++--- tests/unit/test_url_utils.py | 41 ++++++++++ 11 files changed, 234 insertions(+), 61 deletions(-) create mode 100644 src/databricks/sql/common/url_utils.py create mode 100644 tests/unit/test_url_utils.py diff --git a/src/databricks/sql/auth/auth_utils.py b/src/databricks/sql/auth/auth_utils.py index 439aabc51..a21ce843b 100644 --- a/src/databricks/sql/auth/auth_utils.py +++ b/src/databricks/sql/auth/auth_utils.py @@ -7,23 +7,6 @@ logger = logging.getLogger(__name__) -def parse_hostname(hostname: str) -> str: - """ - Normalize the hostname to include scheme and trailing slash. - - Args: - hostname: The hostname to normalize - - Returns: - Normalized hostname with scheme and trailing slash - """ - if not hostname.startswith("http://") and not hostname.startswith("https://"): - hostname = f"https://{hostname}" - if not hostname.endswith("/"): - hostname = f"{hostname}/" - return hostname - - def decode_token(access_token: str) -> Optional[Dict]: """ Decode a JWT token without verification to extract claims. 
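The net effect of replacing parse_hostname with normalize_host_with_protocol (added in common/url_utils.py later in this patch): the old helper guaranteed a trailing slash, while the new one strips trailing slashes and only prepends a scheme when missing, which is why call sites such as _exchange_token drop their rstrip('/'). A minimal usage sketch, assuming only the import introduced by this patch; the endpoint path is a placeholder, not a real constant:

    from databricks.sql.common.url_utils import normalize_host_with_protocol

    host = "test.databricks.com/"

    # Removed parse_hostname(host) returned "https://test.databricks.com/" (trailing slash kept),
    # so callers had to rstrip("/") before appending an endpoint path.
    normalized = normalize_host_with_protocol(host)   # "https://test.databricks.com"

    endpoint = "/example/endpoint"                    # placeholder path for illustration only
    url = f"{normalized}{endpoint}"                   # "https://test.databricks.com/example/endpoint"
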
diff --git a/src/databricks/sql/auth/token_federation.py b/src/databricks/sql/auth/token_federation.py index 7b62f6762..f75b904fb 100644 --- a/src/databricks/sql/auth/token_federation.py +++ b/src/databricks/sql/auth/token_federation.py @@ -6,10 +6,10 @@ from databricks.sql.auth.authenticators import AuthProvider from databricks.sql.auth.auth_utils import ( - parse_hostname, decode_token, is_same_host, ) +from databricks.sql.common.url_utils import normalize_host_with_protocol from databricks.sql.common.http import HttpMethod logger = logging.getLogger(__name__) @@ -99,7 +99,7 @@ def __init__( if not http_client: raise ValueError("http_client is required for TokenFederationProvider") - self.hostname = parse_hostname(hostname) + self.hostname = normalize_host_with_protocol(hostname) self.external_provider = external_provider self.http_client = http_client self.identity_federation_client_id = identity_federation_client_id @@ -164,7 +164,7 @@ def _should_exchange_token(self, access_token: str) -> bool: def _exchange_token(self, access_token: str) -> Token: """Exchange the external token for a Databricks token.""" - token_url = f"{self.hostname.rstrip('/')}{self.TOKEN_EXCHANGE_ENDPOINT}" + token_url = f"{self.hostname}{self.TOKEN_EXCHANGE_ENDPOINT}" data = { "grant_type": self.TOKEN_EXCHANGE_GRANT_TYPE, diff --git a/src/databricks/sql/backend/sea/utils/http_client.py b/src/databricks/sql/backend/sea/utils/http_client.py index b47f2add2..caefe9929 100644 --- a/src/databricks/sql/backend/sea/utils/http_client.py +++ b/src/databricks/sql/backend/sea/utils/http_client.py @@ -18,6 +18,7 @@ from databricks.sql.common.http_utils import ( detect_and_parse_proxy, ) +from databricks.sql.common.url_utils import normalize_host_with_protocol logger = logging.getLogger(__name__) @@ -66,8 +67,9 @@ def __init__( self.auth_provider = auth_provider self.ssl_options = ssl_options - # Build base URL - self.base_url = f"https://{server_hostname}:{self.port}" + # Build base URL using url_utils for consistent normalization + normalized_host = normalize_host_with_protocol(server_hostname) + self.base_url = f"{normalized_host}:{self.port}" # Parse URL for proxy handling parsed_url = urllib.parse.urlparse(self.base_url) diff --git a/src/databricks/sql/common/feature_flag.py b/src/databricks/sql/common/feature_flag.py index 032701f63..36e4b8a02 100644 --- a/src/databricks/sql/common/feature_flag.py +++ b/src/databricks/sql/common/feature_flag.py @@ -6,6 +6,7 @@ from typing import Dict, Optional, List, Any, TYPE_CHECKING from databricks.sql.common.http import HttpMethod +from databricks.sql.common.url_utils import normalize_host_with_protocol if TYPE_CHECKING: from databricks.sql.client import Connection @@ -67,7 +68,8 @@ def __init__( endpoint_suffix = FEATURE_FLAGS_ENDPOINT_SUFFIX_FORMAT.format(__version__) self._feature_flag_endpoint = ( - f"https://{self._connection.session.host}{endpoint_suffix}" + normalize_host_with_protocol(self._connection.session.host) + + endpoint_suffix ) # Use the provided HTTP client diff --git a/src/databricks/sql/common/url_utils.py b/src/databricks/sql/common/url_utils.py new file mode 100644 index 000000000..1c4d10369 --- /dev/null +++ b/src/databricks/sql/common/url_utils.py @@ -0,0 +1,45 @@ +""" +URL utility functions for the Databricks SQL connector. +""" + + +def normalize_host_with_protocol(host: str) -> str: + """ + Normalize a connection hostname by ensuring it has a protocol. 
+ + This is useful for handling cases where users may provide hostnames with or without protocols + (common with dbt-databricks users copying URLs from their browser). + + Args: + host: Connection hostname which may or may not include a protocol prefix (https:// or http://) + and may or may not have a trailing slash + + Returns: + Normalized hostname with protocol prefix and no trailing slashes + + Examples: + normalize_host_with_protocol("myserver.com") -> "https://myserver.com" + normalize_host_with_protocol("https://myserver.com") -> "https://myserver.com" + normalize_host_with_protocol("HTTPS://myserver.com/") -> "https://myserver.com" + normalize_host_with_protocol("http://localhost:8080/") -> "http://localhost:8080" + + Raises: + ValueError: If host is None or empty string + """ + # Handle None or empty host + if not host or not host.strip(): + raise ValueError("Host cannot be None or empty") + + # Remove trailing slashes + host = host.rstrip("/") + + # Add protocol if not present (case-insensitive check) + host_lower = host.lower() + if not host_lower.startswith("https://") and not host_lower.startswith("http://"): + host = f"https://{host}" + elif host_lower.startswith("https://") or host_lower.startswith("http://"): + # Normalize protocol to lowercase + protocol_end = host.index("://") + 3 + host = host[:protocol_end].lower() + host[protocol_end:] + + return host diff --git a/src/databricks/sql/telemetry/telemetry_client.py b/src/databricks/sql/telemetry/telemetry_client.py index 9a38776b1..523fcc1dc 100644 --- a/src/databricks/sql/telemetry/telemetry_client.py +++ b/src/databricks/sql/telemetry/telemetry_client.py @@ -47,6 +47,7 @@ TelemetryPushClient, CircuitBreakerTelemetryPushClient, ) +from databricks.sql.common.url_utils import normalize_host_with_protocol if TYPE_CHECKING: from databricks.sql.client import Connection @@ -278,7 +279,7 @@ def _send_telemetry(self, events): if self._auth_provider else self.TELEMETRY_UNAUTHENTICATED_PATH ) - url = f"https://{self._host_url}{path}" + url = normalize_host_with_protocol(self._host_url) + path headers = {"Accept": "application/json", "Content-Type": "application/json"} diff --git a/tests/e2e/test_circuit_breaker.py b/tests/e2e/test_circuit_breaker.py index 45c494d19..f9df0f377 100644 --- a/tests/e2e/test_circuit_breaker.py +++ b/tests/e2e/test_circuit_breaker.py @@ -23,6 +23,34 @@ from databricks.sql.telemetry.circuit_breaker_manager import CircuitBreakerManager +def wait_for_circuit_state(circuit_breaker, expected_states, timeout=5): + """ + Wait for circuit breaker to reach one of the expected states with polling. 
+ + Args: + circuit_breaker: The circuit breaker instance to monitor + expected_states: List of acceptable states + (STATE_OPEN, STATE_CLOSED, STATE_HALF_OPEN) + timeout: Maximum time to wait in seconds + + Returns: + True if state reached, False if timeout + + Examples: + # Single state - pass list with one element + wait_for_circuit_state(cb, [STATE_OPEN]) + + # Multiple states + wait_for_circuit_state(cb, [STATE_CLOSED, STATE_HALF_OPEN]) + """ + start = time.time() + while time.time() - start < timeout: + if circuit_breaker.current_state in expected_states: + return True + time.sleep(0.1) # Poll every 100ms + return False + + @pytest.fixture(autouse=True) def aggressive_circuit_breaker_config(): """ @@ -65,12 +93,17 @@ def create_mock_response(self, status_code): }.get(status_code, b"Response") return response - @pytest.mark.parametrize("status_code,should_trigger", [ - (429, True), - (503, True), - (500, False), - ]) - def test_circuit_breaker_triggers_for_rate_limit_codes(self, status_code, should_trigger): + @pytest.mark.parametrize( + "status_code,should_trigger", + [ + (429, True), + (503, True), + (500, False), + ], + ) + def test_circuit_breaker_triggers_for_rate_limit_codes( + self, status_code, should_trigger + ): """ Verify circuit breaker opens for rate-limit codes (429/503) but not others (500). """ @@ -107,9 +140,14 @@ def mock_request(*args, **kwargs): time.sleep(0.5) if should_trigger: - # Circuit should be OPEN after 2 rate-limit failures + # Wait for circuit to open (async telemetry may take time) + assert wait_for_circuit_state( + circuit_breaker, [STATE_OPEN], timeout=5 + ), f"Circuit didn't open within 5s, state: {circuit_breaker.current_state}" + + # Circuit should be OPEN after rate-limit failures assert circuit_breaker.current_state == STATE_OPEN - assert circuit_breaker.fail_counter == 2 + assert circuit_breaker.fail_counter >= 2 # At least 2 failures # Track requests before another query requests_before = request_count["count"] @@ -197,7 +235,10 @@ def mock_conditional_request(*args, **kwargs): cursor.fetchone() time.sleep(2) - assert circuit_breaker.current_state == STATE_OPEN + # Wait for circuit to open + assert wait_for_circuit_state( + circuit_breaker, [STATE_OPEN], timeout=5 + ), f"Circuit didn't open, state: {circuit_breaker.current_state}" # Wait for reset timeout (5 seconds in test) time.sleep(6) @@ -208,24 +249,20 @@ def mock_conditional_request(*args, **kwargs): # Execute query to trigger HALF_OPEN state cursor.execute("SELECT 3") cursor.fetchone() - time.sleep(1) - # Circuit should be recovering - assert circuit_breaker.current_state in [ - STATE_HALF_OPEN, - STATE_CLOSED, - ], f"Circuit should be recovering, but is {circuit_breaker.current_state}" + # Wait for circuit to start recovering + assert wait_for_circuit_state( + circuit_breaker, [STATE_HALF_OPEN, STATE_CLOSED], timeout=5 + ), f"Circuit didn't recover, state: {circuit_breaker.current_state}" # Execute more queries to fully recover cursor.execute("SELECT 4") cursor.fetchone() - time.sleep(1) - current_state = circuit_breaker.current_state - assert current_state in [ - STATE_CLOSED, - STATE_HALF_OPEN, - ], f"Circuit should recover to CLOSED or HALF_OPEN, got {current_state}" + # Wait for full recovery + assert wait_for_circuit_state( + circuit_breaker, [STATE_CLOSED, STATE_HALF_OPEN], timeout=5 + ), f"Circuit didn't fully recover, state: {circuit_breaker.current_state}" if __name__ == "__main__": diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 8f8a97eae..5b6991931 
100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -646,13 +646,31 @@ class TransactionTestSuite(unittest.TestCase): "access_token": "tok", } + def _setup_mock_session_with_http_client(self, mock_session): + """ + Helper to configure a mock session with HTTP client mocks. + This prevents feature flag network requests during Connection initialization. + """ + mock_session.host = "foo" + + # Mock HTTP client to prevent feature flag network requests + mock_http_client = Mock() + mock_session.http_client = mock_http_client + + # Mock feature flag response to prevent blocking HTTP calls + mock_ff_response = Mock() + mock_ff_response.status = 200 + mock_ff_response.data = b'{"flags": [], "ttl_seconds": 900}' + mock_http_client.request.return_value = mock_ff_response + def _create_mock_connection(self, mock_session_class): """Helper to create a mocked connection for transaction tests.""" - # Mock session mock_session = Mock() mock_session.is_open = True mock_session.guid_hex = "test-session-id" mock_session.get_autocommit.return_value = True + + self._setup_mock_session_with_http_client(mock_session) mock_session_class.return_value = mock_session # Create connection with ignore_transactions=False to test actual transaction functionality @@ -736,9 +754,7 @@ def test_autocommit_setter_preserves_exception_chain(self, mock_session_class): conn = self._create_mock_connection(mock_session_class) mock_cursor = Mock() - original_error = DatabaseError( - "Original error", host_url="test-host" - ) + original_error = DatabaseError("Original error", host_url="test-host") mock_cursor.execute.side_effect = original_error with patch.object(conn, "cursor", return_value=mock_cursor): @@ -927,6 +943,8 @@ def test_fetch_autocommit_from_server_queries_server(self, mock_session_class): mock_session = Mock() mock_session.is_open = True mock_session.guid_hex = "test-session-id" + + self._setup_mock_session_with_http_client(mock_session) mock_session_class.return_value = mock_session conn = client.Connection( @@ -959,6 +977,8 @@ def test_fetch_autocommit_from_server_handles_false_value(self, mock_session_cla mock_session = Mock() mock_session.is_open = True mock_session.guid_hex = "test-session-id" + + self._setup_mock_session_with_http_client(mock_session) mock_session_class.return_value = mock_session conn = client.Connection( @@ -986,6 +1006,8 @@ def test_fetch_autocommit_from_server_raises_on_no_result(self, mock_session_cla mock_session = Mock() mock_session.is_open = True mock_session.guid_hex = "test-session-id" + + self._setup_mock_session_with_http_client(mock_session) mock_session_class.return_value = mock_session conn = client.Connection( @@ -1015,6 +1037,8 @@ def test_commit_is_noop_when_ignore_transactions_true(self, mock_session_class): mock_session = Mock() mock_session.is_open = True mock_session.guid_hex = "test-session-id" + + self._setup_mock_session_with_http_client(mock_session) mock_session_class.return_value = mock_session # Create connection with ignore_transactions=True (default) @@ -1043,6 +1067,8 @@ def test_rollback_raises_not_supported_when_ignore_transactions_true( mock_session = Mock() mock_session.is_open = True mock_session.guid_hex = "test-session-id" + + self._setup_mock_session_with_http_client(mock_session) mock_session_class.return_value = mock_session # Create connection with ignore_transactions=True (default) @@ -1068,6 +1094,8 @@ def test_autocommit_setter_is_noop_when_ignore_transactions_true( mock_session = Mock() mock_session.is_open = True 
mock_session.guid_hex = "test-session-id" + + self._setup_mock_session_with_http_client(mock_session) mock_session_class.return_value = mock_session # Create connection with ignore_transactions=True (default) diff --git a/tests/unit/test_sea_http_client.py b/tests/unit/test_sea_http_client.py index 39ecb58a7..5100a5cb0 100644 --- a/tests/unit/test_sea_http_client.py +++ b/tests/unit/test_sea_http_client.py @@ -44,6 +44,40 @@ def sea_http_client(self, mock_auth_provider, ssl_options): client._pool = Mock() return client + @pytest.mark.parametrize( + "server_hostname,port,expected_base_url", + [ + # Basic hostname without protocol + ("myserver.com", 443, "https://myserver.com:443"), + # Hostname with trailing slash + ("myserver.com/", 443, "https://myserver.com:443"), + # Hostname with https:// protocol + ("https://myserver.com", 443, "https://myserver.com:443"), + # Hostname with http:// protocol (preserved as-is) + ("http://myserver.com", 443, "http://myserver.com:443"), + # Hostname with protocol and trailing slash + ("https://myserver.com/", 443, "https://myserver.com:443"), + # Custom port + ("myserver.com", 8080, "https://myserver.com:8080"), + # Protocol with custom port + ("https://myserver.com", 8080, "https://myserver.com:8080"), + ], + ) + def test_base_url_construction( + self, server_hostname, port, expected_base_url, mock_auth_provider, ssl_options + ): + """Test that base_url is constructed correctly from various hostname inputs.""" + with patch("databricks.sql.backend.sea.utils.http_client.HTTPSConnectionPool"): + client = SeaHttpClient( + server_hostname=server_hostname, + port=port, + http_path="/sql/1.0/warehouses/test", + http_headers=[], + auth_provider=mock_auth_provider, + ssl_options=ssl_options, + ) + assert client.base_url == expected_base_url + def test_get_command_type_from_path(self, sea_http_client): """Test the _get_command_type_from_path method with various paths and methods.""" # Test statement execution diff --git a/tests/unit/test_token_federation.py b/tests/unit/test_token_federation.py index 2e671c33e..9c209e894 100644 --- a/tests/unit/test_token_federation.py +++ b/tests/unit/test_token_federation.py @@ -6,10 +6,10 @@ from databricks.sql.auth.token_federation import TokenFederationProvider, Token from databricks.sql.auth.auth_utils import ( - parse_hostname, decode_token, is_same_host, ) +from databricks.sql.common.url_utils import normalize_host_with_protocol from databricks.sql.common.http import HttpMethod @@ -78,10 +78,10 @@ def test_init_requires_http_client(self, mock_external_provider): @pytest.mark.parametrize( "input_hostname,expected", [ - ("test.databricks.com", "https://test.databricks.com/"), - ("https://test.databricks.com", "https://test.databricks.com/"), - ("https://test.databricks.com/", "https://test.databricks.com/"), - ("test.databricks.com/", "https://test.databricks.com/"), + ("test.databricks.com", "https://test.databricks.com"), + ("https://test.databricks.com", "https://test.databricks.com"), + ("https://test.databricks.com/", "https://test.databricks.com"), + ("test.databricks.com/", "https://test.databricks.com"), ], ) def test_hostname_normalization( @@ -305,15 +305,15 @@ class TestUtilityFunctions: @pytest.mark.parametrize( "input_hostname,expected", [ - ("test.databricks.com", "https://test.databricks.com/"), - ("https://test.databricks.com", "https://test.databricks.com/"), - ("https://test.databricks.com/", "https://test.databricks.com/"), - ("test.databricks.com/", "https://test.databricks.com/"), + 
("test.databricks.com", "https://test.databricks.com"), + ("https://test.databricks.com", "https://test.databricks.com"), + ("https://test.databricks.com/", "https://test.databricks.com"), + ("test.databricks.com/", "https://test.databricks.com"), ], ) - def test_parse_hostname(self, input_hostname, expected): - """Test hostname parsing.""" - assert parse_hostname(input_hostname) == expected + def test_normalize_hostname(self, input_hostname, expected): + """Test hostname normalization.""" + assert normalize_host_with_protocol(input_hostname) == expected @pytest.mark.parametrize( "url1,url2,expected", diff --git a/tests/unit/test_url_utils.py b/tests/unit/test_url_utils.py new file mode 100644 index 000000000..95f42408d --- /dev/null +++ b/tests/unit/test_url_utils.py @@ -0,0 +1,41 @@ +"""Tests for URL utility functions.""" +import pytest +from databricks.sql.common.url_utils import normalize_host_with_protocol + + +class TestNormalizeHostWithProtocol: + """Tests for normalize_host_with_protocol function.""" + + @pytest.mark.parametrize( + "input_url,expected_output", + [ + ("myserver.com", "https://myserver.com"), # Add https:// + ("https://myserver.com", "https://myserver.com"), # No duplicate + ("http://localhost:8080", "http://localhost:8080"), # Preserve http:// + ("myserver.com:443", "https://myserver.com:443"), # With port + ("myserver.com/", "https://myserver.com"), # Remove trailing slash + ("https://myserver.com///", "https://myserver.com"), # Multiple slashes + ("HTTPS://MyServer.COM", "https://MyServer.COM"), # Case handling + ], + ) + def test_normalize_host_with_protocol(self, input_url, expected_output): + """Test host normalization with various input formats.""" + result = normalize_host_with_protocol(input_url) + assert result == expected_output + + # Additional assertions + assert result.startswith("https://") or result.startswith("http://") + assert not result.endswith("/") + + @pytest.mark.parametrize( + "invalid_host", + [ + None, + "", + " ", # Whitespace only + ], + ) + def test_normalize_host_with_protocol_raises_on_invalid_input(self, invalid_host): + """Test that function raises ValueError for None or empty host.""" + with pytest.raises(ValueError, match="Host cannot be None or empty"): + normalize_host_with_protocol(invalid_host) From 03eb369882cfda15f98c653ec6c9654cee3220c9 Mon Sep 17 00:00:00 2001 From: Samikshya Chand <148681192+samikshya-db@users.noreply.github.com> Date: Thu, 8 Jan 2026 15:20:03 +0530 Subject: [PATCH 03/24] New minor version release 4.2.4 (#725) New minor version release --- CHANGELOG.md | 4 ++++ pyproject.toml | 2 +- src/databricks/sql/__init__.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 99e6ce839..41105b072 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Release History +# 4.2.4 (2026-01-07) +- Fixed the exception handler close() on _TelemetryClientHolder (databricks/databricks-sql-python#723 by @msrathore-db) +- Created util method to normalise http protocol in http path (databricks/databricks-sql-python#724 by @nikhilsuri-db) + # 4.2.3 (2025-12-18) - added pandas < 2.4.0 support and tests for py 3.14 (databricks/databricks-sql-python#720 by @sreekanth-db) - pandas 2.3.3 support for py < 3.14 (databricks/databricks-sql-python#721 by @sreekanth-db) diff --git a/pyproject.toml b/pyproject.toml index 87312530b..8a635588c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "databricks-sql-connector" -version = "4.2.3" +version = 
"4.2.4" description = "Databricks SQL Connector for Python" authors = ["Databricks "] license = "Apache-2.0" diff --git a/src/databricks/sql/__init__.py b/src/databricks/sql/__init__.py index 49e1f9ee0..41784bc45 100644 --- a/src/databricks/sql/__init__.py +++ b/src/databricks/sql/__init__.py @@ -71,7 +71,7 @@ def __repr__(self): DATE = DBAPITypeObject("date") ROWID = DBAPITypeObject() -__version__ = "4.2.3" +__version__ = "4.2.4" USER_AGENT_NAME = "PyDatabricksSqlConnector" # These two functions are pyhive legacy From 4b7df5b0fd4da7e9caecbd8042c12e363c6d3d5f Mon Sep 17 00:00:00 2001 From: Sreekanth Date: Sat, 10 Jan 2026 01:39:01 +0530 Subject: [PATCH 04/24] [PECOBLR-1168] query tags telemetry (#716) * query tags telemetry Signed-off-by: Sreekanth Vadigi * code linting fix Signed-off-by: Sreekanth Vadigi --------- Signed-off-by: Sreekanth Vadigi --- src/databricks/sql/client.py | 2 ++ src/databricks/sql/telemetry/models/event.py | 2 ++ src/databricks/sql/utils.py | 15 +++++++++++++++ tests/unit/test_telemetry.py | 2 ++ 4 files changed, 21 insertions(+) diff --git a/src/databricks/sql/client.py b/src/databricks/sql/client.py index 1f17d54f2..a0215aae5 100755 --- a/src/databricks/sql/client.py +++ b/src/databricks/sql/client.py @@ -35,6 +35,7 @@ ColumnTable, ColumnQueue, build_client_context, + get_session_config_value, ) from databricks.sql.parameters.native import ( DbsqlParameterBase, @@ -386,6 +387,7 @@ def read(self) -> Optional[OAuthToken]: support_many_parameters=True, # Native parameters supported enable_complex_datatype_support=_use_arrow_native_complex_types, allowed_volume_ingestion_paths=self.staging_allowed_local_path, + query_tags=get_session_config_value(session_configuration, "query_tags"), ) self._telemetry_client.export_initial_telemetry_log( diff --git a/src/databricks/sql/telemetry/models/event.py b/src/databricks/sql/telemetry/models/event.py index 2e6f63a6f..4d5a45038 100644 --- a/src/databricks/sql/telemetry/models/event.py +++ b/src/databricks/sql/telemetry/models/event.py @@ -57,6 +57,7 @@ class DriverConnectionParameters(JsonSerializableMixin): support_many_parameters (bool): Whether many parameters are supported enable_complex_datatype_support (bool): Whether complex datatypes are supported allowed_volume_ingestion_paths (str): Allowed paths for volume ingestion + query_tags (str): Query tags for tracking and attribution """ http_path: str @@ -84,6 +85,7 @@ class DriverConnectionParameters(JsonSerializableMixin): support_many_parameters: Optional[bool] = None enable_complex_datatype_support: Optional[bool] = None allowed_volume_ingestion_paths: Optional[str] = None + query_tags: Optional[str] = None @dataclass diff --git a/src/databricks/sql/utils.py b/src/databricks/sql/utils.py index b46784b10..043183ac2 100644 --- a/src/databricks/sql/utils.py +++ b/src/databricks/sql/utils.py @@ -38,6 +38,21 @@ logger = logging.getLogger(__name__) +def get_session_config_value( + session_configuration: Optional[Dict[str, Any]], key: str +) -> Optional[str]: + """Get a session configuration value with case-insensitive key matching""" + if not session_configuration: + return None + + key_upper = key.upper() + for k, v in session_configuration.items(): + if k.upper() == key_upper: + return str(v) if v is not None else None + + return None + + class ResultSetQueue(ABC): @abstractmethod def next_n_rows(self, num_rows: int): diff --git a/tests/unit/test_telemetry.py b/tests/unit/test_telemetry.py index e9fa16649..86f06aa8a 100644 --- a/tests/unit/test_telemetry.py +++ 
b/tests/unit/test_telemetry.py @@ -530,6 +530,7 @@ def test_driver_connection_parameters_all_fields(self): support_many_parameters=True, enable_complex_datatype_support=True, allowed_volume_ingestion_paths="/Volumes/catalog/schema/volume", + query_tags="team:engineering,project:telemetry", ) # Serialize to JSON and parse back @@ -562,6 +563,7 @@ def test_driver_connection_parameters_all_fields(self): assert json_dict["support_many_parameters"] is True assert json_dict["enable_complex_datatype_support"] is True assert json_dict["allowed_volume_ingestion_paths"] == "/Volumes/catalog/schema/volume" + assert json_dict["query_tags"] == "team:engineering,project:telemetry" def test_driver_connection_parameters_minimal_fields(self): """Test DriverConnectionParameters with only required fields.""" From cafed60ffc8cc7eff94ceee63e7a8b47eee522ea Mon Sep 17 00:00:00 2001 From: Samikshya Chand <148681192+samikshya-db@users.noreply.github.com> Date: Thu, 5 Feb 2026 10:47:11 +0530 Subject: [PATCH 05/24] [ES-1717039] Fix 60 seconds delay in gov cloud connections + Fix PR check failures in the repo (#735) * Fix 60 seconds delay in gov cloud connections * keep it simple :) * Add fix for krb error * pin poetry * Pin for publish flow too * Fix failing tests * Edit order for pypi * One last fix : pls work --- .github/workflows/code-coverage.yml | 5 +++ .github/workflows/code-quality-checks.yml | 8 +++++ .github/workflows/daily-telemetry-e2e.yml | 5 +++ .github/workflows/integration.yml | 5 +++ .github/workflows/publish-manual.yml | 9 ++++++ .github/workflows/publish-test.yml | 23 ++++++++++---- .github/workflows/publish.yml | 18 ++++++++--- src/databricks/sql/auth/retry.py | 34 +++++---------------- tests/e2e/common/retry_test_mixins.py | 26 +++++++--------- tests/e2e/common/staging_ingestion_tests.py | 2 +- tests/e2e/common/uc_volume_tests.py | 2 +- tests/unit/test_retry.py | 12 ++++++++ 12 files changed, 94 insertions(+), 55 deletions(-) diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index 3c76be728..9cb68dbc9 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -36,6 +36,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: "2.2.1" virtualenvs-create: true virtualenvs-in-project: true installer-parallel: true @@ -58,6 +59,10 @@ jobs: #---------------------------------------------- # install your root project, if required #---------------------------------------------- + - name: Install Kerberos system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libkrb5-dev - name: Install library run: poetry install --no-interaction --all-extras #---------------------------------------------- diff --git a/.github/workflows/code-quality-checks.yml b/.github/workflows/code-quality-checks.yml index 3c368abef..cc3952920 100644 --- a/.github/workflows/code-quality-checks.yml +++ b/.github/workflows/code-quality-checks.yml @@ -35,6 +35,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: "2.2.1" virtualenvs-create: true virtualenvs-in-project: true installer-parallel: true @@ -118,6 +119,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: "2.2.1" virtualenvs-create: true virtualenvs-in-project: true installer-parallel: true @@ -140,6 +142,10 @@ jobs: #---------------------------------------------- # install your root project, if required #---------------------------------------------- + - name: Install Kerberos system dependencies + run: | + 
sudo apt-get update + sudo apt-get install -y libkrb5-dev - name: Install library run: poetry install --no-interaction --all-extras #---------------------------------------------- @@ -191,6 +197,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: "2.2.1" virtualenvs-create: true virtualenvs-in-project: true installer-parallel: true @@ -243,6 +250,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: "2.2.1" virtualenvs-create: true virtualenvs-in-project: true installer-parallel: true diff --git a/.github/workflows/daily-telemetry-e2e.yml b/.github/workflows/daily-telemetry-e2e.yml index 3d61cf177..d60b7f5a9 100644 --- a/.github/workflows/daily-telemetry-e2e.yml +++ b/.github/workflows/daily-telemetry-e2e.yml @@ -43,6 +43,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: "2.2.1" virtualenvs-create: true virtualenvs-in-project: true installer-parallel: true @@ -60,6 +61,10 @@ jobs: #---------------------------------------------- # install dependencies if cache does not exist #---------------------------------------------- + - name: Install Kerberos system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libkrb5-dev - name: Install dependencies run: poetry install --no-interaction --all-extras diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index ad5369997..7fd6d98f1 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -33,6 +33,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: "2.2.1" virtualenvs-create: true virtualenvs-in-project: true installer-parallel: true @@ -49,6 +50,10 @@ jobs: #---------------------------------------------- # install dependencies if cache does not exist #---------------------------------------------- + - name: Install Kerberos system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libkrb5-dev - name: Install dependencies run: poetry install --no-interaction --all-extras #---------------------------------------------- diff --git a/.github/workflows/publish-manual.yml b/.github/workflows/publish-manual.yml index ecad71a29..2f2a7a4dd 100644 --- a/.github/workflows/publish-manual.yml +++ b/.github/workflows/publish-manual.yml @@ -31,10 +31,19 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 # Install Poetry, the Python package manager with: + version: "2.2.1" virtualenvs-create: true virtualenvs-in-project: true installer-parallel: true + #---------------------------------------------- + # Step 3.5: Install Kerberos system dependencies + #---------------------------------------------- + - name: Install Kerberos system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libkrb5-dev + # #---------------------------------------------- # # Step 4: Load cached virtual environment (if available) # #---------------------------------------------- diff --git a/.github/workflows/publish-test.yml b/.github/workflows/publish-test.yml index 2e6359a78..ea4158934 100644 --- a/.github/workflows/publish-test.yml +++ b/.github/workflows/publish-test.yml @@ -21,6 +21,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: "2.2.1" virtualenvs-create: true virtualenvs-in-project: true installer-parallel: true @@ -36,6 +37,10 @@ jobs: #---------------------------------------------- # install dependencies if cache does not exist #---------------------------------------------- + - name: Install Kerberos system 
dependencies + run: | + sudo apt-get update + sudo apt-get install -y libkrb5-dev - name: Install dependencies if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' run: poetry install --no-interaction --no-root @@ -54,11 +59,17 @@ jobs: - name: Update pyproject.toml run: poetry version ${{ steps.version.outputs.major-version }}.${{ steps.version.outputs.minor-version }}.dev$(date +%s) #---------------------------------------------- + # Build the package (before publish action) + #---------------------------------------------- + - name: Build package + run: poetry build + #---------------------------------------------- + # Configure test-pypi repository + #---------------------------------------------- + - name: Configure test-pypi repository + run: poetry config repositories.testpypi https://test.pypi.org/legacy/ + #---------------------------------------------- # Attempt push to test-pypi #---------------------------------------------- - - name: Build and publish to pypi - uses: JRubics/poetry-publish@v1.10 - with: - pypi_token: ${{ secrets.TEST_PYPI_TOKEN }} - repository_name: "testpypi" - repository_url: "https://test.pypi.org/legacy/" + - name: Publish to test-pypi + run: poetry publish --username __token__ --password ${{ secrets.TEST_PYPI_TOKEN }} --repository testpypi diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index dde6cc2dc..b101f421c 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -23,6 +23,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: "2.2.1" virtualenvs-create: true virtualenvs-in-project: true installer-parallel: true @@ -38,6 +39,10 @@ jobs: #---------------------------------------------- # install dependencies if cache does not exist #---------------------------------------------- + - name: Install Kerberos system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libkrb5-dev - name: Install dependencies if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' run: poetry install --no-interaction --no-root @@ -56,9 +61,12 @@ jobs: - name: Update pyproject.toml run: poetry version ${{ steps.version.outputs.current-version }} #---------------------------------------------- - # Attempt push to test-pypi + # Build the package (before publish) #---------------------------------------------- - - name: Build and publish to pypi - uses: JRubics/poetry-publish@v1.10 - with: - pypi_token: ${{ secrets.PROD_PYPI_TOKEN }} + - name: Build package + run: poetry build + #---------------------------------------------- + # Publish to pypi + #---------------------------------------------- + - name: Publish to pypi + run: poetry publish --username __token__ --password ${{ secrets.PROD_PYPI_TOKEN }} diff --git a/src/databricks/sql/auth/retry.py b/src/databricks/sql/auth/retry.py index 4281883da..b0c2f497d 100755 --- a/src/databricks/sql/auth/retry.py +++ b/src/databricks/sql/auth/retry.py @@ -373,6 +373,13 @@ def should_retry(self, method: str, status_code: int) -> Tuple[bool, str]: if status_code == 403: return False, "403 codes are not retried" + # Request failed with 404. Don't retry for any command type. + if status_code == 404: + return ( + False, + "Received 404 - NOT_FOUND. The requested resource does not exist.", + ) + # Request failed and server said NotImplemented. This isn't recoverable. Don't retry. if status_code == 501: return False, "Received code 501 from server." 
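With the hunk above, 404 joins 403 and 501 as unconditionally non-retryable, and the hunk below removes the per-command 404 special cases (GetOperationStatus, CloseSession, CloseOperation). A sketch of the resulting should_retry outcomes, modelled on the unit test added at the end of this patch; the retry_policy construction is assumed from tests/unit/test_retry.py and is not reproduced here:

    import time

    # Assumed: retry_policy is a configured DatabricksRetryPolicy instance and CommandType
    # is the enum used alongside it in databricks/sql/auth/retry.py.
    retry_policy._retry_start_time = time.time()
    retry_policy.command_type = CommandType.CLOSE_SESSION

    assert retry_policy.should_retry("POST", 403)[0] is False   # "403 codes are not retried"
    assert retry_policy.should_retry("POST", 404)[0] is False   # now non-retryable for every command type
    assert retry_policy.should_retry("POST", 501)[0] is False   # "Received code 501 from server."
    # 429/503 still follow the normal backoff path, which the updated
    # test_retry_max_count_not_exceeded exercises below with status=429.
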
@@ -381,33 +388,6 @@ def should_retry(self, method: str, status_code: int) -> Tuple[bool, str]: if not self._is_method_retryable(method): return False, "Only POST requests are retried" - # Request failed with 404 and was a GetOperationStatus. This is not recoverable. Don't retry. - if status_code == 404 and self.command_type == CommandType.GET_OPERATION_STATUS: - return ( - False, - "GetOperationStatus received 404 code from Databricks. Operation was canceled.", - ) - - # Request failed with 404 because CloseSession returns 404 if you repeat the request. - if ( - status_code == 404 - and self.command_type == CommandType.CLOSE_SESSION - and len(self.history) > 0 - ): - raise SessionAlreadyClosedError( - "CloseSession received 404 code from Databricks. Session is already closed." - ) - - # Request failed with 404 because CloseOperation returns 404 if you repeat the request. - if ( - status_code == 404 - and self.command_type == CommandType.CLOSE_OPERATION - and len(self.history) > 0 - ): - raise CursorAlreadyClosedError( - "CloseOperation received 404 code from Databricks. Cursor is already closed." - ) - # Request failed, was an ExecuteStatement and the command may have reached the server if ( self.command_type == CommandType.EXECUTE_STATEMENT diff --git a/tests/e2e/common/retry_test_mixins.py b/tests/e2e/common/retry_test_mixins.py index b2350bd98..80822ba47 100755 --- a/tests/e2e/common/retry_test_mixins.py +++ b/tests/e2e/common/retry_test_mixins.py @@ -278,7 +278,7 @@ def test_retry_max_count_not_exceeded(self, mock_send_telemetry, extra_params): THEN the connector issues six request (original plus five retries) before raising an exception """ - with mocked_server_response(status=404) as mock_obj: + with mocked_server_response(status=429, headers={"Retry-After": "0"}) as mock_obj: with pytest.raises(MaxRetryError) as cm: extra_params = {**extra_params, **self._retry_policy} with self.connection(extra_params=extra_params) as conn: @@ -467,22 +467,21 @@ def test_retry_safe_execute_statement_retry_condition(self, extra_params): ) def test_retry_abort_close_session_on_404(self, extra_params, caplog): """GIVEN the connector sends a CloseSession command - WHEN server sends a 404 (which is normally retried) - THEN nothing is retried because 404 means the session already closed + WHEN server sends a 404 (which is not retried since commit 41b28159) + THEN nothing is retried because 404 is globally non-retryable """ - # First response is a Bad Gateway -> Result is the command actually goes through - # Second response is a 404 because the session is no longer found + # With the idempotency-based retry refactor, 404 is now globally non-retryable + # regardless of command type. The close() method catches RequestError and proceeds. 
responses = [ - {"status": 502, "headers": {"Retry-After": "1"}, "redirect_location": None}, {"status": 404, "headers": {}, "redirect_location": None}, ] extra_params = {**extra_params, **self._retry_policy} with self.connection(extra_params=extra_params) as conn: with mock_sequential_server_responses(responses): + # Should not raise an exception, the error is caught internally conn.close() - assert "Session was closed by a prior request" in caplog.text @pytest.mark.parametrize( "extra_params", @@ -493,14 +492,13 @@ def test_retry_abort_close_session_on_404(self, extra_params, caplog): ) def test_retry_abort_close_operation_on_404(self, extra_params, caplog): """GIVEN the connector sends a CancelOperation command - WHEN server sends a 404 (which is normally retried) - THEN nothing is retried because 404 means the operation was already canceled + WHEN server sends a 404 (which is not retried since commit 41b28159) + THEN nothing is retried because 404 is globally non-retryable """ - # First response is a Bad Gateway -> Result is the command actually goes through - # Second response is a 404 because the session is no longer found + # With the idempotency-based retry refactor, 404 is now globally non-retryable + # regardless of command type. The close() method catches RequestError and proceeds. responses = [ - {"status": 502, "headers": {"Retry-After": "1"}, "redirect_location": None}, {"status": 404, "headers": {}, "redirect_location": None}, ] @@ -515,10 +513,8 @@ def test_retry_abort_close_operation_on_404(self, extra_params, caplog): # This call guarantees we have an open cursor at the server curs.execute("SELECT 1") with mock_sequential_server_responses(responses): + # Should not raise an exception, the error is caught internally curs.close() - assert ( - "Operation was canceled by a prior request" in caplog.text - ) @pytest.mark.parametrize( "extra_params", diff --git a/tests/e2e/common/staging_ingestion_tests.py b/tests/e2e/common/staging_ingestion_tests.py index 73aa0a113..a88f55238 100644 --- a/tests/e2e/common/staging_ingestion_tests.py +++ b/tests/e2e/common/staging_ingestion_tests.py @@ -81,7 +81,7 @@ def test_staging_ingestion_life_cycle(self, ingestion_user): # GET after REMOVE should fail with pytest.raises( - Error, match="too many 404 error responses" + Error, match="Staging operation over HTTP was unsuccessful: 404" ): cursor = conn.cursor() query = f"GET 'stage://tmp/{ingestion_user}/tmp/11/16/file1.csv' TO '{new_temp_path}'" diff --git a/tests/e2e/common/uc_volume_tests.py b/tests/e2e/common/uc_volume_tests.py index 93e63bd28..5b4086f91 100644 --- a/tests/e2e/common/uc_volume_tests.py +++ b/tests/e2e/common/uc_volume_tests.py @@ -81,7 +81,7 @@ def test_uc_volume_life_cycle(self, catalog, schema): # GET after REMOVE should fail with pytest.raises( - Error, match="too many 404 error responses" + Error, match="Staging operation over HTTP was unsuccessful: 404" ): cursor = conn.cursor() query = f"GET '/Volumes/{catalog}/{schema}/e2etests/file1.csv' TO '{new_temp_path}'" diff --git a/tests/unit/test_retry.py b/tests/unit/test_retry.py index 897a1d111..0d01d8675 100644 --- a/tests/unit/test_retry.py +++ b/tests/unit/test_retry.py @@ -83,3 +83,15 @@ def test_excessive_retry_attempts_error(self, t_mock, retry_policy): retry_policy.sleep(HTTPResponse(status=503)) # Internally urllib3 calls the increment function generating a new instance for every retry retry_policy = retry_policy.increment() + + def test_404_does_not_retry_for_any_command_type(self, retry_policy): + """Test that 
404 never retries for any CommandType""" + retry_policy._retry_start_time = time.time() + + # Test for each CommandType + for command_type in CommandType: + retry_policy.command_type = command_type + should_retry, msg = retry_policy.should_retry("POST", 404) + + assert should_retry is False, f"404 should not retry for {command_type}" + assert "404" in msg or "NOT_FOUND" in msg From 61f80298e301e62c51d87e7b0f2427827c60f540 Mon Sep 17 00:00:00 2001 From: Madhavendra Rathore Date: Fri, 6 Feb 2026 21:30:31 +0530 Subject: [PATCH 06/24] [PECOBLR-1735] Fix #729 and #731: Telemetry lifecycle management (#734) * Fix #729 and #731: Telemetry lifecycle management Signed-off-by: Madhavendra Rathore * Address review comments: revert timeout and telemetry_enabled changes Per reviewer feedback on PR #734: 1. Revert timeout from 30s back to 900s (line 299) - Reviewer noted that with wait=False, timeout is not critical - The async nature and wait=False handle the exit speed 2. Revert telemetry_enabled parameter back to True (line 734) - Reviewer noted this is redundant given the early return - If enable_telemetry=False, we return early (line 729) - Line 734 only executes when enable_telemetry=True - Therefore using the parameter here is unnecessary These changes address the reviewer's valid technical concerns while keeping the core fixes intact: - wait=False for non-blocking shutdown (critical for Issue #729) - Early return when enable_telemetry=False (critical for Issue #729) - All Issue #731 fixes (null-safety, __del__, documentation) Signed-off-by: Madhavendra Rathore * Fix Black formatting violations Apply Black formatting to files modified in previous commits: - src/databricks/sql/common/unified_http_client.py - src/databricks/sql/telemetry/telemetry_client.py Changes are purely cosmetic (quote style consistency). Signed-off-by: Madhavendra Rathore * Fix CI test failure: Prevent parallel execution of telemetry tests Add @pytest.mark.xdist_group to telemetry test classes to ensure they run sequentially on the same worker when using pytest-xdist (-n auto). Root cause: Tests marked @pytest.mark.serial were still being parallelized in CI because pytest-xdist doesn't respect custom markers by default. With host-level telemetry batching (PR #718), tests running in parallel would share the same TelemetryClient and interfere with each other's event counting, causing test_concurrent_queries_sends_telemetry to see 88 events instead of the expected 60. The xdist_group marker ensures all tests in the "serial_telemetry" group run on the same worker sequentially, preventing state interference. Signed-off-by: Claude Sonnet 4.5 * Fix telemetry test fixtures: Clean up state before AND after tests Modified telemetry_setup_teardown fixtures to clean up TelemetryClientFactory state both BEFORE and AFTER each test, not just after. This prevents leftover state from previous tests (pending events, active executors) from interfering with the current test. Root cause: In CI with sequential execution on the same worker, if a previous test left pending telemetry events in the executor, those events could be captured by the next test's mock, causing inflated event counts (88 instead of 60). Now ensures complete isolation between tests by resetting all shared state before each test starts. Signed-off-by: Claude Sonnet 4.5 * Fix CI test failure: Clear _flush_event between tests The _flush_event threading.Event was never cleared after stopping the flush thread, remaining in "set" state. 
This caused timing issues in subsequent tests where the Event was already signaled, triggering unexpected flush behavior and causing extra telemetry events to be captured (88 instead of 60). Now explicitly clear the _flush_event flag in both setup (before test) and teardown (after test) to ensure clean state isolation between tests. This explains why CI consistently got 88 events - the flush_event from previous tests triggered additional flushes during test execution. Signed-off-by: Claude Sonnet 4.5 * Add debug workflow and output to diagnose CI test failure 1. Created new workflow 'test-telemetry-only.yml' that runs only the failing telemetry test with -n auto, mimicking real CI but much faster 2. Added debug output to test showing: - Client-side captured events - Number of futures/batches - Number of server responses - Server-reported successful events This will help identify why CI gets 88 events vs local 60 events. Signed-off-by: Claude Sonnet 4.5 * Fix workflow: Add krb5 system dependency The workflow was failing during poetry install due to missing krb5 system libraries needed for kerberos dependencies. Signed-off-by: Claude Sonnet 4.5 * Fix xdist_group: Add --dist=loadgroup to pytest commands The @pytest.mark.xdist_group markers were being ignored because pytest-xdist uses --dist=load by default, which doesn't respect groups. With --dist=loadgroup, tests in the same xdist_group run sequentially on the same worker, preventing telemetry state interference between tests. This is the ROOT CAUSE of the 88 vs 60 events issue - tests were running in parallel across workers instead of sequentially on one worker as intended. Signed-off-by: Claude Sonnet 4.5 * Add aggressive flush before test to prevent event interference CI shows 72 events instead of 60. Debug output reveals: - Client captured: 60 events (correct) - Server received: 72 events across 2 batches The 12 extra events accumulate in the timing window between fixture cleanup and mock setup. Other tests (like circuit breaker tests not in our xdist_group) may be sending telemetry concurrently. Solution: Add an explicit flush+shutdown RIGHT BEFORE setting up the mock to ensure a completely clean slate with zero buffered events. Signed-off-by: Claude Sonnet 4.5 * Split workflow: Isolate telemetry tests in separate job To prevent interference from other e2e tests, split into two jobs: Job 1 (run-non-telemetry-tests): - Runs all e2e tests EXCEPT telemetry tests - Uses -n auto for parallel execution Job 2 (run-telemetry-tests): - Runs ONLY telemetry tests - Depends on Job 1 completing (needs: run-non-telemetry-tests) - Fresh Python process = complete isolation - No ambient telemetry from other tests This eliminates the 68 vs 60 event discrepancy by ensuring telemetry tests run in a clean environment with zero interference. Signed-off-by: Claude Sonnet 4.5 * Fix workflows: Add krb5 deps and cleanup debug code Changes across multiple workflows: 1. integration.yml: - Add krb5 system dependency to telemetry job - Fixes: krb5-config command not found error during poetry install 2. code-coverage.yml: - Add krb5 system dependency - Split telemetry tests into separate step for isolation - Maintains coverage accumulation with --cov-append 3. publish-test.yml: - Add krb5 system dependency for consistent builds 4. test_concurrent_telemetry.py: - Remove debug print statements 5. 
Delete test-telemetry-only.yml: - Remove temporary debug workflow All workflows now have proper telemetry test isolation and required system dependencies for kerberos packages. Signed-off-by: Claude Sonnet 4.5 * Fix publish-test.yml: Update Python 3.9 -> 3.10 Poetry 2.3.2 installation fails with Python 3.9: Installing Poetry (2.3.2): An error occurred. Other workflows use Python 3.10 and work fine. Updating to match ensures consistency and avoids Poetry installation issues. Signed-off-by: Claude Sonnet 4.5 * Fix integration workflow: Remove --dist=loadgroup from non-telemetry tests - Remove --dist=loadgroup from non-telemetry job (only needed for telemetry) - Remove test_telemetry_e2e.py from telemetry job (was skipped before) - This should fix test_uc_volume_life_cycle failure caused by changed test distribution * Fix code-coverage workflow: Remove test_telemetry_e2e.py from coverage tests - Only run test_concurrent_telemetry.py in isolated telemetry step - test_telemetry_e2e.py was excluded in original workflow, keep it excluded * Fix publish-test workflow: Remove cache conditional - Always run poetry install (not just on cache miss) - Ensures fresh install with system dependencies (krb5) - Matches pattern used in integration.yml * Fix publish-test.yml: Remove duplicate krb5 install, restore cache conditional - Remove duplicate system dependencies step - Restore cache conditional to match main branch - Keep Python 3.10 (our change from 3.9) * Fix code-coverage: Remove serial tests step - All serial tests are telemetry tests (test_concurrent_telemetry.py and test_telemetry_e2e.py) - They're already run in the isolated telemetry step - Running -m serial with --ignore on both files results in 0 tests (exit code 5) --------- Signed-off-by: Madhavendra Rathore Signed-off-by: Claude Sonnet 4.5 --- .github/workflows/code-coverage.yml | 15 ++++-- .github/workflows/integration.yml | 52 +++++++++++++++++-- .github/workflows/publish-test.yml | 2 +- src/databricks/sql/client.py | 3 ++ .../sql/common/unified_http_client.py | 12 ++++- .../sql/telemetry/telemetry_client.py | 40 +++++++++++++- tests/e2e/test_concurrent_telemetry.py | 26 +++++++++- tests/e2e/test_telemetry_e2e.py | 28 ++++++++-- 8 files changed, 161 insertions(+), 17 deletions(-) diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index 9cb68dbc9..5c961757e 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -31,6 +31,13 @@ jobs: with: python-version: "3.10" #---------------------------------------------- + # ----- install system dependencies ----- + #---------------------------------------------- + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libkrb5-dev + #---------------------------------------------- # ----- install & configure poetry ----- #---------------------------------------------- - name: Install Poetry @@ -80,13 +87,13 @@ jobs: -v #---------------------------------------------- - # run serial tests with coverage + # run telemetry tests with coverage (isolated) #---------------------------------------------- - - name: Run serial tests with coverage + - name: Run telemetry tests with coverage (isolated) continue-on-error: false run: | - poetry run pytest tests/e2e \ - -m "serial" \ + # Run test_concurrent_telemetry.py separately for isolation + poetry run pytest tests/e2e/test_concurrent_telemetry.py \ --cov=src \ --cov-append \ --cov-report=xml \ diff --git a/.github/workflows/integration.yml 
b/.github/workflows/integration.yml index 7fd6d98f1..c915ee6c1 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -7,7 +7,7 @@ on: pull_request: jobs: - run-e2e-tests: + run-non-telemetry-tests: runs-on: ubuntu-latest environment: azure-prod env: @@ -59,9 +59,53 @@ jobs: #---------------------------------------------- # run test suite #---------------------------------------------- - - name: Run e2e tests (excluding daily-only tests) + - name: Run non-telemetry e2e tests run: | - # Exclude telemetry E2E tests from PR runs (run daily instead) + # Exclude all telemetry tests - they run in separate job for isolation poetry run python -m pytest tests/e2e \ --ignore=tests/e2e/test_telemetry_e2e.py \ - -n auto \ No newline at end of file + --ignore=tests/e2e/test_concurrent_telemetry.py \ + -n auto + + run-telemetry-tests: + runs-on: ubuntu-latest + needs: run-non-telemetry-tests # Run after non-telemetry tests complete + environment: azure-prod + env: + DATABRICKS_SERVER_HOSTNAME: ${{ secrets.DATABRICKS_HOST }} + DATABRICKS_HTTP_PATH: ${{ secrets.TEST_PECO_WAREHOUSE_HTTP_PATH }} + DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }} + DATABRICKS_CATALOG: peco + DATABRICKS_USER: ${{ secrets.TEST_PECO_SP_ID }} + steps: + - name: Check out repository + uses: actions/checkout@v4 + - name: Set up python + id: setup-python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libkrb5-dev + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v4 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} + - name: Install dependencies + run: poetry install --no-interaction --all-extras + - name: Run telemetry tests in isolation + run: | + # Run test_concurrent_telemetry.py in isolation with complete process separation + # Use --dist=loadgroup to respect @pytest.mark.xdist_group markers + poetry run python -m pytest tests/e2e/test_concurrent_telemetry.py \ + -n auto --dist=loadgroup -v \ No newline at end of file diff --git a/.github/workflows/publish-test.yml b/.github/workflows/publish-test.yml index ea4158934..97a444e68 100644 --- a/.github/workflows/publish-test.yml +++ b/.github/workflows/publish-test.yml @@ -14,7 +14,7 @@ jobs: id: setup-python uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: "3.10" #---------------------------------------------- # ----- install & configure poetry ----- #---------------------------------------------- diff --git a/src/databricks/sql/client.py b/src/databricks/sql/client.py index a0215aae5..1a246b7c1 100755 --- a/src/databricks/sql/client.py +++ b/src/databricks/sql/client.py @@ -306,6 +306,8 @@ def read(self) -> Optional[OAuthToken]: ) self.session.open() except Exception as e: + # Respect user's telemetry preference even during connection failure + enable_telemetry = kwargs.get("enable_telemetry", True) TelemetryClientFactory.connection_failure_log( error_name="Exception", error_message=str(e), @@ -316,6 +318,7 @@ def read(self) -> Optional[OAuthToken]: user_agent=self.session.useragent_header if hasattr(self, "session") else None, + enable_telemetry=enable_telemetry, ) raise e diff --git 
a/src/databricks/sql/common/unified_http_client.py b/src/databricks/sql/common/unified_http_client.py index d5f7d3c8d..ef55564c8 100644 --- a/src/databricks/sql/common/unified_http_client.py +++ b/src/databricks/sql/common/unified_http_client.py @@ -217,7 +217,7 @@ def _should_use_proxy(self, target_host: str) -> bool: logger.debug("Error checking proxy bypass for host %s: %s", target_host, e) return True - def _get_pool_manager_for_url(self, url: str) -> urllib3.PoolManager: + def _get_pool_manager_for_url(self, url: str) -> Optional[urllib3.PoolManager]: """ Get the appropriate pool manager for the given URL. @@ -225,7 +225,7 @@ def _get_pool_manager_for_url(self, url: str) -> urllib3.PoolManager: url: The target URL Returns: - PoolManager instance (either direct or proxy) + PoolManager instance (either direct or proxy), or None if client is closed """ parsed_url = urllib.parse.urlparse(url) target_host = parsed_url.hostname @@ -291,6 +291,14 @@ def request_context( # Select appropriate pool manager based on target URL pool_manager = self._get_pool_manager_for_url(url) + # DEFENSIVE: Check if pool_manager is None (client closing/closed) + # This prevents AttributeError race condition when telemetry cleanup happens + if pool_manager is None: + logger.debug( + "HTTP client closing or closed, cannot make request to %s", url + ) + raise RequestError("HTTP client is closing or has been closed") + response = None try: diff --git a/src/databricks/sql/telemetry/telemetry_client.py b/src/databricks/sql/telemetry/telemetry_client.py index 523fcc1dc..408162400 100644 --- a/src/databricks/sql/telemetry/telemetry_client.py +++ b/src/databricks/sql/telemetry/telemetry_client.py @@ -42,6 +42,7 @@ from databricks.sql.common.feature_flag import FeatureFlagsContextFactory from databricks.sql.common.unified_http_client import UnifiedHttpClient from databricks.sql.common.http import HttpMethod +from databricks.sql.exc import RequestError from databricks.sql.telemetry.telemetry_push_client import ( ITelemetryPushClient, TelemetryPushClient, @@ -417,10 +418,38 @@ def export_latency_log( ) def close(self): - """Flush remaining events before closing""" + """Flush remaining events before closing + + IMPORTANT: This method does NOT close self._http_client. + + Rationale: + - _flush() submits async work to the executor that uses _http_client + - If we closed _http_client here, async callbacks would fail with AttributeError + - Instead, we let _http_client live as long as needed: + * Pending futures hold references to self (via bound methods) + * This keeps self alive, which keeps self._http_client alive + * When all futures complete, Python GC will clean up naturally + - The __del__ method ensures eventual cleanup during garbage collection + + This design prevents race conditions while keeping telemetry truly async. + """ logger.debug("Closing TelemetryClient for connection %s", self._session_id_hex) self._flush() + def __del__(self): + """Cleanup when TelemetryClient is garbage collected + + This ensures _http_client is eventually closed when the TelemetryClient + object is destroyed. By this point, all async work should be complete + (since the futures held references keeping us alive), so it's safe to + close the http client. 
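# --- Editor's illustrative sketch (not part of this patch) ---------------------
# The close()/__del__ rationale above relies on a general CPython property:
# submitting a bound method to an executor keeps the owning object (and hence
# its HTTP client) referenced until the pending work has run, after which
# normal garbage collection can trigger __del__. A minimal standalone sketch of
# that pattern, using hypothetical names (Worker, _resource) rather than the
# real TelemetryClient, and only as an assumption-laden illustration:

import gc
import time
from concurrent.futures import ThreadPoolExecutor


class Worker:
    def __init__(self):
        self._resource = {"open": True}  # stands in for _http_client

    def _send(self):
        time.sleep(0.1)  # async work that still needs the resource
        return self._resource["open"]

    def close(self):
        # Flush-only close: the pending future below still references self
        # through the bound method, so the resource must stay open here.
        pass

    def __del__(self):
        # Runs only once nothing references self any more; safe to release.
        self._resource["open"] = False


executor = ThreadPoolExecutor(max_workers=1)
worker = Worker()
future = executor.submit(worker._send)  # bound method keeps `worker` alive
worker.close()
del worker                              # not collected yet: the pending work holds a reference
assert future.result() is True          # work completed against a live resource
executor.shutdown(wait=True)
gc.collect()                            # now __del__ may run and release the resource
# --------------------------------------------------------------------------------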
+ """ + try: + if hasattr(self, "_http_client") and self._http_client: + self._http_client.close() + except Exception: + pass + class _TelemetryClientHolder: """ @@ -674,7 +703,8 @@ def close(host_url): ) try: TelemetryClientFactory._stop_flush_thread() - TelemetryClientFactory._executor.shutdown(wait=True) + # Use wait=False to allow process to exit immediately + TelemetryClientFactory._executor.shutdown(wait=False) except Exception as e: logger.debug("Failed to shutdown thread pool executor: %s", e) TelemetryClientFactory._executor = None @@ -689,9 +719,15 @@ def connection_failure_log( port: int, client_context, user_agent: Optional[str] = None, + enable_telemetry: bool = True, ): """Send error telemetry when connection creation fails, using provided client context""" + # Respect user's telemetry preference - don't force-enable + if not enable_telemetry: + logger.debug("Telemetry disabled, skipping connection failure log") + return + UNAUTH_DUMMY_SESSION_ID = "unauth_session_id" TelemetryClientFactory.initialize_telemetry_client( diff --git a/tests/e2e/test_concurrent_telemetry.py b/tests/e2e/test_concurrent_telemetry.py index bed348c2c..6a317cbfa 100644 --- a/tests/e2e/test_concurrent_telemetry.py +++ b/tests/e2e/test_concurrent_telemetry.py @@ -27,6 +27,7 @@ def run_in_threads(target, num_threads, pass_index=False): @pytest.mark.serial +@pytest.mark.xdist_group(name="serial_telemetry") class TestE2ETelemetry(PySQLPytestTestCase): @pytest.fixture(autouse=True) def telemetry_setup_teardown(self): @@ -35,13 +36,27 @@ def telemetry_setup_teardown(self): before each test and shuts it down afterward. Using a fixture makes this robust and automatic. """ + # Clean up BEFORE test starts to ensure no leftover state from previous tests + # Use wait=True to ensure all pending telemetry from previous tests completes + # This prevents those events from being captured by this test's mock + if TelemetryClientFactory._executor: + TelemetryClientFactory._executor.shutdown(wait=True) # WAIT for pending telemetry + TelemetryClientFactory._executor = None + TelemetryClientFactory._stop_flush_thread() + TelemetryClientFactory._flush_event.clear() # Clear the event flag + TelemetryClientFactory._clients.clear() + TelemetryClientFactory._initialized = False + try: yield finally: + # Clean up AFTER test ends + # Use wait=True to ensure this test's telemetry completes before next test starts if TelemetryClientFactory._executor: - TelemetryClientFactory._executor.shutdown(wait=True) + TelemetryClientFactory._executor.shutdown(wait=True) # WAIT for this test's telemetry TelemetryClientFactory._executor = None TelemetryClientFactory._stop_flush_thread() + TelemetryClientFactory._flush_event.clear() # Clear the event flag TelemetryClientFactory._clients.clear() TelemetryClientFactory._initialized = False @@ -50,6 +65,14 @@ def test_concurrent_queries_sends_telemetry(self): An E2E test where concurrent threads execute real queries against the staging endpoint, while we capture and verify the generated telemetry. 
""" + # Extra flush right before test starts to clear any events that accumulated + # between fixture cleanup and now (e.g., from other tests on same worker) + if TelemetryClientFactory._executor: + TelemetryClientFactory._executor.shutdown(wait=True) + TelemetryClientFactory._executor = None + TelemetryClientFactory._clients.clear() + TelemetryClientFactory._initialized = False + num_threads = 30 capture_lock = threading.Lock() captured_telemetry = [] @@ -139,6 +162,7 @@ def execute_query_worker(thread_id): assert "errors" not in response or not response["errors"] if "numProtoSuccess" in response: total_successful_events += response["numProtoSuccess"] + assert total_successful_events == num_threads * 2 assert ( diff --git a/tests/e2e/test_telemetry_e2e.py b/tests/e2e/test_telemetry_e2e.py index 0a57edd3c..83c2dbf81 100644 --- a/tests/e2e/test_telemetry_e2e.py +++ b/tests/e2e/test_telemetry_e2e.py @@ -44,23 +44,45 @@ def connection(self, extra_params=()): @pytest.mark.serial +@pytest.mark.xdist_group(name="serial_telemetry") class TestTelemetryE2E(TelemetryTestBase): """E2E tests for telemetry scenarios - must run serially due to shared host-level telemetry client""" @pytest.fixture(autouse=True) def telemetry_setup_teardown(self): """Clean up telemetry client state before and after each test""" + # Clean up BEFORE test starts + # Use wait=True to ensure all pending telemetry from previous tests completes + if TelemetryClientFactory._executor: + TelemetryClientFactory._executor.shutdown(wait=True) # WAIT for pending telemetry + TelemetryClientFactory._executor = None + TelemetryClientFactory._stop_flush_thread() + TelemetryClientFactory._flush_event.clear() # Clear the event flag + TelemetryClientFactory._clients.clear() + TelemetryClientFactory._initialized = False + + # Clear feature flags cache before test starts + from databricks.sql.common.feature_flag import FeatureFlagsContextFactory + with FeatureFlagsContextFactory._lock: + FeatureFlagsContextFactory._context_map.clear() + if FeatureFlagsContextFactory._executor: + FeatureFlagsContextFactory._executor.shutdown(wait=False) + FeatureFlagsContextFactory._executor = None + try: yield finally: + # Clean up AFTER test ends + # Use wait=True to ensure this test's telemetry completes if TelemetryClientFactory._executor: - TelemetryClientFactory._executor.shutdown(wait=True) + TelemetryClientFactory._executor.shutdown(wait=True) # WAIT for this test's telemetry TelemetryClientFactory._executor = None TelemetryClientFactory._stop_flush_thread() + TelemetryClientFactory._flush_event.clear() # Clear the event flag + TelemetryClientFactory._clients.clear() TelemetryClientFactory._initialized = False - # Clear feature flags cache to prevent state leakage between tests - from databricks.sql.common.feature_flag import FeatureFlagsContextFactory + # Clear feature flags cache after test ends with FeatureFlagsContextFactory._lock: FeatureFlagsContextFactory._context_map.clear() if FeatureFlagsContextFactory._executor: From 9fe7356a18d611ae18943c2f972160657b08eea2 Mon Sep 17 00:00:00 2001 From: jayant <167047871+jayantsing-db@users.noreply.github.com> Date: Mon, 9 Feb 2026 16:52:45 +0530 Subject: [PATCH 07/24] Bump to version 4.2.5 (#737) Signed-off-by: Jayant Singh --- CHANGELOG.md | 4 ++++ pyproject.toml | 4 ++-- src/databricks/sql/__init__.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 41105b072..0ba3bb1a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Release History +# 
4.2.5 (2026-02-09) +- Fix feature-flag endpoint retries in gov region (databricks/databricks-sql-python#735 by @samikshya-db) +- Improve telemetry lifecycle management (databricks/databricks-sql-python#734 by @msrathore-db) + # 4.2.4 (2026-01-07) - Fixed the exception handler close() on _TelemetryClientHolder (databricks/databricks-sql-python#723 by @msrathore-db) - Created util method to normalise http protocol in http path (databricks/databricks-sql-python#724 by @nikhilsuri-db) diff --git a/pyproject.toml b/pyproject.toml index 8a635588c..911f1b79c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "databricks-sql-connector" -version = "4.2.4" +version = "4.2.5" description = "Databricks SQL Connector for Python" authors = ["Databricks "] license = "Apache-2.0" @@ -92,4 +92,4 @@ show_missing = true skip_covered = false [tool.coverage.xml] -output = "coverage.xml" \ No newline at end of file +output = "coverage.xml" diff --git a/src/databricks/sql/__init__.py b/src/databricks/sql/__init__.py index 41784bc45..c9195b89f 100644 --- a/src/databricks/sql/__init__.py +++ b/src/databricks/sql/__init__.py @@ -71,7 +71,7 @@ def __repr__(self): DATE = DBAPITypeObject("date") ROWID = DBAPITypeObject() -__version__ = "4.2.4" +__version__ = "4.2.5" USER_AGENT_NAME = "PyDatabricksSqlConnector" # These two functions are pyhive legacy From 38097f26ea7c21c6a7e9672397ee0deb3843f0e3 Mon Sep 17 00:00:00 2001 From: Jiabin Hu Date: Mon, 2 Mar 2026 02:18:54 -0800 Subject: [PATCH 08/24] Add query_tags parameter support for execute methods (#736) * Add statement level query tag support by introducing it as a parameter on execute* methods Signed-off-by: Jiabin Hu * Add query_tags support to executemany method - Added query_tags parameter to executemany() method - Query tags are applied to all queries in the batch - Updated example to demonstrate executemany usage with query_tags - All tests pass (122/122 client tests) Signed-off-by: Jiabin Hu * add example that doesn't have tag Signed-off-by: Jiabin Hu * fix presubmit errors Signed-off-by: Jiabin Hu * another lint Signed-off-by: Jiabin Hu * address review comments Signed-off-by: Jiabin Hu --------- Signed-off-by: Jiabin Hu --- examples/query_tags_example.py | 97 ++++++++++++++++++- .../sql/backend/databricks_client.py | 2 + src/databricks/sql/backend/sea/backend.py | 3 + src/databricks/sql/backend/thrift_backend.py | 22 ++++- src/databricks/sql/client.py | 24 ++++- src/databricks/sql/utils.py | 43 ++++++++ tests/unit/test_util.py | 63 ++++++++++++ 7 files changed, 244 insertions(+), 10 deletions(-) diff --git a/examples/query_tags_example.py b/examples/query_tags_example.py index f615d082c..687ce4140 100644 --- a/examples/query_tags_example.py +++ b/examples/query_tags_example.py @@ -7,11 +7,23 @@ Query Tags are key-value pairs that can be attached to SQL executions and will appear in the system.query.history table for analytical purposes. -Format: "key1:value1,key2:value2,key3:value3" +There are two ways to set query tags: +1. Session-level: Set in session_configuration (applies to all queries in the session) +2. 
Per-query level: Pass query_tags parameter to execute() or execute_async() (applies to specific query) + +Format: Dictionary with string keys and optional string values +Example: {"team": "engineering", "application": "etl", "priority": "high"} + +Special cases: +- If a value is None, only the key is included (no colon or value) +- Special characters (comma, colon and backslash) in values are automatically escaped +- Keys are not escaped (should be controlled identifiers) """ print("=== Query Tags Example ===\n") +# Example 1: Session-level query tags (old approach) +print("Example 1: Session-level query tags") with sql.connect( server_hostname=os.getenv("DATABRICKS_SERVER_HOSTNAME"), http_path=os.getenv("DATABRICKS_HTTP_PATH"), @@ -21,10 +33,89 @@ 'ansi_mode': False } ) as connection: - + with connection.cursor() as cursor: cursor.execute("SELECT 1") result = cursor.fetchone() print(f" Result: {result[0]}") -print("\n=== Query Tags Example Complete ===") \ No newline at end of file +print() + +# Example 2: Per-query query tags (new approach) +print("Example 2: Per-query query tags") +with sql.connect( + server_hostname=os.getenv("DATABRICKS_SERVER_HOSTNAME"), + http_path=os.getenv("DATABRICKS_HTTP_PATH"), + access_token=os.getenv("DATABRICKS_TOKEN"), +) as connection: + + with connection.cursor() as cursor: + # Query 1: Tags for a critical ETL job + cursor.execute( + "SELECT 1", + query_tags={"team": "data-eng", "application": "etl", "priority": "high"} + ) + result = cursor.fetchone() + print(f" ETL Query Result: {result[0]}") + + # Query 2: Tags with None value (key-only tag) + cursor.execute( + "SELECT 2", + query_tags={"team": "analytics", "experimental": None} + ) + result = cursor.fetchone() + print(f" Experimental Query Result: {result[0]}") + + # Query 3: Tags with special characters (automatically escaped) + cursor.execute( + "SELECT 3", + query_tags={"description": "test:with:colons,and,commas"} + ) + result = cursor.fetchone() + print(f" Special Chars Query Result: {result[0]}") + + # Query 4: No tags (demonstrates tags don't persist from previous queries) + cursor.execute("SELECT 4") + result = cursor.fetchone() + print(f" No Tags Query Result: {result[0]}") + +print() + +# Example 3: Async execution with query tags +print("Example 3: Async execution with query tags") +with sql.connect( + server_hostname=os.getenv("DATABRICKS_SERVER_HOSTNAME"), + http_path=os.getenv("DATABRICKS_HTTP_PATH"), + access_token=os.getenv("DATABRICKS_TOKEN"), +) as connection: + + with connection.cursor() as cursor: + cursor.execute_async( + "SELECT 5", + query_tags={"team": "data-eng", "mode": "async"} + ) + cursor.get_async_execution_result() + result = cursor.fetchone() + print(f" Async Query Result: {result[0]}") + +print() + +# Example 4: executemany with query tags +print("Example 4: executemany with query tags") +with sql.connect( + server_hostname=os.getenv("DATABRICKS_SERVER_HOSTNAME"), + http_path=os.getenv("DATABRICKS_HTTP_PATH"), + access_token=os.getenv("DATABRICKS_TOKEN"), +) as connection: + + with connection.cursor() as cursor: + # Execute multiple queries with the same tags + cursor.executemany( + "SELECT ?", + [[6], [7], [8]], + query_tags={"team": "data-eng", "batch": "executemany"} + ) + result = cursor.fetchone() + print(f" Executemany Query Result (last): {result[0]}") + +print("\n=== Query Tags Example Complete ===") diff --git a/src/databricks/sql/backend/databricks_client.py b/src/databricks/sql/backend/databricks_client.py index 2213635fe..b772e7ddd 100644 --- 
a/src/databricks/sql/backend/databricks_client.py +++ b/src/databricks/sql/backend/databricks_client.py @@ -83,6 +83,7 @@ def execute_command( async_op: bool, enforce_embedded_schema_correctness: bool, row_limit: Optional[int] = None, + query_tags: Optional[Dict[str, Optional[str]]] = None, ) -> Union[ResultSet, None]: """ Executes a SQL command or query within the specified session. @@ -102,6 +103,7 @@ def execute_command( async_op: Whether to execute the command asynchronously enforce_embedded_schema_correctness: Whether to enforce schema correctness row_limit: Maximum number of rows in the response. + query_tags: Optional dictionary of query tags to apply for this query only. Returns: If async_op is False, returns a ResultSet object containing the diff --git a/src/databricks/sql/backend/sea/backend.py b/src/databricks/sql/backend/sea/backend.py index 1427226d2..a6cff4913 100644 --- a/src/databricks/sql/backend/sea/backend.py +++ b/src/databricks/sql/backend/sea/backend.py @@ -463,6 +463,9 @@ def execute_command( async_op: bool, enforce_embedded_schema_correctness: bool, row_limit: Optional[int] = None, + query_tags: Optional[ + Dict[str, Optional[str]] + ] = None, # TODO: implement query_tags for SEA backend ) -> Union[SeaResultSet, None]: """ Execute a SQL command using the SEA backend. diff --git a/src/databricks/sql/backend/thrift_backend.py b/src/databricks/sql/backend/thrift_backend.py index edee02bfa..e23f3389b 100644 --- a/src/databricks/sql/backend/thrift_backend.py +++ b/src/databricks/sql/backend/thrift_backend.py @@ -5,7 +5,7 @@ import math import time import threading -from typing import List, Optional, Union, Any, TYPE_CHECKING +from typing import Dict, List, Optional, Union, Any, TYPE_CHECKING from uuid import UUID from databricks.sql.common.unified_http_client import UnifiedHttpClient @@ -53,6 +53,7 @@ convert_arrow_based_set_to_arrow_table, convert_decimals_in_arrow_table, convert_column_based_set_to_arrow_table, + serialize_query_tags, ) from databricks.sql.types import SSLOptions from databricks.sql.backend.databricks_client import DatabricksClient @@ -1003,6 +1004,7 @@ def execute_command( async_op=False, enforce_embedded_schema_correctness=False, row_limit: Optional[int] = None, + query_tags: Optional[Dict[str, Optional[str]]] = None, ) -> Union["ResultSet", None]: thrift_handle = session_id.to_thrift_handle() if not thrift_handle: @@ -1022,6 +1024,19 @@ def execute_command( # DBR should be changed to use month_day_nano_interval intervalTypesAsArrow=False, ) + + # Build confOverlay with default configs and query_tags + merged_conf_overlay = { + # We want to receive proper Timestamp arrow types. + "spark.thriftserver.arrowBasedRowSet.timestampAsString": "false" + } + + # Serialize and add query_tags to confOverlay if provided + if query_tags: + serialized_tags = serialize_query_tags(query_tags) + if serialized_tags: + merged_conf_overlay["query_tags"] = serialized_tags + req = ttypes.TExecuteStatementReq( sessionHandle=thrift_handle, statement=operation, @@ -1036,10 +1051,7 @@ def execute_command( canReadArrowResult=True if pyarrow else False, canDecompressLZ4Result=lz4_compression, canDownloadResult=use_cloud_fetch, - confOverlay={ - # We want to receive proper Timestamp arrow types. 
- "spark.thriftserver.arrowBasedRowSet.timestampAsString": "false" - }, + confOverlay=merged_conf_overlay, useArrowNativeTypes=spark_arrow_types, parameters=parameters, enforceEmbeddedSchemaCorrectness=enforce_embedded_schema_correctness, diff --git a/src/databricks/sql/client.py b/src/databricks/sql/client.py index 1a246b7c1..efaf6ae4d 100755 --- a/src/databricks/sql/client.py +++ b/src/databricks/sql/client.py @@ -1263,6 +1263,7 @@ def execute( parameters: Optional[TParameterCollection] = None, enforce_embedded_schema_correctness=False, input_stream: Optional[BinaryIO] = None, + query_tags: Optional[Dict[str, Optional[str]]] = None, ) -> "Cursor": """ Execute a query and wait for execution to complete. @@ -1293,6 +1294,10 @@ def execute( Both will result in the query equivalent to "SELECT * FROM table WHERE field = 'foo' being sent to the server + :param query_tags: Optional dictionary of query tags to apply for this query only. + Tags are key-value pairs that can be used to identify and categorize queries. + Example: {"team": "data-eng", "application": "etl"} + :returns self """ @@ -1333,6 +1338,7 @@ def execute( async_op=False, enforce_embedded_schema_correctness=enforce_embedded_schema_correctness, row_limit=self.row_limit, + query_tags=query_tags, ) if self.active_result_set and self.active_result_set.is_staging_operation: @@ -1349,6 +1355,7 @@ def execute_async( operation: str, parameters: Optional[TParameterCollection] = None, enforce_embedded_schema_correctness=False, + query_tags: Optional[Dict[str, Optional[str]]] = None, ) -> "Cursor": """ @@ -1356,6 +1363,9 @@ def execute_async( :param operation: :param parameters: + :param query_tags: Optional dictionary of query tags to apply for this query only. + Tags are key-value pairs that can be used to identify and categorize queries. + Example: {"team": "data-eng", "application": "etl"} :return: """ @@ -1392,6 +1402,7 @@ def execute_async( async_op=True, enforce_embedded_schema_correctness=enforce_embedded_schema_correctness, row_limit=self.row_limit, + query_tags=query_tags, ) return self @@ -1448,7 +1459,12 @@ def get_async_execution_result(self): session_id_hex=self.connection.get_session_id_hex(), ) - def executemany(self, operation, seq_of_parameters): + def executemany( + self, + operation, + seq_of_parameters, + query_tags: Optional[Dict[str, Optional[str]]] = None, + ): """ Execute the operation once for every set of passed in parameters. @@ -1457,10 +1473,14 @@ def executemany(self, operation, seq_of_parameters): Only the final result set is retained. + :param query_tags: Optional dictionary of query tags to apply for all queries in this batch. + Tags are key-value pairs that can be used to identify and categorize queries. + Example: {"team": "data-eng", "application": "etl"} + :returns self """ for parameters in seq_of_parameters: - self.execute(operation, parameters) + self.execute(operation, parameters, query_tags=query_tags) return self @log_latency(StatementType.METADATA) diff --git a/src/databricks/sql/utils.py b/src/databricks/sql/utils.py index 043183ac2..125edbbaa 100644 --- a/src/databricks/sql/utils.py +++ b/src/databricks/sql/utils.py @@ -898,6 +898,49 @@ def concat_table_chunks( return pyarrow.concat_tables(table_chunks) +def serialize_query_tags( + query_tags: Optional[Dict[str, Optional[str]]] +) -> Optional[str]: + """ + Serialize query_tags dictionary to a string format. 
+ + Format: "key1:value1,key2:value2" + Special cases: + - If value is None, omit the colon and value (e.g., "key1:value1,key2,key3:value3") + - Escape special characters (:, ,, \\) in values with a leading backslash + - Backslashes in keys are escaped; other special characters in keys are not escaped + + Args: + query_tags: Dictionary of query tags where keys are strings and values are optional strings + + Returns: + Serialized string or None if query_tags is None or empty + """ + if not query_tags: + return None + + def escape_value(value: str) -> str: + """Escape special characters in tag values.""" + # Escape backslash first to avoid double-escaping + value = value.replace("\\", r"\\") + # Escape colon and comma + value = value.replace(":", r"\:") + value = value.replace(",", r"\,") + return value + + serialized_parts = [] + for key, value in query_tags.items(): + escaped_key = key.replace("\\", r"\\") + if value is None: + # No colon or value when value is None + serialized_parts.append(escaped_key) + else: + escaped_value = escape_value(value) + serialized_parts.append(f"{escaped_key}:{escaped_value}") + + return ",".join(serialized_parts) + + def build_client_context(server_hostname: str, version: str, **kwargs): """Build ClientContext for HTTP client configuration.""" from databricks.sql.auth.common import ClientContext diff --git a/tests/unit/test_util.py b/tests/unit/test_util.py index 713342b2e..687bdd391 100644 --- a/tests/unit/test_util.py +++ b/tests/unit/test_util.py @@ -6,6 +6,7 @@ convert_to_assigned_datatypes_in_column_table, ColumnTable, concat_table_chunks, + serialize_query_tags, ) try: @@ -161,3 +162,65 @@ def test_concat_table_chunks__incorrect_column_names_error(self): with pytest.raises(ValueError): concat_table_chunks([column_table1, column_table2]) + + def test_serialize_query_tags_basic(self): + """Test basic query tags serialization""" + query_tags = {"team": "data-eng", "application": "etl"} + result = serialize_query_tags(query_tags) + assert result == "team:data-eng,application:etl" + + def test_serialize_query_tags_with_none_value(self): + """Test query tags with None value (should omit colon and value)""" + query_tags = {"key1": "value1", "key2": None, "key3": "value3"} + result = serialize_query_tags(query_tags) + assert result == "key1:value1,key2,key3:value3" + + def test_serialize_query_tags_with_special_chars(self): + """Test query tags with special characters (colon, comma, backslash)""" + query_tags = { + "key1": "value:with:colons", + "key2": "value,with,commas", + "key3": r"value\with\backslashes", + } + result = serialize_query_tags(query_tags) + assert ( + result + == r"key1:value\:with\:colons,key2:value\,with\,commas,key3:value\\with\\backslashes" + ) + + def test_serialize_query_tags_with_mixed_special_chars(self): + """Test query tags with mixed special characters""" + query_tags = {"key1": r"a:b,c\d"} + result = serialize_query_tags(query_tags) + assert result == r"key1:a\:b\,c\\d" + + def test_serialize_query_tags_empty_dict(self): + """Test serialization with empty dictionary""" + query_tags = {} + result = serialize_query_tags(query_tags) + assert result is None + + def test_serialize_query_tags_none(self): + """Test serialization with None input""" + result = serialize_query_tags(None) + assert result is None + + def test_serialize_query_tags_with_special_chars_in_key(self): + """Test query tags with special characters in keys (only backslashes are escaped in keys)""" + query_tags = { + "key:with:colons": "value1", + "key,with,commas": 
"value2", + r"key\with\backslashes": "value3", + } + result = serialize_query_tags(query_tags) + # Only backslashes are escaped in keys; colons and commas in keys are not escaped + assert ( + result + == r"key:with:colons:value1,key,with,commas:value2,key\\with\\backslashes:value3" + ) + + def test_serialize_query_tags_all_none_values(self): + """Test query tags where all values are None""" + query_tags = {"key1": None, "key2": None, "key3": None} + result = serialize_query_tags(query_tags) + assert result == "key1,key2,key3" From e916f716521ea16d4087d2d909b3e4f1a896f05f Mon Sep 17 00:00:00 2001 From: Jiabin Hu Date: Sun, 8 Mar 2026 23:20:30 -0700 Subject: [PATCH 09/24] [QI-3367] Allow specifiying query tags as a dict upon connection creation (#749) * Allow specifiying query tags as a dict upon connection creation Signed-off-by: Jiabin Hu * fix comment Signed-off-by: Jiabin Hu --------- Signed-off-by: Jiabin Hu --- examples/query_tags_example.py | 15 ++++++--------- src/databricks/sql/client.py | 11 +++++++++++ tests/unit/test_session.py | 21 +++++++++++++++++++++ 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/examples/query_tags_example.py b/examples/query_tags_example.py index 687ce4140..977dc6ad5 100644 --- a/examples/query_tags_example.py +++ b/examples/query_tags_example.py @@ -8,7 +8,7 @@ in the system.query.history table for analytical purposes. There are two ways to set query tags: -1. Session-level: Set in session_configuration (applies to all queries in the session) +1. Connection-level: Pass query_tags parameter to sql.connect() (applies to all queries in the session) 2. Per-query level: Pass query_tags parameter to execute() or execute_async() (applies to specific query) Format: Dictionary with string keys and optional string values @@ -17,21 +17,18 @@ Special cases: - If a value is None, only the key is included (no colon or value) - Special characters (comma, colon and backslash) in values are automatically escaped -- Keys are not escaped (should be controlled identifiers) +- Backslashes in keys are automatically escaped; other special characters in keys are not allowed """ print("=== Query Tags Example ===\n") -# Example 1: Session-level query tags (old approach) -print("Example 1: Session-level query tags") +# Example 1: Connection-level query tags +print("Example 1: Connection-level query tags") with sql.connect( server_hostname=os.getenv("DATABRICKS_SERVER_HOSTNAME"), http_path=os.getenv("DATABRICKS_HTTP_PATH"), access_token=os.getenv("DATABRICKS_TOKEN"), - session_configuration={ - 'QUERY_TAGS': 'team:engineering,test:query-tags', - 'ansi_mode': False - } + query_tags={"team": "engineering", "application": "etl"}, ) as connection: with connection.cursor() as cursor: @@ -41,7 +38,7 @@ print() -# Example 2: Per-query query tags (new approach) +# Example 2: Per-query query tags print("Example 2: Per-query query tags") with sql.connect( server_hostname=os.getenv("DATABRICKS_SERVER_HOSTNAME"), diff --git a/src/databricks/sql/client.py b/src/databricks/sql/client.py index efaf6ae4d..2aeea175e 100755 --- a/src/databricks/sql/client.py +++ b/src/databricks/sql/client.py @@ -36,6 +36,7 @@ ColumnQueue, build_client_context, get_session_config_value, + serialize_query_tags, ) from databricks.sql.parameters.native import ( DbsqlParameterBase, @@ -106,6 +107,7 @@ def __init__( schema: Optional[str] = None, _use_arrow_native_complex_types: Optional[bool] = True, ignore_transactions: bool = True, + query_tags: Optional[Dict[str, Optional[str]]] = None, **kwargs, ) -> 
None: """ @@ -281,6 +283,15 @@ def read(self) -> Optional[OAuthToken]: "spark.sql.thriftserver.metadata.metricview.enabled" ] = "true" + if query_tags is not None: + if session_configuration is None: + session_configuration = {} + serialized = serialize_query_tags(query_tags) + if serialized: + session_configuration["QUERY_TAGS"] = serialized + else: + session_configuration.pop("QUERY_TAGS", None) + self.disable_pandas = kwargs.get("_disable_pandas", False) self.lz4_compression = kwargs.get("enable_query_result_lz4_compression", True) self.use_cloud_fetch = kwargs.get("use_cloud_fetch", True) diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index 1d70ec4c4..3a43c1a75 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -202,3 +202,24 @@ def test_finalizer_closes_abandoned_connection(self, mock_client_class): close_session_call_args = instance.close_session.call_args[0][0] assert close_session_call_args.guid == b"\x22" assert close_session_call_args.secret == b"\x33" + + @patch("%s.session.ThriftDatabricksClient" % PACKAGE_NAME) + def test_query_tags_dict_sets_session_config(self, mock_client_class): + databricks.sql.connect( + query_tags={"team": "data-eng", "project": "etl"}, + **self.DUMMY_CONNECTION_ARGS, + ) + + call_kwargs = mock_client_class.return_value.open_session.call_args[1] + assert call_kwargs["session_configuration"]["QUERY_TAGS"] == "team:data-eng,project:etl" + + @patch("%s.session.ThriftDatabricksClient" % PACKAGE_NAME) + def test_query_tags_dict_takes_precedence_over_session_config(self, mock_client_class): + databricks.sql.connect( + query_tags={"team": "new-team"}, + session_configuration={"QUERY_TAGS": "team:old-team,other:value"}, + **self.DUMMY_CONNECTION_ARGS, + ) + + call_kwargs = mock_client_class.return_value.open_session.call_args[1] + assert call_kwargs["session_configuration"]["QUERY_TAGS"] == "team:new-team" From 12bfd5b7a86408da7ebbb61513dca72f90c50c4b Mon Sep 17 00:00:00 2001 From: Shubhambhusate <99666676+Shubhambhusate@users.noreply.github.com> Date: Tue, 10 Mar 2026 16:29:20 +0530 Subject: [PATCH 10/24] =?UTF-8?q?Fix=20float=20inference=20to=20use=20Doub?= =?UTF-8?q?leParameter=20(64-bit)=20instead=20of=20FloatP=E2=80=A6=20(#742?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix float inference to use DoubleParameter (64-bit) instead of FloatParameter (32-bit) Signed-off-by: Shubhambhusate * Add DoubleParameter with Primitive.DOUBLE to test_inference coverage --------- Signed-off-by: Shubhambhusate --- src/databricks/sql/parameters/native.py | 2 +- tests/unit/test_parameters.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/databricks/sql/parameters/native.py b/src/databricks/sql/parameters/native.py index b7c448254..d0fb8d82c 100644 --- a/src/databricks/sql/parameters/native.py +++ b/src/databricks/sql/parameters/native.py @@ -659,7 +659,7 @@ def dbsql_parameter_from_primitive( elif isinstance(value, str): return StringParameter(value=value, name=name) elif isinstance(value, float): - return FloatParameter(value=value, name=name) + return DoubleParameter(value=value, name=name) elif isinstance(value, datetime.datetime): return TimestampParameter(value=value, name=name) elif isinstance(value, datetime.date): diff --git a/tests/unit/test_parameters.py b/tests/unit/test_parameters.py index cf2e24951..0588eb499 100644 --- a/tests/unit/test_parameters.py +++ b/tests/unit/test_parameters.py @@ -295,7 +295,8 @@ def 
test_tspark_param_ordinal(self): (BigIntegerParameter, Primitive.BIGINT), (BooleanParameter, Primitive.BOOL), (DateParameter, Primitive.DATE), - (FloatParameter, Primitive.FLOAT), + (DoubleParameter, Primitive.DOUBLE), + (DoubleParameter, Primitive.FLOAT), (VoidParameter, Primitive.NONE), (TimestampParameter, Primitive.TIMESTAMP), (MapParameter, Primitive.MAP), @@ -305,7 +306,7 @@ def test_tspark_param_ordinal(self): def test_inference(self, _type: TDbsqlParameter, prim: Primitive): """This method only tests inferrable types. - Not tested are TinyIntParameter, SmallIntParameter DoubleParameter and TimestampNTZParameter + Not tested are TinyIntParameter, SmallIntParameter, FloatParameter and TimestampNTZParameter """ inferred_type = dbsql_parameter_from_primitive(prim.value) From 36fb3760b9e1eec552376004ec9494fe6b425ff1 Mon Sep 17 00:00:00 2001 From: Jothi Prakash Date: Tue, 10 Mar 2026 16:29:57 +0530 Subject: [PATCH 11/24] Updated the PyArrow concatenation of tables to use promote_options as default (#751) Updated pyarrow-concat --- src/databricks/sql/utils.py | 2 +- tests/unit/test_cloud_fetch_queue.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/sql/utils.py b/src/databricks/sql/utils.py index 125edbbaa..b1fff7202 100644 --- a/src/databricks/sql/utils.py +++ b/src/databricks/sql/utils.py @@ -895,7 +895,7 @@ def concat_table_chunks( result_table[j].extend(table_chunks[i].column_table[j]) return ColumnTable(result_table, table_chunks[0].column_names) else: - return pyarrow.concat_tables(table_chunks) + return pyarrow.concat_tables(table_chunks, promote_options="default") def serialize_query_tags( diff --git a/tests/unit/test_cloud_fetch_queue.py b/tests/unit/test_cloud_fetch_queue.py index 0c3fc7103..97bb99ad9 100644 --- a/tests/unit/test_cloud_fetch_queue.py +++ b/tests/unit/test_cloud_fetch_queue.py @@ -174,7 +174,7 @@ def test_next_n_rows_more_than_one_table(self, mock_create_next_table): assert ( result == pyarrow.concat_tables( - [self.make_arrow_table(), self.make_arrow_table()] + [self.make_arrow_table(), self.make_arrow_table()],promote_options="default" )[:7] ) @@ -266,7 +266,7 @@ def test_remaining_rows_multiple_tables_fully_returned( assert ( result == pyarrow.concat_tables( - [self.make_arrow_table(), self.make_arrow_table()] + [self.make_arrow_table(), self.make_arrow_table()], promote_options="default" )[3:] ) From ca4d7bcbcd6846bd9be1b079dcd84ce252e525b2 Mon Sep 17 00:00:00 2001 From: Sreekanth Date: Mon, 16 Mar 2026 14:50:33 +0530 Subject: [PATCH 12/24] Add statement-level query_tags support for SEA backend (#754) * Add statement-level query_tags support for SEA backend Signed-off-by: Sreekanth Vadigi * Simplify None handling in query_tags serialization Signed-off-by: Sreekanth Vadigi --------- Signed-off-by: Sreekanth Vadigi --- src/databricks/sql/backend/sea/backend.py | 5 +- .../sql/backend/sea/models/requests.py | 8 ++ tests/unit/test_sea_backend.py | 110 +++++++++++++++++- 3 files changed, 118 insertions(+), 5 deletions(-) diff --git a/src/databricks/sql/backend/sea/backend.py b/src/databricks/sql/backend/sea/backend.py index a6cff4913..ff130cd39 100644 --- a/src/databricks/sql/backend/sea/backend.py +++ b/src/databricks/sql/backend/sea/backend.py @@ -463,9 +463,7 @@ def execute_command( async_op: bool, enforce_embedded_schema_correctness: bool, row_limit: Optional[int] = None, - query_tags: Optional[ - Dict[str, Optional[str]] - ] = None, # TODO: implement query_tags for SEA backend + query_tags: Optional[Dict[str, 
Optional[str]]] = None, ) -> Union[SeaResultSet, None]: """ Execute a SQL command using the SEA backend. @@ -532,6 +530,7 @@ def execute_command( row_limit=row_limit, parameters=sea_parameters if sea_parameters else None, result_compression=result_compression, + query_tags=query_tags, ) response_data = self._http_client._make_request( diff --git a/src/databricks/sql/backend/sea/models/requests.py b/src/databricks/sql/backend/sea/models/requests.py index ad046ff54..eb156fb1a 100644 --- a/src/databricks/sql/backend/sea/models/requests.py +++ b/src/databricks/sql/backend/sea/models/requests.py @@ -31,6 +31,7 @@ class ExecuteStatementRequest: wait_timeout: str = "10s" on_wait_timeout: str = "CONTINUE" row_limit: Optional[int] = None + query_tags: Optional[Dict[str, Optional[str]]] = None def to_dict(self) -> Dict[str, Any]: """Convert the request to a dictionary for JSON serialization.""" @@ -60,6 +61,13 @@ def to_dict(self) -> Dict[str, Any]: for param in self.parameters ] + # SEA API expects query_tags as an array of {key, value} objects. + # None/empty values are left to the server to handle as key-only tags. + if self.query_tags: + result["query_tags"] = [ + {"key": k, "value": v} for k, v in self.query_tags.items() + ] + return result diff --git a/tests/unit/test_sea_backend.py b/tests/unit/test_sea_backend.py index 26a898cb8..f71e60943 100644 --- a/tests/unit/test_sea_backend.py +++ b/tests/unit/test_sea_backend.py @@ -185,7 +185,7 @@ def test_session_management(self, sea_client, mock_http_client, thrift_session_i session_config = { "ANSI_MODE": "FALSE", # Supported parameter "STATEMENT_TIMEOUT": "3600", # Supported parameter - "QUERY_TAGS": "team:marketing,dashboard:abc123", # Supported parameter + "QUERY_TAGS": "team:marketing,dashboard:abc123", # Supported parameter "unsupported_param": "value", # Unsupported parameter } catalog = "test_catalog" @@ -197,7 +197,7 @@ def test_session_management(self, sea_client, mock_http_client, thrift_session_i "session_confs": { "ansi_mode": "FALSE", "statement_timeout": "3600", - "query_tags": "team:marketing,dashboard:abc123", + "query_tags": "team:marketing,dashboard:abc123", }, "catalog": catalog, "schema": schema, @@ -416,6 +416,112 @@ def test_command_execution_advanced( ) assert "Command failed" in str(excinfo.value) + def _execute_response(self): + return { + "statement_id": "test-statement-123", + "status": {"state": "SUCCEEDED"}, + "manifest": {"schema": [], "total_row_count": 0, "total_byte_count": 0}, + "result": {"data": []}, + } + + def _run_execute_command(self, sea_client, sea_session_id, mock_cursor, **kwargs): + """Helper to invoke execute_command with default args.""" + return sea_client.execute_command( + operation="SELECT 1", + session_id=sea_session_id, + max_rows=100, + max_bytes=1000, + lz4_compression=False, + cursor=mock_cursor, + use_cloud_fetch=False, + parameters=[], + async_op=False, + enforce_embedded_schema_correctness=False, + **kwargs, + ) + + def test_execute_command_query_tags_string_values( + self, sea_client, mock_http_client, mock_cursor, sea_session_id + ): + """query_tags with string values are included in the request payload.""" + mock_http_client._make_request.return_value = self._execute_response() + with patch.object(sea_client, "_response_to_result_set"): + self._run_execute_command( + sea_client, + sea_session_id, + mock_cursor, + query_tags={"env": "prod", "team": "data"}, + ) + _, kwargs = mock_http_client._make_request.call_args + assert kwargs["data"]["query_tags"] == [ + {"key": "env", "value": 
"prod"}, + {"key": "team", "value": "data"}, + ] + + def test_execute_command_query_tags_none_value( + self, sea_client, mock_http_client, mock_cursor, sea_session_id + ): + """query_tags with a None value omit the value field (key-only tag).""" + mock_http_client._make_request.return_value = self._execute_response() + with patch.object(sea_client, "_response_to_result_set"): + self._run_execute_command( + sea_client, + sea_session_id, + mock_cursor, + query_tags={"env": "prod", "team": None}, + ) + _, kwargs = mock_http_client._make_request.call_args + assert kwargs["data"]["query_tags"] == [ + {"key": "env", "value": "prod"}, + {"key": "team", "value": None}, + ] + + def test_execute_command_no_query_tags_omitted( + self, sea_client, mock_http_client, mock_cursor, sea_session_id + ): + """query_tags field is absent from the request when not provided.""" + mock_http_client._make_request.return_value = self._execute_response() + with patch.object(sea_client, "_response_to_result_set"): + self._run_execute_command(sea_client, sea_session_id, mock_cursor) + _, kwargs = mock_http_client._make_request.call_args + assert "query_tags" not in kwargs["data"] + + def test_execute_command_empty_query_tags_omitted( + self, sea_client, mock_http_client, mock_cursor, sea_session_id + ): + """Empty query_tags dict is treated as absent — field omitted from request.""" + mock_http_client._make_request.return_value = self._execute_response() + with patch.object(sea_client, "_response_to_result_set"): + self._run_execute_command( + sea_client, sea_session_id, mock_cursor, query_tags={} + ) + _, kwargs = mock_http_client._make_request.call_args + assert "query_tags" not in kwargs["data"] + + def test_execute_command_async_query_tags( + self, sea_client, mock_http_client, mock_cursor, sea_session_id + ): + """query_tags are included in async execute requests (execute_async path).""" + mock_http_client._make_request.return_value = { + "statement_id": "test-statement-async", + "status": {"state": "PENDING"}, + } + sea_client.execute_command( + operation="SELECT 1", + session_id=sea_session_id, + max_rows=100, + max_bytes=1000, + lz4_compression=False, + cursor=mock_cursor, + use_cloud_fetch=False, + parameters=[], + async_op=True, + enforce_embedded_schema_correctness=False, + query_tags={"job": "nightly-etl"}, + ) + _, kwargs = mock_http_client._make_request.call_args + assert kwargs["data"]["query_tags"] == [{"key": "job", "value": "nightly-etl"}] + def test_command_management( self, sea_client, From 330c4454d2425a259f600b2126e01b9ce408274d Mon Sep 17 00:00:00 2001 From: Jothi Prakash Date: Fri, 27 Mar 2026 15:14:42 +0530 Subject: [PATCH 13/24] Harden CI/CD workflows: fix secret exposure, script injection, and pin all actions to SHA (#762) Updaed the security --- .github/workflows/code-coverage.yml | 21 ++++++++------ .github/workflows/code-quality-checks.yml | 35 ++++++++++++----------- .github/workflows/daily-telemetry-e2e.yml | 13 +++++---- .github/workflows/dco-check.yml | 12 ++++---- .github/workflows/integration.yml | 21 ++++++++------ .github/workflows/publish-manual.yml | 9 ++++-- .github/workflows/publish-test.yml | 19 ++++++++---- .github/workflows/publish.yml | 14 +++++---- 8 files changed, 86 insertions(+), 58 deletions(-) diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index 5c961757e..9250d0ca5 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -20,14 +20,12 @@ jobs: # check-out repo and set-up python 
#---------------------------------------------- - name: Check out repository - uses: actions/checkout@v4 + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref || github.ref_name }} - repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }} - name: Set up python id: setup-python - uses: actions/setup-python@v5 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.10" #---------------------------------------------- @@ -41,7 +39,7 @@ jobs: # ----- install & configure poetry ----- #---------------------------------------------- - name: Install Poetry - uses: snok/install-poetry@v1 + uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 with: version: "2.2.1" virtualenvs-create: true @@ -53,7 +51,7 @@ jobs: #---------------------------------------------- - name: Load cached venv id: cached-poetry-dependencies - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 with: path: .venv key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} @@ -105,8 +103,10 @@ jobs: #---------------------------------------------- - name: Check for coverage override id: override + env: + PR_BODY: ${{ github.event.pull_request.body }} run: | - OVERRIDE_COMMENT=$(echo "${{ github.event.pull_request.body }}" | grep -E "SKIP_COVERAGE_CHECK\s*=" || echo "") + OVERRIDE_COMMENT=$(echo "$PR_BODY" | grep -E "SKIP_COVERAGE_CHECK\s*=" || echo "") if [ -n "$OVERRIDE_COMMENT" ]; then echo "override=true" >> $GITHUB_OUTPUT REASON=$(echo "$OVERRIDE_COMMENT" | sed -E 's/.*SKIP_COVERAGE_CHECK\s*=\s*(.+)/\1/') @@ -153,9 +153,12 @@ jobs: # coverage enforcement summary #---------------------------------------------- - name: Coverage enforcement summary + env: + OVERRIDE: ${{ steps.override.outputs.override }} + REASON: ${{ steps.override.outputs.reason }} run: | - if [ "${{ steps.override.outputs.override }}" == "true" ]; then - echo "⚠️ Coverage checks bypassed: ${{ steps.override.outputs.reason }}" + if [ "$OVERRIDE" == "true" ]; then + echo "⚠️ Coverage checks bypassed: $REASON" echo "Please ensure this override is justified and temporary" else echo "✅ Coverage checks enforced - minimum 85% required" diff --git a/.github/workflows/code-quality-checks.yml b/.github/workflows/code-quality-checks.yml index cc3952920..13a889c56 100644 --- a/.github/workflows/code-quality-checks.yml +++ b/.github/workflows/code-quality-checks.yml @@ -2,6 +2,9 @@ name: Code Quality Checks on: [pull_request] +permissions: + contents: read + jobs: run-unit-tests: runs-on: ubuntu-latest @@ -23,17 +26,17 @@ jobs: # check-out repo and set-up python #---------------------------------------------- - name: Check out repository - uses: actions/checkout@v4 + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Set up python ${{ matrix.python-version }} id: setup-python - uses: actions/setup-python@v5 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: ${{ matrix.python-version }} #---------------------------------------------- # ----- install & configure poetry ----- #---------------------------------------------- - name: Install Poetry - uses: snok/install-poetry@v1 + uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 with: version: "2.2.1" virtualenvs-create: true @@ -45,7 
+48,7 @@ jobs: #---------------------------------------------- - name: Load cached venv id: cached-poetry-dependencies - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 with: path: .venv key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ matrix.dependency-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} @@ -107,17 +110,17 @@ jobs: # check-out repo and set-up python #---------------------------------------------- - name: Check out repository - uses: actions/checkout@v2 + uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2 - name: Set up python ${{ matrix.python-version }} id: setup-python - uses: actions/setup-python@v2 + uses: actions/setup-python@e9aba2c848f5ebd159c070c61ea2c4e2b122355e # v2 with: python-version: ${{ matrix.python-version }} #---------------------------------------------- # ----- install & configure poetry ----- #---------------------------------------------- - name: Install Poetry - uses: snok/install-poetry@v1 + uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 with: version: "2.2.1" virtualenvs-create: true @@ -129,7 +132,7 @@ jobs: #---------------------------------------------- - name: Load cached venv id: cached-poetry-dependencies - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 with: path: .venv-pyarrow key: venv-pyarrow-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ matrix.dependency-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} @@ -185,17 +188,17 @@ jobs: # check-out repo and set-up python #---------------------------------------------- - name: Check out repository - uses: actions/checkout@v4 + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Set up python ${{ matrix.python-version }} id: setup-python - uses: actions/setup-python@v5 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: ${{ matrix.python-version }} #---------------------------------------------- # ----- install & configure poetry ----- #---------------------------------------------- - name: Install Poetry - uses: snok/install-poetry@v1 + uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 with: version: "2.2.1" virtualenvs-create: true @@ -207,7 +210,7 @@ jobs: #---------------------------------------------- - name: Load cached venv id: cached-poetry-dependencies - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 with: path: .venv key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} @@ -238,17 +241,17 @@ jobs: # check-out repo and set-up python #---------------------------------------------- - name: Check out repository - uses: actions/checkout@v4 + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Set up python ${{ matrix.python-version }} id: setup-python - uses: actions/setup-python@v5 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: ${{ matrix.python-version }} #---------------------------------------------- # ----- install & configure poetry ----- #---------------------------------------------- - name: Install Poetry - uses: snok/install-poetry@v1 + uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 with: version: "2.2.1" 
virtualenvs-create: true @@ -260,7 +263,7 @@ jobs: #---------------------------------------------- - name: Load cached venv id: cached-poetry-dependencies - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 with: path: .venv key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} diff --git a/.github/workflows/daily-telemetry-e2e.yml b/.github/workflows/daily-telemetry-e2e.yml index d60b7f5a9..c3d0da5df 100644 --- a/.github/workflows/daily-telemetry-e2e.yml +++ b/.github/workflows/daily-telemetry-e2e.yml @@ -12,6 +12,9 @@ on: default: 'tests/e2e/test_telemetry_e2e.py' type: string +permissions: + contents: read + jobs: telemetry-e2e-tests: runs-on: ubuntu-latest @@ -29,11 +32,11 @@ jobs: # check-out repo and set-up python #---------------------------------------------- - name: Check out repository - uses: actions/checkout@v4 + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Set up python id: setup-python - uses: actions/setup-python@v5 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.10" @@ -41,7 +44,7 @@ jobs: # ----- install & configure poetry ----- #---------------------------------------------- - name: Install Poetry - uses: snok/install-poetry@v1 + uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 with: version: "2.2.1" virtualenvs-create: true @@ -53,7 +56,7 @@ jobs: #---------------------------------------------- - name: Load cached venv id: cached-poetry-dependencies - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 with: path: .venv key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} @@ -82,7 +85,7 @@ jobs: #---------------------------------------------- - name: Upload test results on failure if: failure() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: telemetry-test-results path: | diff --git a/.github/workflows/dco-check.yml b/.github/workflows/dco-check.yml index 050665ec9..770ffb9ec 100644 --- a/.github/workflows/dco-check.yml +++ b/.github/workflows/dco-check.yml @@ -2,17 +2,19 @@ name: DCO Check on: [pull_request] +permissions: + contents: read + pull-requests: write + jobs: check: - runs-on: - group: databricks-protected-runner-group - labels: linux-ubuntu-latest + runs-on: ubuntu-latest steps: - name: Check for DCO id: dco-check - uses: tisonkun/actions-dco@v1.1 + uses: tisonkun/actions-dco@6d1f8a197db1b04df1769707b46b9366b1eca902 # v1.1 - name: Comment about DCO status - uses: actions/github-script@v7 + uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7 if: ${{ failure() }} with: script: | diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index c915ee6c1..49dedfd91 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -4,7 +4,10 @@ on: push: branches: - main - pull_request: + pull_request: + +permissions: + contents: read jobs: run-non-telemetry-tests: @@ -21,17 +24,17 @@ jobs: # check-out repo and set-up python #---------------------------------------------- - name: Check out repository - uses: actions/checkout@v4 + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Set up python id: setup-python - uses: 
actions/setup-python@v5 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.10" #---------------------------------------------- # ----- install & configure poetry ----- #---------------------------------------------- - name: Install Poetry - uses: snok/install-poetry@v1 + uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 with: version: "2.2.1" virtualenvs-create: true @@ -43,7 +46,7 @@ jobs: #---------------------------------------------- - name: Load cached venv id: cached-poetry-dependencies - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 with: path: .venv key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} @@ -79,10 +82,10 @@ jobs: DATABRICKS_USER: ${{ secrets.TEST_PECO_SP_ID }} steps: - name: Check out repository - uses: actions/checkout@v4 + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Set up python id: setup-python - uses: actions/setup-python@v5 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.10" - name: Install system dependencies @@ -90,14 +93,14 @@ jobs: sudo apt-get update sudo apt-get install -y libkrb5-dev - name: Install Poetry - uses: snok/install-poetry@v1 + uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 with: virtualenvs-create: true virtualenvs-in-project: true installer-parallel: true - name: Load cached venv id: cached-poetry-dependencies - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 with: path: .venv key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} diff --git a/.github/workflows/publish-manual.yml b/.github/workflows/publish-manual.yml index 2f2a7a4dd..1802664a0 100644 --- a/.github/workflows/publish-manual.yml +++ b/.github/workflows/publish-manual.yml @@ -4,6 +4,9 @@ name: Publish to PyPI Manual [Production] on: workflow_dispatch: {} +permissions: + contents: read + jobs: publish: name: Publish @@ -14,14 +17,14 @@ jobs: # Step 1: Check out the repository code #---------------------------------------------- - name: Check out repository - uses: actions/checkout@v2 # Check out the repository to access the code + uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2 #---------------------------------------------- # Step 2: Set up Python environment #---------------------------------------------- - name: Set up python id: setup-python - uses: actions/setup-python@v2 + uses: actions/setup-python@e9aba2c848f5ebd159c070c61ea2c4e2b122355e # v2 with: python-version: 3.9 # Specify the Python version to be used @@ -29,7 +32,7 @@ jobs: # Step 3: Install and configure Poetry #---------------------------------------------- - name: Install Poetry - uses: snok/install-poetry@v1 # Install Poetry, the Python package manager + uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 with: version: "2.2.1" virtualenvs-create: true diff --git a/.github/workflows/publish-test.yml b/.github/workflows/publish-test.yml index 97a444e68..e967b8085 100644 --- a/.github/workflows/publish-test.yml +++ b/.github/workflows/publish-test.yml @@ -1,5 +1,12 @@ name: Publish to PyPI [Test] -on: [push] +on: + push: + branches: + - main + +permissions: + contents: read + jobs: test-pypi: name: Create patch version number 
and push to test-pypi @@ -9,17 +16,17 @@ jobs: # check-out repo and set-up python #---------------------------------------------- - name: Check out repository - uses: actions/checkout@v4 + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Set up python id: setup-python - uses: actions/setup-python@v5 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.10" #---------------------------------------------- # ----- install & configure poetry ----- #---------------------------------------------- - name: Install Poetry - uses: snok/install-poetry@v1 + uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 with: version: "2.2.1" virtualenvs-create: true @@ -30,7 +37,7 @@ jobs: #---------------------------------------------- - name: Load cached venv id: cached-poetry-dependencies - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 with: path: .venv key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} @@ -48,7 +55,7 @@ jobs: # Get the current version and increment it (test-pypi requires a unique version number) #---------------------------------------------- - name: Get next version - uses: reecetech/version-increment@2022.2.4 + uses: reecetech/version-increment@ddbbe72b7f76a996076fabfdce21a16384e8644a # 2022.2.4 id: version with: scheme: semver diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index b101f421c..0fd04d992 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -2,6 +2,10 @@ name: Publish to PyPI [Production] on: release: types: [published] + +permissions: + contents: read + jobs: publish: name: Publish @@ -11,17 +15,17 @@ jobs: # check-out repo and set-up python #---------------------------------------------- - name: Check out repository - uses: actions/checkout@v4 + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Set up python id: setup-python - uses: actions/setup-python@v5 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: 3.9 #---------------------------------------------- # ----- install & configure poetry ----- #---------------------------------------------- - name: Install Poetry - uses: snok/install-poetry@v1 + uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 with: version: "2.2.1" virtualenvs-create: true @@ -32,7 +36,7 @@ jobs: #---------------------------------------------- - name: Load cached venv id: cached-poetry-dependencies - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 with: path: .venv key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} @@ -50,7 +54,7 @@ jobs: # Here we use version-increment to fetch the latest tagged version (we won't increment it though) #------------------------------------------------------------------------------------------------ - name: Get next version - uses: reecetech/version-increment@2022.2.4 + uses: reecetech/version-increment@ddbbe72b7f76a996076fabfdce21a16384e8644a # 2022.2.4 id: version with: scheme: semver From 47933533071c11141b3e94248863644aa1735989 Mon Sep 17 00:00:00 2001 From: Jothi Prakash Date: Mon, 30 Mar 2026 09:59:34 +0530 Subject: [PATCH 14/24] Removed Publish workflow (#764) --- 
.github/workflows/publish-manual.yml | 90 ---------------------------- .github/workflows/publish-test.yml | 82 ------------------------- .github/workflows/publish.yml | 76 ----------------------- 3 files changed, 248 deletions(-) delete mode 100644 .github/workflows/publish-manual.yml delete mode 100644 .github/workflows/publish-test.yml delete mode 100644 .github/workflows/publish.yml diff --git a/.github/workflows/publish-manual.yml b/.github/workflows/publish-manual.yml deleted file mode 100644 index 1802664a0..000000000 --- a/.github/workflows/publish-manual.yml +++ /dev/null @@ -1,90 +0,0 @@ -name: Publish to PyPI Manual [Production] - -# Allow manual triggering of the workflow -on: - workflow_dispatch: {} - -permissions: - contents: read - -jobs: - publish: - name: Publish - runs-on: ubuntu-latest - - steps: - #---------------------------------------------- - # Step 1: Check out the repository code - #---------------------------------------------- - - name: Check out repository - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2 - - #---------------------------------------------- - # Step 2: Set up Python environment - #---------------------------------------------- - - name: Set up python - id: setup-python - uses: actions/setup-python@e9aba2c848f5ebd159c070c61ea2c4e2b122355e # v2 - with: - python-version: 3.9 # Specify the Python version to be used - - #---------------------------------------------- - # Step 3: Install and configure Poetry - #---------------------------------------------- - - name: Install Poetry - uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 - with: - version: "2.2.1" - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - #---------------------------------------------- - # Step 3.5: Install Kerberos system dependencies - #---------------------------------------------- - - name: Install Kerberos system dependencies - run: | - sudo apt-get update - sudo apt-get install -y libkrb5-dev - -# #---------------------------------------------- -# # Step 4: Load cached virtual environment (if available) -# #---------------------------------------------- -# - name: Load cached venv -# id: cached-poetry-dependencies -# uses: actions/cache@v2 -# with: -# path: .venv # Path to the virtual environment -# key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} -# # Cache key is generated based on OS, Python version, repo name, and the `poetry.lock` file hash - -# #---------------------------------------------- -# # Step 5: Install dependencies if the cache is not found -# #---------------------------------------------- -# - name: Install dependencies -# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' # Only run if the cache was not hit -# run: poetry install --no-interaction --no-root # Install dependencies without interaction - -# #---------------------------------------------- -# # Step 6: Update the version to the manually provided version -# #---------------------------------------------- -# - name: Update pyproject.toml with the specified version -# run: poetry version ${{ github.event.inputs.version }} # Use the version provided by the user input - - #---------------------------------------------- - # Step 7: Build and publish the first package to PyPI - #---------------------------------------------- - - name: Build and publish databricks sql connector to PyPI - working-directory: 
./databricks_sql_connector - run: | - poetry build - poetry publish -u __token__ -p ${{ secrets.PROD_PYPI_TOKEN }} # Publish with PyPI token - #---------------------------------------------- - # Step 7: Build and publish the second package to PyPI - #---------------------------------------------- - - - name: Build and publish databricks sql connector core to PyPI - working-directory: ./databricks_sql_connector_core - run: | - poetry build - poetry publish -u __token__ -p ${{ secrets.PROD_PYPI_TOKEN }} # Publish with PyPI token \ No newline at end of file diff --git a/.github/workflows/publish-test.yml b/.github/workflows/publish-test.yml deleted file mode 100644 index e967b8085..000000000 --- a/.github/workflows/publish-test.yml +++ /dev/null @@ -1,82 +0,0 @@ -name: Publish to PyPI [Test] -on: - push: - branches: - - main - -permissions: - contents: read - -jobs: - test-pypi: - name: Create patch version number and push to test-pypi - runs-on: ubuntu-latest - steps: - #---------------------------------------------- - # check-out repo and set-up python - #---------------------------------------------- - - name: Check out repository - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - name: Set up python - id: setup-python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 - with: - python-version: "3.10" - #---------------------------------------------- - # ----- install & configure poetry ----- - #---------------------------------------------- - - name: Install Poetry - uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 - with: - version: "2.2.1" - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - #---------------------------------------------- - # load cached venv if cache exists - #---------------------------------------------- - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 - with: - path: .venv - key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} - #---------------------------------------------- - # install dependencies if cache does not exist - #---------------------------------------------- - - name: Install Kerberos system dependencies - run: | - sudo apt-get update - sudo apt-get install -y libkrb5-dev - - name: Install dependencies - if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --no-root - #---------------------------------------------- - # Get the current version and increment it (test-pypi requires a unique version number) - #---------------------------------------------- - - name: Get next version - uses: reecetech/version-increment@ddbbe72b7f76a996076fabfdce21a16384e8644a # 2022.2.4 - id: version - with: - scheme: semver - increment: patch - #---------------------------------------------- - # Tell poetry to update the version number - #---------------------------------------------- - - name: Update pyproject.toml - run: poetry version ${{ steps.version.outputs.major-version }}.${{ steps.version.outputs.minor-version }}.dev$(date +%s) - #---------------------------------------------- - # Build the package (before publish action) - #---------------------------------------------- - - name: Build package - run: poetry build - #---------------------------------------------- - # Configure test-pypi repository - 
#---------------------------------------------- - - name: Configure test-pypi repository - run: poetry config repositories.testpypi https://test.pypi.org/legacy/ - #---------------------------------------------- - # Attempt push to test-pypi - #---------------------------------------------- - - name: Publish to test-pypi - run: poetry publish --username __token__ --password ${{ secrets.TEST_PYPI_TOKEN }} --repository testpypi diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml deleted file mode 100644 index 0fd04d992..000000000 --- a/.github/workflows/publish.yml +++ /dev/null @@ -1,76 +0,0 @@ -name: Publish to PyPI [Production] -on: - release: - types: [published] - -permissions: - contents: read - -jobs: - publish: - name: Publish - runs-on: ubuntu-latest - steps: - #---------------------------------------------- - # check-out repo and set-up python - #---------------------------------------------- - - name: Check out repository - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - name: Set up python - id: setup-python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 - with: - python-version: 3.9 - #---------------------------------------------- - # ----- install & configure poetry ----- - #---------------------------------------------- - - name: Install Poetry - uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 - with: - version: "2.2.1" - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - #---------------------------------------------- - # load cached venv if cache exists - #---------------------------------------------- - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 - with: - path: .venv - key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} - #---------------------------------------------- - # install dependencies if cache does not exist - #---------------------------------------------- - - name: Install Kerberos system dependencies - run: | - sudo apt-get update - sudo apt-get install -y libkrb5-dev - - name: Install dependencies - if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --no-root - #------------------------------------------------------------------------------------------------ - # Here we use version-increment to fetch the latest tagged version (we won't increment it though) - #------------------------------------------------------------------------------------------------ - - name: Get next version - uses: reecetech/version-increment@ddbbe72b7f76a996076fabfdce21a16384e8644a # 2022.2.4 - id: version - with: - scheme: semver - increment: patch - #----------------------------------------------------------------------------- - # Tell poetry to use the `current-version` that was found by the previous step - #----------------------------------------------------------------------------- - - name: Update pyproject.toml - run: poetry version ${{ steps.version.outputs.current-version }} - #---------------------------------------------- - # Build the package (before publish) - #---------------------------------------------- - - name: Build package - run: poetry build - #---------------------------------------------- - # Publish to pypi - #---------------------------------------------- - - name: Publish to pypi - run: poetry publish 
--username __token__ --password ${{ secrets.PROD_PYPI_TOKEN }} From e056275e4e88879749517a4607a54e05972dadc3 Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Mon, 13 Apr 2026 17:33:31 +0530 Subject: [PATCH 15/24] Replace third-party DCO action with custom script (#769) The tisonkun/actions-dco action has been unreliable. Replace it with an inline bash script (matching databricks-sql-go) that checks each commit for a Signed-off-by line, provides clear per-commit feedback, and scopes the trigger to opened/synchronize/reopened events on main. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala --- .github/workflows/dco-check.yml | 81 +++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/.github/workflows/dco-check.yml b/.github/workflows/dco-check.yml index 770ffb9ec..b504ad84b 100644 --- a/.github/workflows/dco-check.yml +++ b/.github/workflows/dco-check.yml @@ -1,29 +1,72 @@ name: DCO Check -on: [pull_request] +on: + pull_request: + types: [opened, synchronize, reopened] + branches: [main] permissions: contents: read - pull-requests: write jobs: - check: + dco-check: runs-on: ubuntu-latest + name: Check DCO Sign-off steps: - - name: Check for DCO - id: dco-check - uses: tisonkun/actions-dco@6d1f8a197db1b04df1769707b46b9366b1eca902 # v1.1 - - name: Comment about DCO status - uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7 - if: ${{ failure() }} + - name: Checkout + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: - script: | - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: `Thanks for your contribution! To satisfy the DCO policy in our \ - [contributing guide](https://github.com/databricks/databricks-sql-python/blob/main/CONTRIBUTING.md) \ - every commit message must include a sign-off message. One or more of your commits is missing this message. \ - You can reword previous commit messages with an interactive rebase (\`git rebase -i main\`).` - }) + fetch-depth: 0 + + - name: Check DCO Sign-off + env: + BASE_SHA: ${{ github.event.pull_request.base.sha }} + HEAD_SHA: ${{ github.event.pull_request.head.sha }} + run: | + #!/bin/bash + set -e + + echo "Checking commits from $BASE_SHA to $HEAD_SHA" + + COMMITS=$(git rev-list --no-merges "$BASE_SHA..$HEAD_SHA") + + if [ -z "$COMMITS" ]; then + echo "No commits found in this PR" + exit 0 + fi + + FAILED_COMMITS=() + + for commit in $COMMITS; do + echo "Checking commit: $commit" + COMMIT_MSG=$(git log --format=%B -n 1 "$commit") + if echo "$COMMIT_MSG" | grep -q "^Signed-off-by: "; then + echo " Commit $commit has DCO sign-off" + else + echo " Commit $commit is missing DCO sign-off" + FAILED_COMMITS+=("$commit") + fi + done + + if [ ${#FAILED_COMMITS[@]} -ne 0 ]; then + echo "" + echo "DCO Check Failed!" + echo "The following commits are missing the required 'Signed-off-by' line:" + for commit in "${FAILED_COMMITS[@]}"; do + echo " - $commit: $(git log --format=%s -n 1 "$commit")" + done + echo "" + echo "To fix this, you need to sign off your commits. You can:" + echo "1. Add sign-off to new commits: git commit -s -m 'Your commit message'" + echo "2. Amend existing commits: git commit --amend --signoff" + echo "3. 
For multiple commits, use: git rebase --signoff HEAD~N (where N is the number of commits)" + echo "" + echo "The sign-off should be in the format:" + echo "Signed-off-by: Your Name " + echo "" + echo "For more details, see CONTRIBUTING.md" + exit 1 + else + echo "" + echo "All commits have proper DCO sign-off!" + fi From fbdcd32307ae65aa8a404954b8e132eed9f418a1 Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Mon, 13 Apr 2026 18:28:47 +0530 Subject: [PATCH 16/24] Migrate CI to protected runners and JFrog PyPI proxy (#770) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate CI to databricks-protected runners and route PyPI through JFrog Protected runners are required for Databricks OSS repos. Add a setup-jfrog composite action (OIDC-based, matching databricks-odbc) that sets PIP_INDEX_URL so all pip/poetry installs go through the JFrog PyPI proxy. Every workflow now runs on the databricks-protected-runner-group with id-token: write for the OIDC exchange. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala * Add Poetry JFrog source configuration to all workflows The previous commit only set PIP_INDEX_URL, but Poetry uses its own resolver and needs explicit source configuration. Add a "Configure Poetry for JFrog" step after poetry install in every job that sets up the JFrog repository and credentials, then adds it as the primary source for the project. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala * Fix step ordering: move JFrog setup after poetry install The snok/install-poetry action uses pip internally to install poetry. When PIP_INDEX_URL was set before this step, the installer tried to route through JFrog and failed with an SSL error. Move the JFrog OIDC token + PIP_INDEX_URL + poetry source configuration to run after Install Poetry but before poetry install. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala * Replace snok/install-poetry with pip install through JFrog The hardened runners block direct access to install.python-poetry.org, causing snok/install-poetry to fail with SSL errors. Replace it with `pip install poetry==2.2.1` which routes through the JFrog PyPI proxy. New step ordering: checkout → setup-python → Setup JFrog (OIDC + PIP_INDEX_URL) → pip install poetry → Configure Poetry for JFrog → poetry install. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala * Add poetry lock --no-update after source add to fix lock mismatch poetry source add modifies pyproject.toml, which makes poetry refuse to install from the existing lock file. Running poetry lock --no-update regenerates the lock file metadata without changing dependency versions. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala * Fix poetry lock flag and YAML indentation Poetry 2.x doesn't have --no-update flag, use poetry lock instead. Also fix indentation of poetry lock in the arrow test job. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala * Move JFrog setup before setup-python, matching sqlalchemy pattern Follow the proven pattern from databricks/databricks-sqlalchemy#59: checkout → Setup JFrog → setup-python → pip install poetry → poetry source add + poetry lock → poetry install. The hardened runners block pypi.org at the network level, so JFrog must be configured before actions/setup-python (which upgrades pip). Also simplified workflows by removing verbose section comments. 
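A condensed sketch of that final ordering, assuming the setup-jfrog composite action added later in this patch; step names are illustrative, action versions are shown as tags rather than the pinned SHAs the real workflows use, and the authoritative version is the setup-poetry action below:

    jobs:
      example:
        runs-on:
          group: databricks-protected-runner-group
          labels: linux-ubuntu-latest
        permissions:
          contents: read
          id-token: write   # needed for the GitHub OIDC exchange with JFrog
        steps:
          - uses: actions/checkout@v4
          # OIDC exchange; exports JFROG_ACCESS_TOKEN and PIP_INDEX_URL
          - uses: ./.github/actions/setup-jfrog
          - uses: actions/setup-python@v5
            with:
              python-version: "3.10"
          # pip already routes through JFrog via PIP_INDEX_URL
          - run: pip install poetry==2.2.1
          # point Poetry's own resolver at JFrog, then refresh lock metadata
          - run: |
              poetry source add --priority=primary jfrog https://databricks.jfrog.io/artifactory/api/pypi/db-pypi/simple
              poetry lock
          - run: poetry install --no-interaction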
Co-authored-by: Isaac Signed-off-by: Vikrant Puppala * Extract setup-poetry composite action to remove duplication Create .github/actions/setup-poetry that bundles JFrog setup, setup-python, poetry install via pip, JFrog source config, cache, and dependency install into a single reusable action with inputs for python-version, install-args, cache-path, and cache-suffix. All workflows now call setup-poetry instead of repeating these steps, matching the pattern from databricks/databricks-sqlalchemy#59. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala --------- Signed-off-by: Vikrant Puppala --- .github/actions/setup-jfrog/action.yml | 32 +++ .github/actions/setup-poetry/action.yml | 63 +++++ .github/workflows/code-coverage.yml | 84 +------ .github/workflows/code-quality-checks.yml | 265 +++++----------------- .github/workflows/daily-telemetry-e2e.yml | 61 +---- .github/workflows/dco-check.yml | 4 +- .github/workflows/integration.yml | 81 ++----- 7 files changed, 196 insertions(+), 394 deletions(-) create mode 100644 .github/actions/setup-jfrog/action.yml create mode 100644 .github/actions/setup-poetry/action.yml diff --git a/.github/actions/setup-jfrog/action.yml b/.github/actions/setup-jfrog/action.yml new file mode 100644 index 000000000..97ae146ba --- /dev/null +++ b/.github/actions/setup-jfrog/action.yml @@ -0,0 +1,32 @@ +name: Setup JFrog OIDC +description: Obtain a JFrog access token via GitHub OIDC and configure pip to use JFrog PyPI proxy + +runs: + using: composite + steps: + - name: Get JFrog OIDC token + shell: bash + run: | + set -euo pipefail + ID_TOKEN=$(curl -sLS \ + -H "User-Agent: actions/oidc-client" \ + -H "Authorization: Bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \ + "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=jfrog-github" | jq .value | tr -d '"') + echo "::add-mask::${ID_TOKEN}" + ACCESS_TOKEN=$(curl -sLS -XPOST -H "Content-Type: application/json" \ + "https://databricks.jfrog.io/access/api/v1/oidc/token" \ + -d "{\"grant_type\": \"urn:ietf:params:oauth:grant-type:token-exchange\", \"subject_token_type\":\"urn:ietf:params:oauth:token-type:id_token\", \"subject_token\": \"${ID_TOKEN}\", \"provider_name\": \"github-actions\"}" | jq .access_token | tr -d '"') + echo "::add-mask::${ACCESS_TOKEN}" + if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then + echo "FAIL: Could not extract JFrog access token" + exit 1 + fi + echo "JFROG_ACCESS_TOKEN=${ACCESS_TOKEN}" >> "$GITHUB_ENV" + echo "JFrog OIDC token obtained successfully" + + - name: Configure pip + shell: bash + run: | + set -euo pipefail + echo "PIP_INDEX_URL=https://gha-service-account:${JFROG_ACCESS_TOKEN}@databricks.jfrog.io/artifactory/api/pypi/db-pypi/simple" >> "$GITHUB_ENV" + echo "pip configured to use JFrog registry" diff --git a/.github/actions/setup-poetry/action.yml b/.github/actions/setup-poetry/action.yml new file mode 100644 index 000000000..f7e15b1c0 --- /dev/null +++ b/.github/actions/setup-poetry/action.yml @@ -0,0 +1,63 @@ +name: Setup Poetry with JFrog +description: Install Poetry, configure JFrog as primary PyPI source, and install project dependencies + +inputs: + python-version: + description: Python version to set up + required: true + install-args: + description: Extra arguments for poetry install (e.g. --all-extras) + required: false + default: "" + cache-path: + description: Path to the virtualenv for caching (e.g. 
.venv or .venv-pyarrow) + required: false + default: ".venv" + cache-suffix: + description: Extra suffix for the cache key to avoid collisions across job variants + required: false + default: "" + +runs: + using: composite + steps: + - name: Setup JFrog + uses: ./.github/actions/setup-jfrog + + - name: Set up python ${{ inputs.python-version }} + id: setup-python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: ${{ inputs.python-version }} + + - name: Install Poetry + shell: bash + run: | + pip install poetry==2.2.1 + poetry config virtualenvs.create true + poetry config virtualenvs.in-project true + poetry config installer.parallel true + + - name: Configure Poetry JFrog source + shell: bash + run: | + poetry config repositories.jfrog https://databricks.jfrog.io/artifactory/api/pypi/db-pypi/simple + poetry config http-basic.jfrog gha-service-account "${JFROG_ACCESS_TOKEN}" + poetry source add --priority=primary jfrog https://databricks.jfrog.io/artifactory/api/pypi/db-pypi/simple + poetry lock + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 + with: + path: ${{ inputs.cache-path }} + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ inputs.cache-suffix }}${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} + + - name: Install dependencies + if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + shell: bash + run: poetry install --no-interaction --no-root + + - name: Install library + shell: bash + run: poetry install --no-interaction ${{ inputs.install-args }} diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index 9250d0ca5..c188c5b3a 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -2,12 +2,15 @@ name: Code Coverage permissions: contents: read + id-token: write on: [pull_request, workflow_dispatch] jobs: test-with-coverage: - runs-on: ubuntu-latest + runs-on: + group: databricks-protected-runner-group + labels: linux-ubuntu-latest environment: azure-prod env: DATABRICKS_SERVER_HOSTNAME: ${{ secrets.DATABRICKS_HOST }} @@ -16,63 +19,19 @@ jobs: DATABRICKS_CATALOG: peco DATABRICKS_USER: ${{ secrets.TEST_PECO_SP_ID }} steps: - #---------------------------------------------- - # check-out repo and set-up python - #---------------------------------------------- - name: Check out repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: fetch-depth: 0 - - name: Set up python - id: setup-python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 - with: - python-version: "3.10" - #---------------------------------------------- - # ----- install system dependencies ----- - #---------------------------------------------- - name: Install system dependencies run: | sudo apt-get update sudo apt-get install -y libkrb5-dev - #---------------------------------------------- - # ----- install & configure poetry ----- - #---------------------------------------------- - - name: Install Poetry - uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 + - name: Setup Poetry + uses: ./.github/actions/setup-poetry with: - version: "2.2.1" - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - #---------------------------------------------- - # load cached venv if cache exists - #---------------------------------------------- - - name: Load 
cached venv - id: cached-poetry-dependencies - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 - with: - path: .venv - key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} - #---------------------------------------------- - # install dependencies if cache does not exist - #---------------------------------------------- - - name: Install dependencies - if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --no-root - #---------------------------------------------- - # install your root project, if required - #---------------------------------------------- - - name: Install Kerberos system dependencies - run: | - sudo apt-get update - sudo apt-get install -y libkrb5-dev - - name: Install library - run: poetry install --no-interaction --all-extras - #---------------------------------------------- - # run parallel tests with coverage - #---------------------------------------------- + python-version: "3.10" + install-args: "--all-extras" - name: Run parallel tests with coverage continue-on-error: false run: | @@ -83,24 +42,15 @@ jobs: --cov-report=xml \ --cov-report=term \ -v - - #---------------------------------------------- - # run telemetry tests with coverage (isolated) - #---------------------------------------------- - name: Run telemetry tests with coverage (isolated) continue-on-error: false run: | - # Run test_concurrent_telemetry.py separately for isolation poetry run pytest tests/e2e/test_concurrent_telemetry.py \ --cov=src \ --cov-append \ --cov-report=xml \ --cov-report=term \ -v - - #---------------------------------------------- - # check for coverage override - #---------------------------------------------- - name: Check for coverage override id: override env: @@ -116,9 +66,6 @@ jobs: echo "override=false" >> $GITHUB_OUTPUT echo "No coverage override found" fi - #---------------------------------------------- - # check coverage percentage - #---------------------------------------------- - name: Check coverage percentage if: steps.override.outputs.override == 'false' run: | @@ -127,20 +74,14 @@ jobs: echo "ERROR: Coverage file not found at $COVERAGE_FILE" exit 1 fi - - # Install xmllint if not available if ! command -v xmllint &> /dev/null; then sudo apt-get update && sudo apt-get install -y libxml2-utils fi - COVERED=$(xmllint --xpath "string(//coverage/@lines-covered)" "$COVERAGE_FILE") TOTAL=$(xmllint --xpath "string(//coverage/@lines-valid)" "$COVERAGE_FILE") PERCENTAGE=$(python3 -c "covered=${COVERED}; total=${TOTAL}; print(round((covered/total)*100, 2))") - echo "Branch Coverage: $PERCENTAGE%" echo "Required Coverage: 85%" - - # Use Python to compare the coverage with 85 python3 -c "import sys; sys.exit(0 if float('$PERCENTAGE') >= 85 else 1)" if [ $? 
-eq 1 ]; then echo "ERROR: Coverage is $PERCENTAGE%, which is less than the required 85%" @@ -148,19 +89,14 @@ jobs: else echo "SUCCESS: Coverage is $PERCENTAGE%, which meets the required 85%" fi - - #---------------------------------------------- - # coverage enforcement summary - #---------------------------------------------- - name: Coverage enforcement summary env: OVERRIDE: ${{ steps.override.outputs.override }} REASON: ${{ steps.override.outputs.reason }} run: | if [ "$OVERRIDE" == "true" ]; then - echo "⚠️ Coverage checks bypassed: $REASON" + echo "Coverage checks bypassed: $REASON" echo "Please ensure this override is justified and temporary" else - echo "✅ Coverage checks enforced - minimum 85% required" + echo "Coverage checks enforced - minimum 85% required" fi - diff --git a/.github/workflows/code-quality-checks.yml b/.github/workflows/code-quality-checks.yml index 13a889c56..ecc238263 100644 --- a/.github/workflows/code-quality-checks.yml +++ b/.github/workflows/code-quality-checks.yml @@ -4,95 +4,56 @@ on: [pull_request] permissions: contents: read + id-token: write jobs: run-unit-tests: - runs-on: ubuntu-latest + runs-on: + group: databricks-protected-runner-group + labels: linux-ubuntu-latest strategy: matrix: python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] dependency-version: ["default", "min"] - # Optimize matrix - test min/max on subset of Python versions exclude: - python-version: "3.12" dependency-version: "min" - python-version: "3.13" dependency-version: "min" - + name: "Unit Tests (Python ${{ matrix.python-version }}, ${{ matrix.dependency-version }} deps)" - + steps: - #---------------------------------------------- - # check-out repo and set-up python - #---------------------------------------------- - name: Check out repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - name: Set up python ${{ matrix.python-version }} - id: setup-python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + - name: Setup Poetry + uses: ./.github/actions/setup-poetry with: python-version: ${{ matrix.python-version }} - #---------------------------------------------- - # ----- install & configure poetry ----- - #---------------------------------------------- - - name: Install Poetry - uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 - with: - version: "2.2.1" - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - #---------------------------------------------- - # load cached venv if cache exists - #---------------------------------------------- - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 - with: - path: .venv - key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ matrix.dependency-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} - #---------------------------------------------- - # install dependencies if cache does not exist - #---------------------------------------------- - - name: Install dependencies - if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --no-root - #---------------------------------------------- - # install your root project, if required - #---------------------------------------------- - - name: Install library - run: poetry install --no-interaction - #---------------------------------------------- - # override with custom dependency versions 
- #---------------------------------------------- + cache-suffix: "${{ matrix.dependency-version }}-" - name: Install Python tools for custom versions if: matrix.dependency-version != 'default' run: poetry run pip install toml packaging - - name: Generate requirements file if: matrix.dependency-version != 'default' run: | poetry run python scripts/dependency_manager.py ${{ matrix.dependency-version }} --output requirements-${{ matrix.dependency-version }}.txt echo "Generated requirements for ${{ matrix.dependency-version }} versions:" cat requirements-${{ matrix.dependency-version }}.txt - - name: Override with custom dependency versions if: matrix.dependency-version != 'default' run: poetry run pip install -r requirements-${{ matrix.dependency-version }}.txt - - #---------------------------------------------- - # run test suite - #---------------------------------------------- - name: Show installed versions run: | echo "=== Dependency Version: ${{ matrix.dependency-version }} ===" poetry run pip list - - name: Run tests run: poetry run python -m pytest tests/unit + run-unit-tests-with-arrow: - runs-on: ubuntu-latest + runs-on: + group: databricks-protected-runner-group + labels: linux-ubuntu-latest strategy: matrix: python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] @@ -102,186 +63,74 @@ jobs: dependency-version: "min" - python-version: "3.13" dependency-version: "min" - - name: "Unit Tests + PyArrow (Python ${{ matrix.python-version }}, ${{ matrix.dependency-version }} deps)" - - steps: - #---------------------------------------------- - # check-out repo and set-up python - #---------------------------------------------- - - name: Check out repository - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2 - - name: Set up python ${{ matrix.python-version }} - id: setup-python - uses: actions/setup-python@e9aba2c848f5ebd159c070c61ea2c4e2b122355e # v2 - with: - python-version: ${{ matrix.python-version }} - #---------------------------------------------- - # ----- install & configure poetry ----- - #---------------------------------------------- - - name: Install Poetry - uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 - with: - version: "2.2.1" - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - #---------------------------------------------- - # load cached venv if cache exists - #---------------------------------------------- - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 - with: - path: .venv-pyarrow - key: venv-pyarrow-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ matrix.dependency-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} - #---------------------------------------------- - # install dependencies if cache does not exist - #---------------------------------------------- - - name: Install dependencies - if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --no-root - #---------------------------------------------- - # install your root project, if required - #---------------------------------------------- - - name: Install Kerberos system dependencies - run: | - sudo apt-get update - sudo apt-get install -y libkrb5-dev - - name: Install library - run: poetry install --no-interaction --all-extras - #---------------------------------------------- - # override with custom dependency versions - 
#---------------------------------------------- - - name: Install Python tools for custom versions - if: matrix.dependency-version != 'default' - run: poetry run pip install toml packaging - - name: Generate requirements file with pyarrow - if: matrix.dependency-version != 'default' - run: | - poetry run python scripts/dependency_manager.py ${{ matrix.dependency-version }} --output requirements-${{ matrix.dependency-version }}-arrow.txt - echo "Generated requirements for ${{ matrix.dependency-version }} versions with PyArrow:" - cat requirements-${{ matrix.dependency-version }}-arrow.txt + name: "Unit Tests + PyArrow (Python ${{ matrix.python-version }}, ${{ matrix.dependency-version }} deps)" - - name: Override with custom dependency versions - if: matrix.dependency-version != 'default' - run: poetry run pip install -r requirements-${{ matrix.dependency-version }}-arrow.txt - #---------------------------------------------- - # run test suite - #---------------------------------------------- - - name: Show installed versions - run: | - echo "=== Dependency Version: ${{ matrix.dependency-version }} with PyArrow ===" - poetry run pip list + steps: + - name: Check out repository + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + - name: Install Kerberos system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libkrb5-dev + - name: Setup Poetry + uses: ./.github/actions/setup-poetry + with: + python-version: ${{ matrix.python-version }} + install-args: "--all-extras" + cache-path: ".venv-pyarrow" + cache-suffix: "pyarrow-${{ matrix.dependency-version }}-" + - name: Install Python tools for custom versions + if: matrix.dependency-version != 'default' + run: poetry run pip install toml packaging + - name: Generate requirements file with pyarrow + if: matrix.dependency-version != 'default' + run: | + poetry run python scripts/dependency_manager.py ${{ matrix.dependency-version }} --output requirements-${{ matrix.dependency-version }}-arrow.txt + echo "Generated requirements for ${{ matrix.dependency-version }} versions with PyArrow:" + cat requirements-${{ matrix.dependency-version }}-arrow.txt + - name: Override with custom dependency versions + if: matrix.dependency-version != 'default' + run: poetry run pip install -r requirements-${{ matrix.dependency-version }}-arrow.txt + - name: Show installed versions + run: | + echo "=== Dependency Version: ${{ matrix.dependency-version }} with PyArrow ===" + poetry run pip list + - name: Run tests + run: poetry run python -m pytest tests/unit - - name: Run tests - run: poetry run python -m pytest tests/unit check-linting: - runs-on: ubuntu-latest + runs-on: + group: databricks-protected-runner-group + labels: linux-ubuntu-latest strategy: matrix: python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] steps: - #---------------------------------------------- - # check-out repo and set-up python - #---------------------------------------------- - name: Check out repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - name: Set up python ${{ matrix.python-version }} - id: setup-python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + - name: Setup Poetry + uses: ./.github/actions/setup-poetry with: python-version: ${{ matrix.python-version }} - #---------------------------------------------- - # ----- install & configure poetry ----- - #---------------------------------------------- - - name: Install Poetry - uses: 
snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 - with: - version: "2.2.1" - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - #---------------------------------------------- - # load cached venv if cache exists - #---------------------------------------------- - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 - with: - path: .venv - key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} - #---------------------------------------------- - # install dependencies if cache does not exist - #---------------------------------------------- - - name: Install dependencies - if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --no-root - #---------------------------------------------- - # install your root project, if required - #---------------------------------------------- - - name: Install library - run: poetry install --no-interaction - #---------------------------------------------- - # black the code - #---------------------------------------------- - name: Black run: poetry run black --check src check-types: - runs-on: ubuntu-latest + runs-on: + group: databricks-protected-runner-group + labels: linux-ubuntu-latest strategy: matrix: python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] steps: - #---------------------------------------------- - # check-out repo and set-up python - #---------------------------------------------- - name: Check out repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - name: Set up python ${{ matrix.python-version }} - id: setup-python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + - name: Setup Poetry + uses: ./.github/actions/setup-poetry with: python-version: ${{ matrix.python-version }} - #---------------------------------------------- - # ----- install & configure poetry ----- - #---------------------------------------------- - - name: Install Poetry - uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 - with: - version: "2.2.1" - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - #---------------------------------------------- - # load cached venv if cache exists - #---------------------------------------------- - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 - with: - path: .venv - key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} - #---------------------------------------------- - # install dependencies if cache does not exist - #---------------------------------------------- - - name: Install dependencies - if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --no-root - #---------------------------------------------- - # install your root project, if required - #---------------------------------------------- - - name: Install library - run: poetry install --no-interaction - #---------------------------------------------- - # mypy the code - #---------------------------------------------- - name: Mypy run: | - mkdir .mypy_cache # Workaround for bad error message "error: --install-types failed (no mypy cache directory)"; see 
https://github.com/python/mypy/issues/10768#issuecomment-2178450153 + mkdir .mypy_cache poetry run mypy --install-types --non-interactive src diff --git a/.github/workflows/daily-telemetry-e2e.yml b/.github/workflows/daily-telemetry-e2e.yml index c3d0da5df..b6f78726c 100644 --- a/.github/workflows/daily-telemetry-e2e.yml +++ b/.github/workflows/daily-telemetry-e2e.yml @@ -3,7 +3,7 @@ name: Daily Telemetry E2E Tests on: schedule: - cron: '0 0 * * 0' # Run every Sunday at midnight UTC - + workflow_dispatch: # Allow manual triggering inputs: test_pattern: @@ -14,75 +14,39 @@ on: permissions: contents: read + id-token: write jobs: telemetry-e2e-tests: - runs-on: ubuntu-latest + runs-on: + group: databricks-protected-runner-group + labels: linux-ubuntu-latest environment: azure-prod - + env: DATABRICKS_SERVER_HOSTNAME: ${{ secrets.DATABRICKS_HOST }} DATABRICKS_HTTP_PATH: ${{ secrets.TEST_PECO_WAREHOUSE_HTTP_PATH }} DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }} DATABRICKS_CATALOG: peco DATABRICKS_USER: ${{ secrets.TEST_PECO_SP_ID }} - + steps: - #---------------------------------------------- - # check-out repo and set-up python - #---------------------------------------------- - name: Check out repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - - name: Set up python - id: setup-python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 - with: - python-version: "3.10" - - #---------------------------------------------- - # ----- install & configure poetry ----- - #---------------------------------------------- - - name: Install Poetry - uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 - with: - version: "2.2.1" - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - #---------------------------------------------- - # load cached venv if cache exists - #---------------------------------------------- - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 - with: - path: .venv - key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} - - #---------------------------------------------- - # install dependencies if cache does not exist - #---------------------------------------------- - name: Install Kerberos system dependencies run: | sudo apt-get update sudo apt-get install -y libkrb5-dev - - name: Install dependencies - run: poetry install --no-interaction --all-extras - - #---------------------------------------------- - # run telemetry E2E tests - #---------------------------------------------- + - name: Setup Poetry + uses: ./.github/actions/setup-poetry + with: + python-version: "3.10" + install-args: "--all-extras" - name: Run telemetry E2E tests run: | TEST_PATTERN="${{ github.event.inputs.test_pattern || 'tests/e2e/test_telemetry_e2e.py' }}" echo "Running tests: $TEST_PATTERN" poetry run python -m pytest $TEST_PATTERN -v -s - - #---------------------------------------------- - # upload test results on failure - #---------------------------------------------- - name: Upload test results on failure if: failure() uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 @@ -92,4 +56,3 @@ jobs: .pytest_cache/ tests-unsafe.log retention-days: 7 - diff --git a/.github/workflows/dco-check.yml b/.github/workflows/dco-check.yml index b504ad84b..fdcf1b3bb 100644 --- a/.github/workflows/dco-check.yml +++ 
b/.github/workflows/dco-check.yml @@ -10,7 +10,9 @@ permissions: jobs: dco-check: - runs-on: ubuntu-latest + runs-on: + group: databricks-protected-runner-group + labels: linux-ubuntu-latest name: Check DCO Sign-off steps: - name: Checkout diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 49dedfd91..6c0cc7059 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -1,17 +1,20 @@ name: Integration Tests on: - push: + push: branches: - main pull_request: permissions: contents: read + id-token: write jobs: run-non-telemetry-tests: - runs-on: ubuntu-latest + runs-on: + group: databricks-protected-runner-group + labels: linux-ubuntu-latest environment: azure-prod env: DATABRICKS_SERVER_HOSTNAME: ${{ secrets.DATABRICKS_HOST }} @@ -20,59 +23,29 @@ jobs: DATABRICKS_CATALOG: peco DATABRICKS_USER: ${{ secrets.TEST_PECO_SP_ID }} steps: - #---------------------------------------------- - # check-out repo and set-up python - #---------------------------------------------- - name: Check out repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - name: Set up python - id: setup-python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 - with: - python-version: "3.10" - #---------------------------------------------- - # ----- install & configure poetry ----- - #---------------------------------------------- - - name: Install Poetry - uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 - with: - version: "2.2.1" - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - #---------------------------------------------- - # load cached venv if cache exists - #---------------------------------------------- - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 - with: - path: .venv - key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} - #---------------------------------------------- - # install dependencies if cache does not exist - #---------------------------------------------- - name: Install Kerberos system dependencies run: | sudo apt-get update sudo apt-get install -y libkrb5-dev - - name: Install dependencies - run: poetry install --no-interaction --all-extras - #---------------------------------------------- - # run test suite - #---------------------------------------------- + - name: Setup Poetry + uses: ./.github/actions/setup-poetry + with: + python-version: "3.10" + install-args: "--all-extras" - name: Run non-telemetry e2e tests run: | - # Exclude all telemetry tests - they run in separate job for isolation poetry run python -m pytest tests/e2e \ --ignore=tests/e2e/test_telemetry_e2e.py \ --ignore=tests/e2e/test_concurrent_telemetry.py \ -n auto run-telemetry-tests: - runs-on: ubuntu-latest - needs: run-non-telemetry-tests # Run after non-telemetry tests complete + runs-on: + group: databricks-protected-runner-group + labels: linux-ubuntu-latest + needs: run-non-telemetry-tests environment: azure-prod env: DATABRICKS_SERVER_HOSTNAME: ${{ secrets.DATABRICKS_HOST }} @@ -83,32 +56,16 @@ jobs: steps: - name: Check out repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - name: Set up python - id: setup-python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 - with: - python-version: "3.10" - name: Install 
system dependencies run: | sudo apt-get update sudo apt-get install -y libkrb5-dev - - name: Install Poetry - uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1 + - name: Setup Poetry + uses: ./.github/actions/setup-poetry with: - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 - with: - path: .venv - key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ github.event.repository.name }}-${{ hashFiles('**/poetry.lock') }} - - name: Install dependencies - run: poetry install --no-interaction --all-extras + python-version: "3.10" + install-args: "--all-extras" - name: Run telemetry tests in isolation run: | - # Run test_concurrent_telemetry.py in isolation with complete process separation - # Use --dist=loadgroup to respect @pytest.mark.xdist_group markers poetry run python -m pytest tests/e2e/test_concurrent_telemetry.py \ - -n auto --dist=loadgroup -v \ No newline at end of file + -n auto --dist=loadgroup -v From 32c446b4e18487828bde5f97c9e26724fbbc4229 Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Tue, 14 Apr 2026 01:05:16 +0530 Subject: [PATCH 17/24] [PECOBLR-1928] Add AI coding agent detection to User-Agent header (#740) Add AI coding agent detection to User-Agent header Detect when the Python SQL connector is invoked by an AI coding agent (e.g. Claude Code, Cursor, Gemini CLI) by checking well-known environment variables, and append `agent/` to the User-Agent string. This enables Databricks to understand how much driver usage originates from AI coding agents. Detection only succeeds when exactly one agent is detected to avoid ambiguous attribution. Mirrors the approach in databricks/cli#4287. Signed-off-by: Vikrant Puppala Co-authored-by: Claude Opus 4.6 --- src/databricks/sql/common/agent.py | 52 ++++++++++++++++++++++++++++++ src/databricks/sql/session.py | 5 +++ src/databricks/sql/utils.py | 6 ++++ tests/unit/test_agent_detection.py | 51 +++++++++++++++++++++++++++++ 4 files changed, 114 insertions(+) create mode 100644 src/databricks/sql/common/agent.py create mode 100644 tests/unit/test_agent_detection.py diff --git a/src/databricks/sql/common/agent.py b/src/databricks/sql/common/agent.py new file mode 100644 index 000000000..79d1b2b7a --- /dev/null +++ b/src/databricks/sql/common/agent.py @@ -0,0 +1,52 @@ +""" +Detects whether the Python SQL connector is being invoked by an AI coding agent +by checking for well-known environment variables that agents set in their spawned +shell processes. + +Detection only succeeds when exactly one agent environment variable is present, +to avoid ambiguous attribution when multiple agent environments overlap. + +Adding a new agent requires only a new entry in KNOWN_AGENTS. + +References for each environment variable: + - ANTIGRAVITY_AGENT: Closed source. Google Antigravity sets this variable. + - CLAUDECODE: https://github.com/anthropics/claude-code (sets CLAUDECODE=1) + - CLINE_ACTIVE: https://github.com/cline/cline (shipped in v3.24.0) + - CODEX_CI: https://github.com/openai/codex (part of UNIFIED_EXEC_ENV array in codex-rs) + - CURSOR_AGENT: Closed source. Referenced in a gist by johnlindquist. 
+ - GEMINI_CLI: https://google-gemini.github.io/gemini-cli/docs/tools/shell.html (sets GEMINI_CLI=1) + - OPENCODE: https://github.com/opencode-ai/opencode (sets OPENCODE=1) +""" + +import os + +KNOWN_AGENTS = [ + ("ANTIGRAVITY_AGENT", "antigravity"), + ("CLAUDECODE", "claude-code"), + ("CLINE_ACTIVE", "cline"), + ("CODEX_CI", "codex"), + ("CURSOR_AGENT", "cursor"), + ("GEMINI_CLI", "gemini-cli"), + ("OPENCODE", "opencode"), +] + + +def detect(env=None): + """Detect which AI coding agent (if any) is driving the current process. + + Args: + env: Optional dict-like object for environment variable lookup. + Defaults to os.environ. Exists for testability. + + Returns: + The agent product string if exactly one agent is detected, + or an empty string otherwise. + """ + if env is None: + env = os.environ + + detected = [product for var, product in KNOWN_AGENTS if env.get(var)] + + if len(detected) == 1: + return detected[0] + return "" diff --git a/src/databricks/sql/session.py b/src/databricks/sql/session.py index 0f723d144..1588d9f79 100644 --- a/src/databricks/sql/session.py +++ b/src/databricks/sql/session.py @@ -13,6 +13,7 @@ from databricks.sql.backend.databricks_client import DatabricksClient from databricks.sql.backend.types import SessionId, BackendType from databricks.sql.common.unified_http_client import UnifiedHttpClient +from databricks.sql.common.agent import detect as detect_agent logger = logging.getLogger(__name__) @@ -64,6 +65,10 @@ def __init__( else: self.useragent_header = "{}/{}".format(USER_AGENT_NAME, __version__) + agent_product = detect_agent() + if agent_product: + self.useragent_header += " agent/{}".format(agent_product) + base_headers = [("User-Agent", self.useragent_header)] all_headers = (http_headers or []) + base_headers diff --git a/src/databricks/sql/utils.py b/src/databricks/sql/utils.py index b1fff7202..ce2670969 100644 --- a/src/databricks/sql/utils.py +++ b/src/databricks/sql/utils.py @@ -957,12 +957,18 @@ def build_client_context(server_hostname: str, version: str, **kwargs): ) # Build user agent + from databricks.sql.common.agent import detect as detect_agent + user_agent_entry = kwargs.get("user_agent_entry", "") if user_agent_entry: user_agent = f"PyDatabricksSqlConnector/{version} ({user_agent_entry})" else: user_agent = f"PyDatabricksSqlConnector/{version}" + agent_product = detect_agent() + if agent_product: + user_agent += f" agent/{agent_product}" + # Explicitly construct ClientContext with proper types return ClientContext( hostname=server_hostname, diff --git a/tests/unit/test_agent_detection.py b/tests/unit/test_agent_detection.py new file mode 100644 index 000000000..0be404a1d --- /dev/null +++ b/tests/unit/test_agent_detection.py @@ -0,0 +1,51 @@ +import pytest +from databricks.sql.common.agent import detect, KNOWN_AGENTS + + +class TestAgentDetection: + def test_detects_single_agent_claude_code(self): + assert detect({"CLAUDECODE": "1"}) == "claude-code" + + def test_detects_single_agent_cursor(self): + assert detect({"CURSOR_AGENT": "1"}) == "cursor" + + def test_detects_single_agent_gemini_cli(self): + assert detect({"GEMINI_CLI": "1"}) == "gemini-cli" + + def test_detects_single_agent_cline(self): + assert detect({"CLINE_ACTIVE": "1"}) == "cline" + + def test_detects_single_agent_codex(self): + assert detect({"CODEX_CI": "1"}) == "codex" + + def test_detects_single_agent_opencode(self): + assert detect({"OPENCODE": "1"}) == "opencode" + + def test_detects_single_agent_antigravity(self): + assert detect({"ANTIGRAVITY_AGENT": "1"}) == 
"antigravity" + + def test_returns_empty_when_no_agent_detected(self): + assert detect({}) == "" + + def test_returns_empty_when_multiple_agents_detected(self): + assert detect({"CLAUDECODE": "1", "CURSOR_AGENT": "1"}) == "" + + def test_ignores_empty_env_var_values(self): + assert detect({"CLAUDECODE": ""}) == "" + + def test_all_known_agents_are_covered(self): + for env_var, product in KNOWN_AGENTS: + assert detect({env_var: "1"}) == product, ( + f"Agent with env var {env_var} should be detected as {product}" + ) + + def test_defaults_to_os_environ(self, monkeypatch): + monkeypatch.delenv("CLAUDECODE", raising=False) + monkeypatch.delenv("CURSOR_AGENT", raising=False) + monkeypatch.delenv("GEMINI_CLI", raising=False) + monkeypatch.delenv("CLINE_ACTIVE", raising=False) + monkeypatch.delenv("CODEX_CI", raising=False) + monkeypatch.delenv("OPENCODE", raising=False) + monkeypatch.delenv("ANTIGRAVITY_AGENT", raising=False) + # With all agent vars cleared, detect() should return empty + assert detect() == "" From c46b3a0f9fb8a41abb59281be6fcdecb83ed8aa1 Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Tue, 14 Apr 2026 11:50:51 +0530 Subject: [PATCH 18/24] =?UTF-8?q?Optimize=20CI:=20consolidate=20workflows,?= =?UTF-8?q?=20fix=20caching,=20speed=20up=20e2e=20tests=20(47min=20?= =?UTF-8?q?=E2=86=92=2015min)=20(#772)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Optimize CI: consolidate workflows, fix caching, speed up e2e tests Workflow consolidation: - Delete integration.yml and daily-telemetry-e2e.yml (redundant with coverage workflow which already runs all e2e tests) - Add push-to-main trigger to coverage workflow - Run all tests (including telemetry) in single pytest invocation with --dist=loadgroup to respect xdist_group markers for isolation Fix pyarrow cache: - Remove cache-path: .venv-pyarrow from pyarrow jobs. Poetry always creates .venv regardless of the cache-path input, so the cache was never saved ("Path does not exist" error). The cache-suffix already differentiates keys between variants. Fix 3.14 post-test DNS hang: - Add enable_telemetry=False to unit test DUMMY_CONNECTION_ARGS that use server_hostname="foo". This prevents FeatureFlagsContext from making real HTTP calls to fake hosts, eliminating ~8min hang from ThreadPoolExecutor threads timing out on DNS on protected runners. Improve e2e test parallelization: - Split TestPySQLLargeQueriesSuite into 3 separate classes (TestPySQLLargeWideResultSet, TestPySQLLargeNarrowResultSet, TestPySQLLongRunningQuery) so xdist distributes them across workers instead of all landing on one. Speed up slow tests: - Reduce large result set sizes from 300MB to 100MB (still validates large fetches, lz4, chunking, row integrity) - Start test_long_running_query at scale_factor=50 instead of 1 to skip ramp-up iterations that finish instantly Co-authored-by: Isaac Signed-off-by: Vikrant Puppala * Further optimize e2e: 4 workers, lower long-query threshold, split lz4 - Use -n 4 instead of -n auto in coverage workflow. The e2e tests are network-bound (waiting on warehouse), not CPU-bound, so 4 workers on a 2-CPU runner is fine and doubles parallelism. - Lower test_long_running_query min_duration from 3 min to 1 min. The test validates long-running query completion — 1 minute is sufficient and saves ~4 min per variant. - Split lz4 on/off loop in test_query_with_large_wide_result_set into separate parametrized test cases so xdist can run them on different workers instead of sequentially in one test. 
Co-authored-by: Isaac Signed-off-by: Vikrant Puppala * Address review: inline test methods, drop mixin pattern Per review feedback from jprakash-db: - Remove mixin classes (LargeWideResultSetMixin, etc) — inline the test methods directly into the test classes in test_driver.py - Remove backward-compat LargeQueriesMixin alias (nothing uses it) - Rename _LargeQueryRowHelper — replaced entirely by inlining - Convert large_queries_mixin.py to just a fetch_rows() helper function Co-authored-by: Isaac Signed-off-by: Vikrant Puppala --------- Signed-off-by: Vikrant Puppala --- .github/workflows/code-coverage.yml | 24 ++-- .github/workflows/code-quality-checks.yml | 1 - .github/workflows/daily-telemetry-e2e.yml | 58 -------- .github/workflows/integration.yml | 71 ---------- tests/e2e/common/large_queries_mixin.py | 158 +++++----------------- tests/e2e/test_driver.py | 89 ++++++++++-- tests/unit/test_client.py | 2 + tests/unit/test_session.py | 3 + 8 files changed, 123 insertions(+), 283 deletions(-) delete mode 100644 .github/workflows/daily-telemetry-e2e.yml delete mode 100644 .github/workflows/integration.yml diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index c188c5b3a..9f578ec9f 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -1,10 +1,15 @@ -name: Code Coverage +name: E2E Tests and Code Coverage permissions: contents: read id-token: write -on: [pull_request, workflow_dispatch] +on: + push: + branches: + - main + pull_request: + workflow_dispatch: jobs: test-with-coverage: @@ -32,25 +37,16 @@ jobs: with: python-version: "3.10" install-args: "--all-extras" - - name: Run parallel tests with coverage + - name: Run all tests with coverage continue-on-error: false run: | poetry run pytest tests/unit tests/e2e \ - -m "not serial" \ - -n auto \ + -n 4 \ + --dist=loadgroup \ --cov=src \ --cov-report=xml \ --cov-report=term \ -v - - name: Run telemetry tests with coverage (isolated) - continue-on-error: false - run: | - poetry run pytest tests/e2e/test_concurrent_telemetry.py \ - --cov=src \ - --cov-append \ - --cov-report=xml \ - --cov-report=term \ - -v - name: Check for coverage override id: override env: diff --git a/.github/workflows/code-quality-checks.yml b/.github/workflows/code-quality-checks.yml index ecc238263..4071a6e51 100644 --- a/.github/workflows/code-quality-checks.yml +++ b/.github/workflows/code-quality-checks.yml @@ -78,7 +78,6 @@ jobs: with: python-version: ${{ matrix.python-version }} install-args: "--all-extras" - cache-path: ".venv-pyarrow" cache-suffix: "pyarrow-${{ matrix.dependency-version }}-" - name: Install Python tools for custom versions if: matrix.dependency-version != 'default' diff --git a/.github/workflows/daily-telemetry-e2e.yml b/.github/workflows/daily-telemetry-e2e.yml deleted file mode 100644 index b6f78726c..000000000 --- a/.github/workflows/daily-telemetry-e2e.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: Daily Telemetry E2E Tests - -on: - schedule: - - cron: '0 0 * * 0' # Run every Sunday at midnight UTC - - workflow_dispatch: # Allow manual triggering - inputs: - test_pattern: - description: 'Test pattern to run (default: tests/e2e/test_telemetry_e2e.py)' - required: false - default: 'tests/e2e/test_telemetry_e2e.py' - type: string - -permissions: - contents: read - id-token: write - -jobs: - telemetry-e2e-tests: - runs-on: - group: databricks-protected-runner-group - labels: linux-ubuntu-latest - environment: azure-prod - - env: - DATABRICKS_SERVER_HOSTNAME: ${{ 
secrets.DATABRICKS_HOST }} - DATABRICKS_HTTP_PATH: ${{ secrets.TEST_PECO_WAREHOUSE_HTTP_PATH }} - DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }} - DATABRICKS_CATALOG: peco - DATABRICKS_USER: ${{ secrets.TEST_PECO_SP_ID }} - - steps: - - name: Check out repository - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - name: Install Kerberos system dependencies - run: | - sudo apt-get update - sudo apt-get install -y libkrb5-dev - - name: Setup Poetry - uses: ./.github/actions/setup-poetry - with: - python-version: "3.10" - install-args: "--all-extras" - - name: Run telemetry E2E tests - run: | - TEST_PATTERN="${{ github.event.inputs.test_pattern || 'tests/e2e/test_telemetry_e2e.py' }}" - echo "Running tests: $TEST_PATTERN" - poetry run python -m pytest $TEST_PATTERN -v -s - - name: Upload test results on failure - if: failure() - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 - with: - name: telemetry-test-results - path: | - .pytest_cache/ - tests-unsafe.log - retention-days: 7 diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml deleted file mode 100644 index 6c0cc7059..000000000 --- a/.github/workflows/integration.yml +++ /dev/null @@ -1,71 +0,0 @@ -name: Integration Tests - -on: - push: - branches: - - main - pull_request: - -permissions: - contents: read - id-token: write - -jobs: - run-non-telemetry-tests: - runs-on: - group: databricks-protected-runner-group - labels: linux-ubuntu-latest - environment: azure-prod - env: - DATABRICKS_SERVER_HOSTNAME: ${{ secrets.DATABRICKS_HOST }} - DATABRICKS_HTTP_PATH: ${{ secrets.TEST_PECO_WAREHOUSE_HTTP_PATH }} - DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }} - DATABRICKS_CATALOG: peco - DATABRICKS_USER: ${{ secrets.TEST_PECO_SP_ID }} - steps: - - name: Check out repository - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - name: Install Kerberos system dependencies - run: | - sudo apt-get update - sudo apt-get install -y libkrb5-dev - - name: Setup Poetry - uses: ./.github/actions/setup-poetry - with: - python-version: "3.10" - install-args: "--all-extras" - - name: Run non-telemetry e2e tests - run: | - poetry run python -m pytest tests/e2e \ - --ignore=tests/e2e/test_telemetry_e2e.py \ - --ignore=tests/e2e/test_concurrent_telemetry.py \ - -n auto - - run-telemetry-tests: - runs-on: - group: databricks-protected-runner-group - labels: linux-ubuntu-latest - needs: run-non-telemetry-tests - environment: azure-prod - env: - DATABRICKS_SERVER_HOSTNAME: ${{ secrets.DATABRICKS_HOST }} - DATABRICKS_HTTP_PATH: ${{ secrets.TEST_PECO_WAREHOUSE_HTTP_PATH }} - DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }} - DATABRICKS_CATALOG: peco - DATABRICKS_USER: ${{ secrets.TEST_PECO_SP_ID }} - steps: - - name: Check out repository - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - name: Install system dependencies - run: | - sudo apt-get update - sudo apt-get install -y libkrb5-dev - - name: Setup Poetry - uses: ./.github/actions/setup-poetry - with: - python-version: "3.10" - install-args: "--all-extras" - - name: Run telemetry tests in isolation - run: | - poetry run python -m pytest tests/e2e/test_concurrent_telemetry.py \ - -n auto --dist=loadgroup -v diff --git a/tests/e2e/common/large_queries_mixin.py b/tests/e2e/common/large_queries_mixin.py index dd7c56996..7255ee095 100644 --- a/tests/e2e/common/large_queries_mixin.py +++ b/tests/e2e/common/large_queries_mixin.py @@ -2,139 +2,43 @@ import math import time -import pytest - 
log = logging.getLogger(__name__) -class LargeQueriesMixin: +def fetch_rows(test_case, cursor, row_count, fetchmany_size): """ - This mixin expects to be mixed with a CursorTest-like class + A generator for rows. Fetches until the end or up to 5 minutes. """ - - def fetch_rows(self, cursor, row_count, fetchmany_size): - """ - A generator for rows. Fetches until the end or up to 5 minutes. - """ - # TODO: Remove fetchmany_size when we have fixed the performance issues with fetchone - # in the Python client - max_fetch_time = 5 * 60 # Fetch for at most 5 minutes - - rows = self.get_some_rows(cursor, fetchmany_size) - start_time = time.time() - n = 0 - while rows: - for row in rows: - n += 1 - yield row - if time.time() - start_time >= max_fetch_time: - log.warning("Fetching rows timed out") - break - rows = self.get_some_rows(cursor, fetchmany_size) - if not rows: - # Read all the rows, row_count should match - self.assertEqual(n, row_count) - - num_fetches = max(math.ceil(n / 10000), 1) - latency_ms = int((time.time() - start_time) * 1000 / num_fetches), 1 - print( - "Fetched {} rows with an avg latency of {} per fetch, ".format( - n, latency_ms - ) - + "assuming 10K fetch size." + max_fetch_time = 5 * 60 # Fetch for at most 5 minutes + + rows = _get_some_rows(cursor, fetchmany_size) + start_time = time.time() + n = 0 + while rows: + for row in rows: + n += 1 + yield row + if time.time() - start_time >= max_fetch_time: + log.warning("Fetching rows timed out") + break + rows = _get_some_rows(cursor, fetchmany_size) + if not rows: + # Read all the rows, row_count should match + test_case.assertEqual(n, row_count) + + num_fetches = max(math.ceil(n / 10000), 1) + latency_ms = int((time.time() - start_time) * 1000 / num_fetches), 1 + print( + "Fetched {} rows with an avg latency of {} per fetch, ".format( + n, latency_ms ) - - @pytest.mark.parametrize( - "extra_params", - [ - {}, - {"use_sea": True}, - ], + + "assuming 10K fetch size." ) - def test_query_with_large_wide_result_set(self, extra_params): - resultSize = 300 * 1000 * 1000 # 300 MB - width = 8192 # B - rows = resultSize // width - cols = width // 36 - - # Set the fetchmany_size to get 10MB of data a go - fetchmany_size = 10 * 1024 * 1024 // width - # This is used by PyHive tests to determine the buffer size - self.arraysize = 1000 - with self.cursor(extra_params) as cursor: - for lz4_compression in [False, True]: - cursor.connection.lz4_compression = lz4_compression - uuids = ", ".join(["uuid() uuid{}".format(i) for i in range(cols)]) - cursor.execute( - "SELECT id, {uuids} FROM RANGE({rows})".format( - uuids=uuids, rows=rows - ) - ) - assert lz4_compression == cursor.active_result_set.lz4_compressed - for row_id, row in enumerate( - self.fetch_rows(cursor, rows, fetchmany_size) - ): - assert row[0] == row_id # Verify no rows are dropped in the middle. 
- assert len(row[1]) == 36 - - @pytest.mark.parametrize( - "extra_params", - [ - {}, - {"use_sea": True}, - ], - ) - def test_query_with_large_narrow_result_set(self, extra_params): - resultSize = 300 * 1000 * 1000 # 300 MB - width = 8 # sizeof(long) - rows = resultSize / width - - # Set the fetchmany_size to get 10MB of data a go - fetchmany_size = 10 * 1024 * 1024 // width - # This is used by PyHive tests to determine the buffer size - self.arraysize = 10000000 - with self.cursor(extra_params) as cursor: - cursor.execute("SELECT * FROM RANGE({rows})".format(rows=rows)) - for row_id, row in enumerate(self.fetch_rows(cursor, rows, fetchmany_size)): - assert row[0] == row_id - - @pytest.mark.parametrize( - "extra_params", - [ - {}, - {"use_sea": True}, - ], - ) - def test_long_running_query(self, extra_params): - """Incrementally increase query size until it takes at least 3 minutes, - and asserts that the query completes successfully. - """ - minutes = 60 - min_duration = 3 * minutes - - duration = -1 - scale0 = 10000 - scale_factor = 1 - with self.cursor(extra_params) as cursor: - while duration < min_duration: - assert scale_factor < 4096, "Detected infinite loop" - start = time.time() - - cursor.execute( - """SELECT count(*) - FROM RANGE({scale}) x - JOIN RANGE({scale0}) y - ON from_unixtime(x.id * y.id, "yyyy-MM-dd") LIKE "%not%a%date%" - """.format( - scale=scale_factor * scale0, scale0=scale0 - ) - ) - (n,) = cursor.fetchone() - assert n == 0 - duration = time.time() - start - current_fraction = duration / min_duration - print("Took {} s with scale factor={}".format(duration, scale_factor)) - # Extrapolate linearly to reach 3 min and add 50% padding to push over the limit - scale_factor = math.ceil(1.5 * scale_factor / current_fraction) +def _get_some_rows(cursor, fetchmany_size): + row = cursor.fetchone() + if row: + return [row] + else: + return None diff --git a/tests/e2e/test_driver.py b/tests/e2e/test_driver.py index e04e348c9..45b56ae08 100644 --- a/tests/e2e/test_driver.py +++ b/tests/e2e/test_driver.py @@ -39,7 +39,7 @@ ) from databricks.sql.thrift_api.TCLIService import ttypes from tests.e2e.common.core_tests import CoreTestMixin, SmokeTestMixin -from tests.e2e.common.large_queries_mixin import LargeQueriesMixin +from tests.e2e.common.large_queries_mixin import fetch_rows from tests.e2e.common.timestamp_tests import TimestampTestsMixin from tests.e2e.common.decimal_tests import DecimalTestsMixin from tests.e2e.common.retry_test_mixins import ( @@ -138,24 +138,89 @@ def assertEqualRowValues(self, actual, expected): assert act[i] == exp[i] -class TestPySQLLargeQueriesSuite(PySQLPytestTestCase, LargeQueriesMixin): - def get_some_rows(self, cursor, fetchmany_size): - row = cursor.fetchone() - if row: - return [row] - else: - return None +class TestPySQLLargeWideResultSet(PySQLPytestTestCase): + @pytest.mark.parametrize("extra_params", [{}, {"use_sea": True}]) + @pytest.mark.parametrize("lz4_compression", [False, True]) + def test_query_with_large_wide_result_set(self, extra_params, lz4_compression): + resultSize = 100 * 1000 * 1000 # 100 MB + width = 8192 # B + rows = resultSize // width + cols = width // 36 + fetchmany_size = 10 * 1024 * 1024 // width + self.arraysize = 1000 + with self.cursor(extra_params) as cursor: + cursor.connection.lz4_compression = lz4_compression + uuids = ", ".join(["uuid() uuid{}".format(i) for i in range(cols)]) + cursor.execute( + "SELECT id, {uuids} FROM RANGE({rows})".format( + uuids=uuids, rows=rows + ) + ) + assert lz4_compression == 
cursor.active_result_set.lz4_compressed + for row_id, row in enumerate( + fetch_rows(self, cursor, rows, fetchmany_size) + ): + assert row[0] == row_id + assert len(row[1]) == 36 + + +class TestPySQLLargeNarrowResultSet(PySQLPytestTestCase): + @pytest.mark.parametrize("extra_params", [{}, {"use_sea": True}]) + def test_query_with_large_narrow_result_set(self, extra_params): + resultSize = 100 * 1000 * 1000 # 100 MB + width = 8 # sizeof(long) + rows = resultSize / width + fetchmany_size = 10 * 1024 * 1024 // width + self.arraysize = 10000000 + with self.cursor(extra_params) as cursor: + cursor.execute("SELECT * FROM RANGE({rows})".format(rows=rows)) + for row_id, row in enumerate( + fetch_rows(self, cursor, rows, fetchmany_size) + ): + assert row[0] == row_id + + +class TestPySQLLongRunningQuery(PySQLPytestTestCase): + @pytest.mark.parametrize("extra_params", [{}, {"use_sea": True}]) + def test_long_running_query(self, extra_params): + """Incrementally increase query size until it takes at least 1 minute, + and asserts that the query completes successfully. + """ + import math + + minutes = 60 + min_duration = 1 * minutes + duration = -1 + scale0 = 10000 + scale_factor = 50 + with self.cursor(extra_params) as cursor: + while duration < min_duration: + assert scale_factor < 4096, "Detected infinite loop" + start = time.time() + cursor.execute( + """SELECT count(*) + FROM RANGE({scale}) x + JOIN RANGE({scale0}) y + ON from_unixtime(x.id * y.id, "yyyy-MM-dd") LIKE "%not%a%date%" + """.format( + scale=scale_factor * scale0, scale0=scale0 + ) + ) + (n,) = cursor.fetchone() + assert n == 0 + duration = time.time() - start + current_fraction = duration / min_duration + print("Took {} s with scale factor={}".format(duration, scale_factor)) + scale_factor = math.ceil(1.5 * scale_factor / current_fraction) + +class TestPySQLCloudFetch(PySQLPytestTestCase): @skipUnless(pysql_supports_arrow(), "needs arrow support") @pytest.mark.skip("This test requires a previously uploaded data set") def test_cloud_fetch(self): - # This test can take several minutes to run limits = [100000, 300000] threads = [10, 25] self.arraysize = 100000 - # This test requires a large table with many rows to properly initiate cloud fetch. - # e2-dogfood host > hive_metastore catalog > main schema has such a table called store_sales. - # If this table is deleted or this test is run on a different host, a different table may need to be used. 
base_query = "SELECT * FROM store_sales WHERE ss_sold_date_sk = 2452234 " for num_limit, num_threads, lz4_compression in itertools.product( limits, threads, [True, False] diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 5b6991931..4a8cb0b68 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -87,6 +87,7 @@ class ClientTestSuite(unittest.TestCase): "server_hostname": "foo", "http_path": "dummy_path", "access_token": "tok", + "enable_telemetry": False, } @patch("%s.session.ThriftDatabricksClient" % PACKAGE_NAME) @@ -644,6 +645,7 @@ class TransactionTestSuite(unittest.TestCase): "server_hostname": "foo", "http_path": "dummy_path", "access_token": "tok", + "enable_telemetry": False, } def _setup_mock_session_with_http_client(self, mock_session): diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index 3a43c1a75..aa7e7f02b 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -22,6 +22,7 @@ class TestSession: "server_hostname": "foo", "http_path": "dummy_path", "access_token": "tok", + "enable_telemetry": False, } @patch("%s.session.ThriftDatabricksClient" % PACKAGE_NAME) @@ -50,6 +51,7 @@ def test_auth_args(self, mock_client_class): "server_hostname": "foo", "http_path": None, "access_token": "tok", + "enable_telemetry": False, }, { "server_hostname": "foo", @@ -57,6 +59,7 @@ def test_auth_args(self, mock_client_class): "_tls_client_cert_file": "something", "_use_cert_as_auth": True, "access_token": None, + "enable_telemetry": False, }, ] From 9031863fb6380730e7b58b223739ee9028c2a720 Mon Sep 17 00:00:00 2001 From: Korijn van Golen Date: Mon, 20 Apr 2026 05:37:56 +0200 Subject: [PATCH 19/24] Bump thrift to fix deprecation warning (#733) Signed-off-by: Korijn van Golen --- poetry.lock | 21 +++++++++------------ pyproject.toml | 2 +- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7d0845a58..5644190f4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.3.1 and should not be changed by hand. 
[[package]] name = "astroid" @@ -1158,15 +1158,15 @@ pytz = ">=2020.1" tzdata = ">=2022.1" [package.extras] -all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] -aws = ["s3fs (>=2021.08.0)"] +all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.7.0)", "gcsfs (>=2021.7.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.8.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] +aws = ["s3fs (>=2021.8.0)"] clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] feather = ["pyarrow (>=7.0.0)"] -fss = ["fsspec (>=2021.07.0)"] -gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] +fss = ["fsspec (>=2021.7.0)"] +gcp = ["gcsfs (>=2021.7.0)", "pandas-gbq (>=0.15.0)"] hdf5 = ["tables (>=3.6.1)"] html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] @@ -1523,7 +1523,7 @@ files = [ ] [package.dependencies] -astroid = ">=3.2.4,<=3.3.0-dev0" +astroid = ">=3.2.4,<=3.3.0.dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = [ {version = ">=0.2", markers = "python_version < \"3.11\""}, @@ -1849,18 +1849,15 @@ files = [ [[package]] name = "thrift" -version = "0.20.0" +version = "0.22.0" description = "Python bindings for the Apache Thrift RPC system" optional = false python-versions = "*" groups = ["main"] files = [ - {file = "thrift-0.20.0.tar.gz", hash = "sha256:4dd662eadf6b8aebe8a41729527bd69adf6ceaa2a8681cbef64d1273b3e8feba"}, + {file = "thrift-0.22.0.tar.gz", hash = "sha256:42e8276afbd5f54fe1d364858b6877bc5e5a4a5ed69f6a005b94ca4918fe1466"}, ] -[package.dependencies] -six = ">=1.7.2" - [package.extras] all = ["tornado (>=4.0)", "twisted"] tornado = ["tornado (>=4.0)"] @@ -1969,4 +1966,4 @@ pyarrow = ["pyarrow", "pyarrow", "pyarrow"] [metadata] lock-version = "2.1" python-versions = "^3.8.0" -content-hash = "ec311bf26ec866de2f427bcdf4ec69ceed721bfd70edfae3aba1ac12882a09d6" +content-hash = 
"d1739e84dcbd6e7ac311eb6fbb9cf87ad110491f7d954f07fdfc32b704b4413f" diff --git a/pyproject.toml b/pyproject.toml index 911f1b79c..e1ce3b73f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ include = ["CHANGELOG.md"] [tool.poetry.dependencies] python = "^3.8.0" -thrift = ">=0.16.0,<0.21.0" +thrift = "~=0.22.0" pandas = [ { version = ">=1.2.5,<2.4.0", python = ">=3.8,<3.13" }, { version = ">=2.2.3,<2.4.0", python = ">=3.13" } From b088a356ade14ab474e88e2978bfc1052adbd248 Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Tue, 21 Apr 2026 12:34:58 +0530 Subject: [PATCH 20/24] Fix dependency_manager: handle PEP 440 ~= compatible release syntax (#776) The _extract_versions_from_specifier function stripped a single `~` character from constraint strings, which corrupted PEP 440 compatible release syntax (`~=`) by leaving a stray `=`. For example, `thrift = "~=0.22.0"` produced the invalid constraint `thrift>==0.22.0,<=0.23.0`, breaking every PR's "Unit Tests (min deps)" job since #733 was merged. Add an explicit branch for `~=` that strips both characters before extracting the minimum version. The Poetry-style single `~` branch is preserved for backward compatibility. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala --- scripts/dependency_manager.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/dependency_manager.py b/scripts/dependency_manager.py index 15e119841..29c5fe828 100644 --- a/scripts/dependency_manager.py +++ b/scripts/dependency_manager.py @@ -69,16 +69,21 @@ def _parse_constraint(self, name, constraint): def _extract_versions_from_specifier(self, spec_set_str): """Extract minimum version from a specifier set""" try: - # Handle caret (^) and tilde (~) constraints that packaging doesn't support + # Handle caret (^) and tilde (~, ~=) constraints that packaging doesn't + # support (Poetry ^, Poetry ~, and PEP 440 ~=). if spec_set_str.startswith('^'): # ^1.2.3 means >=1.2.3, <2.0.0 min_version = spec_set_str[1:] # Remove ^ return min_version, None + elif spec_set_str.startswith('~='): + # PEP 440 compatible release: ~=1.2.3 means >=1.2.3, <1.3.0 + min_version = spec_set_str[2:] # Remove ~= + return min_version, None elif spec_set_str.startswith('~'): - # ~1.2.3 means >=1.2.3, <1.3.0 + # Poetry tilde: ~1.2.3 means >=1.2.3, <1.3.0 min_version = spec_set_str[1:] # Remove ~ return min_version, None - + spec_set = SpecifierSet(spec_set_str) min_version = None From d872075e2127858379267a11f37d44b5273640e3 Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Tue, 21 Apr 2026 15:21:17 +0530 Subject: [PATCH 21/24] [PECOBLR-2461] Add comprehensive MST transaction E2E tests (#775) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add comprehensive MST transaction E2E tests Replaces the prior speculative test skeleton with 42 tests across 5 categories: - TestMstCorrectness (18): commit/rollback/isolation/multi-table atomicity/repeatable reads/write conflict/parameterized DML/etc. - TestMstApi (6): DB-API-specific — autocommit, isolation level, error handling. - TestMstMetadata (6): cursor.columns/tables/schemas/catalogs inside a transaction, plus two freshness tests asserting Thrift metadata RPCs are non-transactional (they see concurrent DDL that the txn should not see). - TestMstBlockedSql (9): MSTCheckRule enforcement. Some SHOW/DESCRIBE commands throw + abort txn, others succeed silently on Python/Thrift (diverges from JDBC). 
Both behaviors are explicitly tested so regressions in either direction are caught. - TestMstExecuteVariants (2): executemany commit/rollback. Parallelisation: - Each test uses a unique Delta table derived from its test name so pytest-xdist workers don't collide on shared state. - Tests that spawn concurrent connections to the same table (repeatable reads, write conflict, freshness) use xdist_group so the concurrent connections within a single test don't conflict with other tests on different workers. Runtime: ~2 minutes on 4 workers (pytest -n 4 --dist=loadgroup), well within the existing e2e budget. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala * Fix TestMstBlockedSql: SHOW COLUMNS and DESCRIBE QUERY are blocked CI caught that the initial "not blocked" assertions were wrong — the server returns TRANSACTION_NOT_SUPPORTED.COMMAND for SHOW COLUMNS (ShowDeltaTableColumnsCommand) and DESCRIBE QUERY (DescribeQueryCommand) inside an active transaction. The server's error message explicitly lists the allowed commands: "Only SELECT / INSERT / MERGE / UPDATE / DELETE / DESCRIBE TABLE are supported." DESCRIBE TABLE (basic) remains the only DESCRIBE variant that is allowed. Earlier dogfood runs showed SHOW COLUMNS / DESCRIBE QUERY succeeding — likely because the dogfood warehouse DBR is older than CI. Aligning tests with the current/CI server behavior. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala * Address PR review comments - test_auto_start_after_commit: assert the rolled-back id=2 is NOT present (use _get_ids set equality instead of just row count). - test_auto_start_after_rollback: same pattern — assert the rolled-back id=1 is NOT present. - test_commit_without_active_txn_throws: match specific NO_ACTIVE_TRANSACTION server error code to ensure we're catching the right exception, not an unrelated one. Add _get_ids() helper for checking the exact set of persisted ids. Verified 42/42 pass against pecotesting in ~1:36 (4 workers). Co-authored-by: Isaac Signed-off-by: Vikrant Puppala --------- Signed-off-by: Vikrant Puppala --- tests/e2e/test_transactions.py | 1214 ++++++++++++++++++-------------- 1 file changed, 692 insertions(+), 522 deletions(-) diff --git a/tests/e2e/test_transactions.py b/tests/e2e/test_transactions.py index d4f6a790a..9bf74baad 100644 --- a/tests/e2e/test_transactions.py +++ b/tests/e2e/test_transactions.py @@ -1,598 +1,768 @@ """ End-to-end integration tests for Multi-Statement Transaction (MST) APIs. -These tests verify: -- autocommit property (getter/setter) -- commit() and rollback() methods -- get_transaction_isolation() and set_transaction_isolation() methods -- Transaction error handling +Tests driver behavior for MST across: +- Basic correctness (commit/rollback/isolation/multi-table) +- API-specific (autocommit, isolation level, error handling) +- Metadata RPCs inside transactions (non-transactional freshness) +- SQL statements blocked by MSTCheckRule (SHOW, DESCRIBE, information_schema) +- Execute variants (executemany) + +Parallelisation: +- Each test uses its own unique table (derived from test name) to allow + parallel execution with pytest-xdist. +- Tests requiring multiple concurrent connections to the same table are + tagged with xdist_group so the concurrent connections within a single + test don't conflict with other tests on different workers. 
Requirements: - DBSQL warehouse that supports Multi-Statement Transactions (MST) -- Test environment configured via test.env file or environment variables - -Setup: -Set the following environment variables: -- DATABRICKS_SERVER_HOSTNAME -- DATABRICKS_HTTP_PATH -- DATABRICKS_ACCESS_TOKEN (or use OAuth) - -Usage: - pytest tests/e2e/test_transactions.py -v +- Env vars: DATABRICKS_SERVER_HOSTNAME, DATABRICKS_HTTP_PATH, + DATABRICKS_TOKEN, DATABRICKS_CATALOG, DATABRICKS_SCHEMA """ import logging import os +import re +import uuid + import pytest -from typing import Any, Dict import databricks.sql as sql -from databricks.sql import TransactionError, NotSupportedError, InterfaceError logger = logging.getLogger(__name__) -@pytest.mark.skip( - reason="Test environment does not yet support multi-statement transactions" -) -class TestTransactions: - """E2E tests for transaction control methods (MST support).""" +def _unique_table_name(request): + """Derive a unique Delta table name from the test node id.""" + node_id = request.node.name + sanitized = re.sub(r"[^a-z0-9_]", "_", node_id.lower()) + return f"mst_pysql_{sanitized}"[:80] - # Test table name - TEST_TABLE_NAME = "transaction_test_table" - @pytest.fixture(autouse=True) - def setup_and_teardown(self, connection_details): - """Setup test environment before each test and cleanup after.""" - self.connection_params = { - "server_hostname": connection_details["host"], - "http_path": connection_details["http_path"], - "access_token": connection_details.get("access_token"), - "ignore_transactions": False, # Enable actual transaction functionality for these tests - } +def _unique_table_name_raw(suffix): + """Non-fixture unique table name helper for extra tables within a test.""" + return f"mst_pysql_{suffix}_{uuid.uuid4().hex[:8]}" - # Get catalog and schema from environment or use defaults - self.catalog = os.getenv("DATABRICKS_CATALOG", "main") - self.schema = os.getenv("DATABRICKS_SCHEMA", "default") - # Create connection for setup - self.connection = sql.connect(**self.connection_params) +@pytest.fixture +def mst_conn_params(connection_details): + """Connection parameters with MST enabled.""" + return { + "server_hostname": connection_details["host"], + "http_path": connection_details["http_path"], + "access_token": connection_details.get("access_token"), + "ignore_transactions": False, + } - # Setup: Create test table - self._create_test_table() - yield +@pytest.fixture +def mst_catalog(connection_details): + return connection_details.get("catalog") or os.getenv("DATABRICKS_CATALOG") or "main" - # Teardown: Cleanup - self._cleanup() - def _get_fully_qualified_table_name(self) -> str: - """Get the fully qualified table name.""" - return f"{self.catalog}.{self.schema}.{self.TEST_TABLE_NAME}" +@pytest.fixture +def mst_schema(connection_details): + return connection_details.get("schema") or os.getenv("DATABRICKS_SCHEMA") or "default" - def _create_test_table(self): - """Create the test table with Delta format and MST support.""" - fq_table_name = self._get_fully_qualified_table_name() - cursor = self.connection.cursor() - try: - # Drop if exists - cursor.execute(f"DROP TABLE IF EXISTS {fq_table_name}") +@pytest.fixture +def mst_table(request, mst_conn_params, mst_catalog, mst_schema): + """Create a fresh Delta table for the test and drop it afterwards. + + Yields (fq_table_name, table_name). The table is unique per test so tests + can run in parallel without stepping on each other. 
+ """ + table_name = _unique_table_name(request) + fq_table = f"{mst_catalog}.{mst_schema}.{table_name}" - # Create table with Delta and catalog-owned feature for MST compatibility + with sql.connect(**mst_conn_params) as conn: + with conn.cursor() as cursor: + cursor.execute(f"DROP TABLE IF EXISTS {fq_table}") cursor.execute( - f""" - CREATE TABLE IF NOT EXISTS {fq_table_name} - (id INT, value STRING) - USING DELTA - TBLPROPERTIES ('delta.feature.catalogOwned-preview' = 'supported') - """ + f"CREATE TABLE {fq_table} (id INT, value STRING) USING DELTA " + f"TBLPROPERTIES ('delta.feature.catalogManaged' = 'supported')" ) - logger.info(f"Created test table: {fq_table_name}") - finally: - cursor.close() - - def _cleanup(self): - """Cleanup after test: rollback pending transactions, drop table, close connection.""" - try: - # Try to rollback any pending transaction - if ( - self.connection - and self.connection.open - and not self.connection.autocommit - ): - try: - self.connection.rollback() - except Exception as e: - logger.debug( - f"Rollback during cleanup failed (may be expected): {e}" + yield fq_table, table_name + + try: + with sql.connect(**mst_conn_params) as conn: + with conn.cursor() as cursor: + cursor.execute(f"DROP TABLE IF EXISTS {fq_table}") + except Exception as e: + logger.warning(f"Failed to drop {fq_table}: {e}") + + +def _get_row_count(mst_conn_params, fq_table): + """Count rows from a fresh connection (avoids in-txn caching).""" + with sql.connect(**mst_conn_params) as conn: + with conn.cursor() as cursor: + cursor.execute(f"SELECT COUNT(*) FROM {fq_table}") + return cursor.fetchone()[0] + + +def _get_ids(mst_conn_params, fq_table): + """Return the set of ids from a fresh connection.""" + with sql.connect(**mst_conn_params) as conn: + with conn.cursor() as cursor: + cursor.execute(f"SELECT id FROM {fq_table}") + return {row[0] for row in cursor.fetchall()} + + +# ==================== A. 
BASIC CORRECTNESS ==================== + + +class TestMstCorrectness: + """Core MST correctness: commit, rollback, isolation, multi-table.""" + + def test_commit_single_insert(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'committed')") + conn.commit() + + assert _get_row_count(mst_conn_params, fq_table) == 1 + + def test_commit_multiple_inserts(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'a')") + cursor.execute(f"INSERT INTO {fq_table} VALUES (2, 'b')") + cursor.execute(f"INSERT INTO {fq_table} VALUES (3, 'c')") + conn.commit() + + assert _get_row_count(mst_conn_params, fq_table) == 3 + + def test_rollback_single_insert(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'rolled_back')") + conn.rollback() + + assert _get_row_count(mst_conn_params, fq_table) == 0 + + def test_sequential_transactions(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'txn1')") + conn.commit() + + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (2, 'txn2')") + conn.rollback() + + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (3, 'txn3')") + conn.commit() + + assert _get_row_count(mst_conn_params, fq_table) == 2 + + def test_auto_start_after_commit(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'txn1')") + conn.commit() + + # Second INSERT auto-starts a new transaction + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (2, 'txn2')") + conn.rollback() + + assert _get_ids(mst_conn_params, fq_table) == {1} + + def test_auto_start_after_rollback(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'txn1')") + conn.rollback() + + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (2, 'txn2')") + conn.commit() + + assert _get_ids(mst_conn_params, fq_table) == {2} + + def test_update_in_transaction(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'original')") + + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"UPDATE {fq_table} SET value = 'updated' WHERE id = 1") + conn.commit() + + with sql.connect(**mst_conn_params) as conn: + with conn.cursor() as cursor: + cursor.execute(f"SELECT value FROM {fq_table} WHERE id = 1") + assert cursor.fetchone()[0] == "updated" + + def test_delete_in_transaction(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + with conn.cursor() as 
cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'a')") + cursor.execute(f"INSERT INTO {fq_table} VALUES (2, 'b')") + + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"DELETE FROM {fq_table} WHERE id = 1") + conn.commit() + + assert _get_row_count(mst_conn_params, fq_table) == 1 + + def test_multi_table_commit(self, mst_conn_params, mst_table, mst_catalog, mst_schema): + fq_table1, _ = mst_table + fq_table2 = f"{mst_catalog}.{mst_schema}.{_unique_table_name_raw('multi_commit_t2')}" + + with sql.connect(**mst_conn_params) as conn: + with conn.cursor() as cursor: + cursor.execute(f"DROP TABLE IF EXISTS {fq_table2}") + cursor.execute( + f"CREATE TABLE {fq_table2} (id INT, value STRING) USING DELTA " + f"TBLPROPERTIES ('delta.feature.catalogManaged' = 'supported')" + ) + try: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table1} VALUES (1, 't1')") + cursor.execute(f"INSERT INTO {fq_table2} VALUES (1, 't2')") + conn.commit() + + assert _get_row_count(mst_conn_params, fq_table1) == 1 + assert _get_row_count(mst_conn_params, fq_table2) == 1 + finally: + conn.autocommit = True + with conn.cursor() as cursor: + cursor.execute(f"DROP TABLE IF EXISTS {fq_table2}") + + def test_multi_table_rollback(self, mst_conn_params, mst_table, mst_catalog, mst_schema): + fq_table1, _ = mst_table + fq_table2 = f"{mst_catalog}.{mst_schema}.{_unique_table_name_raw('multi_rb_t2')}" + + with sql.connect(**mst_conn_params) as conn: + with conn.cursor() as cursor: + cursor.execute(f"DROP TABLE IF EXISTS {fq_table2}") + cursor.execute( + f"CREATE TABLE {fq_table2} (id INT, value STRING) USING DELTA " + f"TBLPROPERTIES ('delta.feature.catalogManaged' = 'supported')" + ) + try: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table1} VALUES (1, 't1')") + cursor.execute(f"INSERT INTO {fq_table2} VALUES (1, 't2')") + conn.rollback() + + assert _get_row_count(mst_conn_params, fq_table1) == 0 + assert _get_row_count(mst_conn_params, fq_table2) == 0 + finally: + conn.autocommit = True + with conn.cursor() as cursor: + cursor.execute(f"DROP TABLE IF EXISTS {fq_table2}") + + def test_multi_table_atomicity(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'should_rollback')") + with pytest.raises(Exception): + cursor.execute( + "INSERT INTO nonexistent_table_xyz_xyz VALUES (1, 'fail')" + ) + conn.rollback() + + assert _get_row_count(mst_conn_params, fq_table) == 0 + + @pytest.mark.xdist_group(name="mst_repeatable_reads") + def test_repeatable_reads(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'initial')") + + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"SELECT value FROM {fq_table} WHERE id = 1") + first_read = cursor.fetchone()[0] + + # External connection modifies data + with sql.connect(**mst_conn_params) as ext_conn: + with ext_conn.cursor() as ext_cursor: + ext_cursor.execute( + f"UPDATE {fq_table} SET value = 'modified' WHERE id = 1" ) - # Reset to autocommit mode - try: - self.connection.autocommit = True - except Exception as e: - logger.debug(f"Reset autocommit during cleanup failed: {e}") - - # Drop test table - if self.connection and 
self.connection.open: - fq_table_name = self._get_fully_qualified_table_name() - cursor = self.connection.cursor() - try: - cursor.execute(f"DROP TABLE IF EXISTS {fq_table_name}") - logger.info(f"Dropped test table: {fq_table_name}") - except Exception as e: - logger.warning(f"Failed to drop test table: {e}") - finally: - cursor.close() + # Re-read in same txn — should see original value + with conn.cursor() as cursor: + cursor.execute(f"SELECT value FROM {fq_table} WHERE id = 1") + second_read = cursor.fetchone()[0] - finally: - # Close connection - if self.connection: - self.connection.close() - - # ==================== BASIC AUTOCOMMIT TESTS ==================== - - def test_default_autocommit_is_true(self): - """Test that new connection defaults to autocommit=true.""" - assert ( - self.connection.autocommit is True - ), "New connection should have autocommit=true by default" - - def test_set_autocommit_to_false(self): - """Test successfully setting autocommit to false.""" - self.connection.autocommit = False - assert ( - self.connection.autocommit is False - ), "autocommit should be false after setting to false" - - def test_set_autocommit_to_true(self): - """Test successfully setting autocommit back to true.""" - # First disable - self.connection.autocommit = False - assert self.connection.autocommit is False - - # Then enable - self.connection.autocommit = True - assert ( - self.connection.autocommit is True - ), "autocommit should be true after setting to true" - - # ==================== COMMIT TESTS ==================== - - def test_commit_single_insert(self): - """Test successfully committing a transaction with single INSERT.""" - fq_table_name = self._get_fully_qualified_table_name() - - # Start transaction - self.connection.autocommit = False - - # Insert data - cursor = self.connection.cursor() - cursor.execute( - f"INSERT INTO {fq_table_name} (id, value) VALUES (1, 'test_value')" - ) - cursor.close() + assert first_read == second_read, "Repeatable read: value should not change" + conn.rollback() - # Commit - self.connection.commit() + @pytest.mark.xdist_group(name="mst_write_conflict") + def test_write_conflict_single_table(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as setup_conn: + with setup_conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'initial')") - # Verify data is persisted using a new connection - verify_conn = sql.connect(**self.connection_params) + conn1 = sql.connect(**mst_conn_params) + conn2 = sql.connect(**mst_conn_params) try: - verify_cursor = verify_conn.cursor() - verify_cursor.execute(f"SELECT value FROM {fq_table_name} WHERE id = 1") - result = verify_cursor.fetchone() - verify_cursor.close() + conn1.autocommit = False + conn2.autocommit = False - assert result is not None, "Should find inserted row after commit" - assert result[0] == "test_value", "Value should match inserted value" - finally: - verify_conn.close() + with conn1.cursor() as c1: + c1.execute(f"UPDATE {fq_table} SET value = 'conn1' WHERE id = 1") + with conn2.cursor() as c2: + c2.execute(f"UPDATE {fq_table} SET value = 'conn2' WHERE id = 1") - def test_commit_multiple_inserts(self): - """Test successfully committing a transaction with multiple INSERTs.""" - fq_table_name = self._get_fully_qualified_table_name() - - self.connection.autocommit = False - - # Insert multiple rows - cursor = self.connection.cursor() - cursor.execute(f"INSERT INTO {fq_table_name} (id, value) VALUES (1, 'value1')") - 
cursor.execute(f"INSERT INTO {fq_table_name} (id, value) VALUES (2, 'value2')") - cursor.execute(f"INSERT INTO {fq_table_name} (id, value) VALUES (3, 'value3')") - cursor.close() - - self.connection.commit() - - # Verify all rows persisted - verify_conn = sql.connect(**self.connection_params) - try: - verify_cursor = verify_conn.cursor() - verify_cursor.execute(f"SELECT COUNT(*) FROM {fq_table_name}") - result = verify_cursor.fetchone() - verify_cursor.close() - - assert result[0] == 3, "Should have 3 rows after commit" + conn1.commit() + with pytest.raises(Exception): + conn2.commit() finally: - verify_conn.close() - - # ==================== ROLLBACK TESTS ==================== - - def test_rollback_single_insert(self): - """Test successfully rolling back a transaction.""" - fq_table_name = self._get_fully_qualified_table_name() - - self.connection.autocommit = False + try: + conn1.close() + except Exception: + pass + try: + conn2.close() + except Exception: + pass + + def test_read_only_transaction(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'existing')") + + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"SELECT COUNT(*) FROM {fq_table}") + assert cursor.fetchone()[0] == 1 + conn.commit() + + assert _get_row_count(mst_conn_params, fq_table) == 1 + + def test_rollback_after_query_failure(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'before_error')") + with pytest.raises(Exception): + cursor.execute("SELECT * FROM nonexistent_xyz_xyz") + conn.rollback() + + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (2, 'after_recovery')") + conn.commit() + + assert _get_row_count(mst_conn_params, fq_table) == 1 + + def test_multiple_cursors_in_txn(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as c1: + c1.execute(f"INSERT INTO {fq_table} VALUES (1, 'c1')") + with conn.cursor() as c2: + c2.execute(f"INSERT INTO {fq_table} VALUES (2, 'c2')") + with conn.cursor() as c3: + c3.execute(f"INSERT INTO {fq_table} VALUES (3, 'c3')") + conn.commit() + + assert _get_row_count(mst_conn_params, fq_table) == 3 + + def test_parameterized_insert(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute( + f"INSERT INTO {fq_table} VALUES (%(id)s, %(value)s)", + {"id": 1, "value": "parameterized"}, + ) + conn.commit() + + with sql.connect(**mst_conn_params) as conn: + with conn.cursor() as cursor: + cursor.execute(f"SELECT value FROM {fq_table} WHERE id = 1") + assert cursor.fetchone()[0] == "parameterized" + + def test_empty_transaction_rollback(self, mst_conn_params, mst_table): + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + # Rollback with no DML should not raise + conn.rollback() + + def test_close_connection_implicit_rollback(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + conn = sql.connect(**mst_conn_params) + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'pending')") + conn.close() + + assert 
_get_row_count(mst_conn_params, fq_table) == 0 + + +# ==================== B. API-SPECIFIC TESTS ==================== + + +class TestMstApi: + """DB-API-specific tests: autocommit, isolation, error handling.""" + + def test_default_autocommit_is_true(self, mst_conn_params): + with sql.connect(**mst_conn_params) as conn: + assert conn.autocommit is True + + def test_set_autocommit_false(self, mst_conn_params): + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + assert conn.autocommit is False + + def test_commit_without_active_txn_throws(self, mst_conn_params): + with sql.connect(**mst_conn_params) as conn: + with pytest.raises(Exception, match=r"NO_ACTIVE_TRANSACTION"): + conn.commit() + + def test_set_autocommit_during_active_txn_throws( + self, mst_conn_params, mst_table + ): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'active_txn')") + with pytest.raises(Exception): + conn.autocommit = True + conn.rollback() + + def test_supported_isolation_level(self, mst_conn_params): + with sql.connect(**mst_conn_params) as conn: + conn.set_transaction_isolation("REPEATABLE_READ") + assert conn.get_transaction_isolation() == "REPEATABLE_READ" + + def test_unsupported_isolation_level_rejected(self, mst_conn_params): + with sql.connect(**mst_conn_params) as conn: + for level in ["READ_UNCOMMITTED", "READ_COMMITTED", "SERIALIZABLE"]: + with pytest.raises(Exception): + conn.set_transaction_isolation(level) + + +# ==================== C. METADATA RPCs ==================== + + +class TestMstMetadata: + """Metadata RPCs inside active transactions. + + Python uses Thrift RPCs for cursor.columns, cursor.tables, etc. These + RPCs bypass MST context and return non-transactional data — they see + concurrent DDL changes that the transaction shouldn't see. 
+ """ + + def test_cursor_columns_in_mst( + self, mst_conn_params, mst_table, mst_catalog, mst_schema + ): + fq_table, table_name = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'test')") + cursor.columns( + catalog_name=mst_catalog, schema_name=mst_schema, table_name=table_name + ) + columns = cursor.fetchall() + assert len(columns) > 0 + conn.rollback() + + def test_cursor_tables_in_mst( + self, mst_conn_params, mst_table, mst_catalog, mst_schema + ): + fq_table, table_name = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'test')") + cursor.tables( + catalog_name=mst_catalog, schema_name=mst_schema, table_name=table_name + ) + tables = cursor.fetchall() + assert len(tables) > 0 + conn.rollback() + + def test_cursor_schemas_in_mst(self, mst_conn_params, mst_table, mst_catalog): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'test')") + cursor.schemas(catalog_name=mst_catalog) + schemas = cursor.fetchall() + assert len(schemas) > 0 + conn.rollback() + + def test_cursor_catalogs_in_mst(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'test')") + cursor.catalogs() + catalogs = cursor.fetchall() + assert len(catalogs) > 0 + conn.rollback() + + @pytest.mark.xdist_group(name="mst_freshness_columns") + def test_cursor_columns_non_transactional_after_concurrent_ddl( + self, mst_conn_params, mst_table, mst_catalog, mst_schema + ): + """Thrift cursor.columns() bypasses MST — sees concurrent ALTER TABLE.""" + fq_table, table_name = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'test')") + cursor.columns( + catalog_name=mst_catalog, schema_name=mst_schema, table_name=table_name + ) + before_cols = {row[3].lower() for row in cursor.fetchall()} - # Insert data - cursor = self.connection.cursor() - cursor.execute( - f"INSERT INTO {fq_table_name} (id, value) VALUES (100, 'rollback_test')" - ) - cursor.close() + # External connection alters schema + with sql.connect(**mst_conn_params) as ext_conn: + with ext_conn.cursor() as ext_cursor: + ext_cursor.execute( + f"ALTER TABLE {fq_table} ADD COLUMN new_col STRING" + ) - # Rollback - self.connection.rollback() + # Re-read columns in same txn — Thrift RPC bypasses txn isolation, + # so new_col IS visible (proves non-transactional behavior) + with conn.cursor() as cursor: + cursor.columns( + catalog_name=mst_catalog, schema_name=mst_schema, table_name=table_name + ) + after_cols = {row[3].lower() for row in cursor.fetchall()} - # Verify data is NOT persisted - verify_conn = sql.connect(**self.connection_params) - try: - verify_cursor = verify_conn.cursor() - verify_cursor.execute( - f"SELECT COUNT(*) FROM {fq_table_name} WHERE id = 100" + assert "new_col" in after_cols, ( + "Thrift cursor.columns() should see concurrent DDL " + "(non-transactional behavior)" ) - result = verify_cursor.fetchone() - verify_cursor.close() - - assert result[0] == 0, "Rolled back data should not be persisted" - finally: - 
verify_conn.close() - - # ==================== SEQUENTIAL TRANSACTION TESTS ==================== - - def test_multiple_sequential_transactions(self): - """Test executing multiple sequential transactions (commit, commit, rollback).""" - fq_table_name = self._get_fully_qualified_table_name() - - self.connection.autocommit = False - - # First transaction - commit - cursor = self.connection.cursor() - cursor.execute(f"INSERT INTO {fq_table_name} (id, value) VALUES (1, 'txn1')") - cursor.close() - self.connection.commit() - - # Second transaction - commit - cursor = self.connection.cursor() - cursor.execute(f"INSERT INTO {fq_table_name} (id, value) VALUES (2, 'txn2')") - cursor.close() - self.connection.commit() - - # Third transaction - rollback - cursor = self.connection.cursor() - cursor.execute(f"INSERT INTO {fq_table_name} (id, value) VALUES (3, 'txn3')") - cursor.close() - self.connection.rollback() + assert before_cols != after_cols + conn.rollback() + + @pytest.mark.xdist_group(name="mst_freshness_tables") + def test_cursor_tables_non_transactional_after_concurrent_create( + self, mst_conn_params, mst_table, mst_catalog, mst_schema + ): + """Thrift cursor.tables() bypasses MST — sees concurrent CREATE TABLE.""" + fq_table, _ = mst_table + new_table_name = _unique_table_name_raw("freshness_new_tbl") + fq_new_table = f"{mst_catalog}.{mst_schema}.{new_table_name}" - # Verify only first two transactions persisted - verify_conn = sql.connect(**self.connection_params) try: - verify_cursor = verify_conn.cursor() - verify_cursor.execute( - f"SELECT COUNT(*) FROM {fq_table_name} WHERE id IN (1, 2)" - ) - result = verify_cursor.fetchone() - assert result[0] == 2, "Should have 2 committed rows" - - verify_cursor.execute(f"SELECT COUNT(*) FROM {fq_table_name} WHERE id = 3") - result = verify_cursor.fetchone() - assert result[0] == 0, "Rolled back row should not exist" - verify_cursor.close() + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'test')") + cursor.tables( + catalog_name=mst_catalog, + schema_name=mst_schema, + table_name=new_table_name, + ) + assert len(cursor.fetchall()) == 0 + + # External connection creates the table + with sql.connect(**mst_conn_params) as ext_conn: + with ext_conn.cursor() as ext_cursor: + ext_cursor.execute( + f"CREATE TABLE {fq_new_table} (id INT) USING DELTA " + f"TBLPROPERTIES ('delta.feature.catalogManaged' = 'supported')" + ) + + # Re-read in same txn — should see the new table + with conn.cursor() as cursor: + cursor.tables( + catalog_name=mst_catalog, + schema_name=mst_schema, + table_name=new_table_name, + ) + assert len(cursor.fetchall()) > 0, ( + "Thrift cursor.tables() should see concurrent CREATE TABLE " + "(non-transactional behavior)" + ) + conn.rollback() finally: - verify_conn.close() - - def test_auto_start_transaction_after_commit(self): - """Test that new transaction automatically starts after commit.""" - fq_table_name = self._get_fully_qualified_table_name() - - self.connection.autocommit = False + try: + with sql.connect(**mst_conn_params) as conn: + with conn.cursor() as cursor: + cursor.execute(f"DROP TABLE IF EXISTS {fq_new_table}") + except Exception as e: + logger.warning(f"Failed to drop {fq_new_table}: {e}") - # First transaction - commit - cursor = self.connection.cursor() - cursor.execute(f"INSERT INTO {fq_table_name} (id, value) VALUES (1, 'first')") - cursor.close() - self.connection.commit() - # New transaction should start 
automatically - insert and rollback - cursor = self.connection.cursor() - cursor.execute(f"INSERT INTO {fq_table_name} (id, value) VALUES (2, 'second')") - cursor.close() - self.connection.rollback() +# ==================== D. BLOCKED SQL (MSTCheckRule) ==================== - # Verify: first committed, second rolled back - verify_conn = sql.connect(**self.connection_params) - try: - verify_cursor = verify_conn.cursor() - verify_cursor.execute(f"SELECT COUNT(*) FROM {fq_table_name} WHERE id = 1") - result = verify_cursor.fetchone() - assert result[0] == 1, "First insert should be committed" - - verify_cursor.execute(f"SELECT COUNT(*) FROM {fq_table_name} WHERE id = 2") - result = verify_cursor.fetchone() - assert result[0] == 0, "Second insert should be rolled back" - verify_cursor.close() - finally: - verify_conn.close() - def test_auto_start_transaction_after_rollback(self): - """Test that new transaction automatically starts after rollback.""" - fq_table_name = self._get_fully_qualified_table_name() +class TestMstBlockedSql: + """SQL introspection statements inside active transactions. - self.connection.autocommit = False + The server restricts MST to a specific allowlist of commands. The error + message from TRANSACTION_NOT_SUPPORTED.COMMAND is explicit: + "Only SELECT / INSERT / MERGE / UPDATE / DELETE / DESCRIBE TABLE are supported." - # First transaction - rollback - cursor = self.connection.cursor() - cursor.execute(f"INSERT INTO {fq_table_name} (id, value) VALUES (1, 'first')") - cursor.close() - self.connection.rollback() + Blocked (throw + abort txn): + - SHOW COLUMNS, SHOW TABLES, SHOW SCHEMAS, SHOW CATALOGS, SHOW FUNCTIONS + - DESCRIBE QUERY, DESCRIBE TABLE EXTENDED + - SELECT FROM information_schema - # New transaction should start automatically - insert and commit - cursor = self.connection.cursor() - cursor.execute(f"INSERT INTO {fq_table_name} (id, value) VALUES (2, 'second')") - cursor.close() - self.connection.commit() + Allowed: + - DESCRIBE TABLE (basic form — explicitly listed in server's allowlist) + """ - # Verify: first rolled back, second committed - verify_conn = sql.connect(**self.connection_params) - try: - verify_cursor = verify_conn.cursor() - verify_cursor.execute(f"SELECT COUNT(*) FROM {fq_table_name} WHERE id = 1") - result = verify_cursor.fetchone() - assert result[0] == 0, "First insert should be rolled back" - - verify_cursor.execute(f"SELECT COUNT(*) FROM {fq_table_name} WHERE id = 2") - result = verify_cursor.fetchone() - assert result[0] == 1, "Second insert should be committed" - verify_cursor.close() - finally: - verify_conn.close() + def _assert_blocked_and_txn_aborted(self, mst_conn_params, fq_table, blocked_sql): + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'before_blocked')") - # ==================== UPDATE/DELETE OPERATION TESTS ==================== + with pytest.raises(Exception): + cursor.execute(blocked_sql) - def test_update_in_transaction(self): - """Test UPDATE operation in transaction.""" - fq_table_name = self._get_fully_qualified_table_name() - - # First insert a row with autocommit - cursor = self.connection.cursor() - cursor.execute( - f"INSERT INTO {fq_table_name} (id, value) VALUES (1, 'original')" + with pytest.raises(Exception): + cursor.execute( + f"INSERT INTO {fq_table} VALUES (2, 'after_blocked')" + ) + try: + conn.rollback() + except Exception: + pass + + def _assert_not_blocked(self, mst_conn_params, fq_table, 
allowed_sql): + """Assert the SQL succeeds and returns rows inside an active txn.""" + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'before')") + cursor.execute(allowed_sql) + rows = cursor.fetchall() + assert len(rows) > 0 + conn.rollback() + + def test_show_tables_blocked(self, mst_conn_params, mst_table, mst_catalog, mst_schema): + fq_table, _ = mst_table + self._assert_blocked_and_txn_aborted( + mst_conn_params, fq_table, f"SHOW TABLES IN {mst_catalog}.{mst_schema}" ) - cursor.close() - # Start transaction and update - self.connection.autocommit = False - cursor = self.connection.cursor() - cursor.execute(f"UPDATE {fq_table_name} SET value = 'updated' WHERE id = 1") - cursor.close() - self.connection.commit() - - # Verify update persisted - verify_conn = sql.connect(**self.connection_params) - try: - verify_cursor = verify_conn.cursor() - verify_cursor.execute(f"SELECT value FROM {fq_table_name} WHERE id = 1") - result = verify_cursor.fetchone() - assert result[0] == "updated", "Value should be updated after commit" - verify_cursor.close() - finally: - verify_conn.close() - - # ==================== MULTI-TABLE TRANSACTION TESTS ==================== - - def test_multi_table_transaction_commit(self): - """Test atomic commit across multiple tables.""" - fq_table1_name = self._get_fully_qualified_table_name() - table2_name = self.TEST_TABLE_NAME + "_2" - fq_table2_name = f"{self.catalog}.{self.schema}.{table2_name}" - - # Create second table - cursor = self.connection.cursor() - cursor.execute(f"DROP TABLE IF EXISTS {fq_table2_name}") - cursor.execute( - f""" - CREATE TABLE IF NOT EXISTS {fq_table2_name} - (id INT, category STRING) - USING DELTA - TBLPROPERTIES ('delta.feature.catalogOwned-preview' = 'supported') - """ + def test_show_schemas_blocked(self, mst_conn_params, mst_table, mst_catalog): + fq_table, _ = mst_table + self._assert_blocked_and_txn_aborted( + mst_conn_params, fq_table, f"SHOW SCHEMAS IN {mst_catalog}" ) - cursor.close() - try: - # Start transaction and insert into both tables - self.connection.autocommit = False - - cursor = self.connection.cursor() - cursor.execute( - f"INSERT INTO {fq_table1_name} (id, value) VALUES (10, 'table1_data')" - ) - cursor.execute( - f"INSERT INTO {fq_table2_name} (id, category) VALUES (10, 'table2_data')" - ) - cursor.close() + def test_show_catalogs_blocked(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + self._assert_blocked_and_txn_aborted( + mst_conn_params, fq_table, "SHOW CATALOGS" + ) - # Commit both atomically - self.connection.commit() + def test_show_functions_blocked(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + self._assert_blocked_and_txn_aborted( + mst_conn_params, fq_table, "SHOW FUNCTIONS" + ) - # Verify both inserts persisted - verify_conn = sql.connect(**self.connection_params) - try: - verify_cursor = verify_conn.cursor() + def test_describe_table_extended_blocked(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + self._assert_blocked_and_txn_aborted( + mst_conn_params, fq_table, f"DESCRIBE TABLE EXTENDED {fq_table}" + ) - verify_cursor.execute( - f"SELECT COUNT(*) FROM {fq_table1_name} WHERE id = 10" - ) - result = verify_cursor.fetchone() - assert result[0] == 1, "Table1 insert should be committed" + def test_information_schema_blocked(self, mst_conn_params, mst_table, mst_catalog): + fq_table, _ = mst_table + self._assert_blocked_and_txn_aborted( + mst_conn_params, 
+ fq_table, + f"SELECT * FROM {mst_catalog}.information_schema.columns LIMIT 1", + ) - verify_cursor.execute( - f"SELECT COUNT(*) FROM {fq_table2_name} WHERE id = 10" - ) - result = verify_cursor.fetchone() - assert result[0] == 1, "Table2 insert should be committed" + def test_show_columns_blocked(self, mst_conn_params, mst_table): + """SHOW COLUMNS is blocked in MST (ShowDeltaTableColumnsCommand).""" + fq_table, _ = mst_table + self._assert_blocked_and_txn_aborted( + mst_conn_params, fq_table, f"SHOW COLUMNS IN {fq_table}" + ) - verify_cursor.close() - finally: - verify_conn.close() + def test_describe_query_blocked(self, mst_conn_params, mst_table): + """DESCRIBE QUERY is blocked in MST (DescribeQueryCommand).""" + fq_table, _ = mst_table + self._assert_blocked_and_txn_aborted( + mst_conn_params, + fq_table, + f"DESCRIBE QUERY SELECT * FROM {fq_table}", + ) - finally: - # Cleanup second table - self.connection.autocommit = True - cursor = self.connection.cursor() - cursor.execute(f"DROP TABLE IF EXISTS {fq_table2_name}") - cursor.close() - - def test_multi_table_transaction_rollback(self): - """Test atomic rollback across multiple tables.""" - fq_table1_name = self._get_fully_qualified_table_name() - table2_name = self.TEST_TABLE_NAME + "_2" - fq_table2_name = f"{self.catalog}.{self.schema}.{table2_name}" - - # Create second table - cursor = self.connection.cursor() - cursor.execute(f"DROP TABLE IF EXISTS {fq_table2_name}") - cursor.execute( - f""" - CREATE TABLE IF NOT EXISTS {fq_table2_name} - (id INT, category STRING) - USING DELTA - TBLPROPERTIES ('delta.feature.catalogOwned-preview' = 'supported') - """ + # DESCRIBE TABLE is explicitly listed as an allowed command in the server's + # TRANSACTION_NOT_SUPPORTED.COMMAND error message: + # "Only SELECT / INSERT / MERGE / UPDATE / DELETE / DESCRIBE TABLE are supported." + def test_describe_table_not_blocked(self, mst_conn_params, mst_table): + """DESCRIBE TABLE succeeds in MST — explicitly allowed by the server.""" + fq_table, _ = mst_table + self._assert_not_blocked( + mst_conn_params, fq_table, f"DESCRIBE TABLE {fq_table}" ) - cursor.close() - try: - # Start transaction and insert into both tables - self.connection.autocommit = False - cursor = self.connection.cursor() - cursor.execute( - f"INSERT INTO {fq_table1_name} (id, value) VALUES (20, 'rollback1')" - ) - cursor.execute( - f"INSERT INTO {fq_table2_name} (id, category) VALUES (20, 'rollback2')" - ) - cursor.close() +# ==================== E. 
EXECUTE VARIANTS ==================== - # Rollback both atomically - self.connection.rollback() - # Verify both inserts were rolled back - verify_conn = sql.connect(**self.connection_params) - try: - verify_cursor = verify_conn.cursor() +class TestMstExecuteVariants: + """Execute method variants (executemany) inside MST.""" - verify_cursor.execute( - f"SELECT COUNT(*) FROM {fq_table1_name} WHERE id = 20" + def test_executemany_in_txn(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.executemany( + f"INSERT INTO {fq_table} VALUES (%(id)s, %(value)s)", + [ + {"id": 1, "value": "a"}, + {"id": 2, "value": "b"}, + {"id": 3, "value": "c"}, + ], ) - result = verify_cursor.fetchone() - assert result[0] == 0, "Table1 insert should be rolled back" - - verify_cursor.execute( - f"SELECT COUNT(*) FROM {fq_table2_name} WHERE id = 20" + conn.commit() + + assert _get_row_count(mst_conn_params, fq_table) == 3 + + def test_executemany_rollback_in_txn(self, mst_conn_params, mst_table): + fq_table, _ = mst_table + with sql.connect(**mst_conn_params) as conn: + conn.autocommit = False + with conn.cursor() as cursor: + cursor.executemany( + f"INSERT INTO {fq_table} VALUES (%(id)s, %(value)s)", + [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}], ) - result = verify_cursor.fetchone() - assert result[0] == 0, "Table2 insert should be rolled back" + conn.rollback() - verify_cursor.close() - finally: - verify_conn.close() - - finally: - # Cleanup second table - self.connection.autocommit = True - cursor = self.connection.cursor() - cursor.execute(f"DROP TABLE IF EXISTS {fq_table2_name}") - cursor.close() - - # ==================== ERROR HANDLING TESTS ==================== - - def test_set_autocommit_during_active_transaction(self): - """Test that setting autocommit during an active transaction throws error.""" - fq_table_name = self._get_fully_qualified_table_name() - - # Start transaction - self.connection.autocommit = False - cursor = self.connection.cursor() - cursor.execute(f"INSERT INTO {fq_table_name} (id, value) VALUES (99, 'test')") - cursor.close() - - # Try to set autocommit=True during active transaction - with pytest.raises(TransactionError) as exc_info: - self.connection.autocommit = True - - # Verify error message mentions autocommit or active transaction - error_msg = str(exc_info.value).lower() - assert ( - "autocommit" in error_msg or "active transaction" in error_msg - ), "Error should mention autocommit or active transaction" - - # Cleanup - rollback the transaction - self.connection.rollback() - - def test_commit_without_active_transaction_throws_error(self): - """Test that commit() throws error when autocommit=true (no active transaction).""" - # Ensure autocommit is true (default) - assert self.connection.autocommit is True - - # Attempt commit without active transaction should throw - with pytest.raises(TransactionError) as exc_info: - self.connection.commit() - - # Verify error message indicates no active transaction - error_message = str(exc_info.value) - assert ( - "MULTI_STATEMENT_TRANSACTION_NO_ACTIVE_TRANSACTION" in error_message - or "no active transaction" in error_message.lower() - ), "Error should indicate no active transaction" - - def test_rollback_without_active_transaction_is_safe(self): - """Test that rollback() without active transaction is a safe no-op.""" - # With autocommit=true (no active transaction) - assert self.connection.autocommit is True - - 
# ROLLBACK should be safe (no exception) - self.connection.rollback() - - # Verify connection is still usable - assert self.connection.autocommit is True - assert self.connection.open is True - - # ==================== TRANSACTION ISOLATION TESTS ==================== - - def test_get_transaction_isolation_returns_repeatable_read(self): - """Test that get_transaction_isolation() returns REPEATABLE_READ.""" - isolation_level = self.connection.get_transaction_isolation() - assert ( - isolation_level == "REPEATABLE_READ" - ), "Databricks MST should use REPEATABLE_READ (Snapshot Isolation)" - - def test_set_transaction_isolation_accepts_repeatable_read(self): - """Test that set_transaction_isolation() accepts REPEATABLE_READ.""" - # Should not raise - these are all valid formats - self.connection.set_transaction_isolation("REPEATABLE_READ") - self.connection.set_transaction_isolation("REPEATABLE READ") - self.connection.set_transaction_isolation("repeatable_read") - self.connection.set_transaction_isolation("repeatable read") - - def test_set_transaction_isolation_rejects_unsupported_level(self): - """Test that set_transaction_isolation() rejects unsupported levels.""" - with pytest.raises(NotSupportedError) as exc_info: - self.connection.set_transaction_isolation("READ_COMMITTED") - - error_message = str(exc_info.value) - assert "not supported" in error_message.lower() - assert "READ_COMMITTED" in error_message + assert _get_row_count(mst_conn_params, fq_table) == 0 From ff921ed2d48a35ac5027c845efc11c90a7c7b7b1 Mon Sep 17 00:00:00 2001 From: Madhavendra Rathore Date: Tue, 21 Apr 2026 21:19:39 +0530 Subject: [PATCH 22/24] Add SPOG routing support for account-level vanity URLs (#767) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add SPOG routing support for account-level vanity URLs SPOG replaces per-workspace hostnames with account-level URLs. When httpPath contains ?o=, the connector now extracts the workspace ID and injects x-databricks-org-id as an HTTP header on all non-OAuth endpoints (SEA, telemetry, feature flags). Changes: - Fix warehouse ID regex to stop at query params ([^?&]+ instead of .+) - Extract ?o= from httpPath once during session init, store as _spog_headers - Propagate org-id header to telemetry client via extra_headers param - Propagate org-id header to feature flags client - Do NOT propagate to OAuth endpoints (they reject it with 400) Signed-off-by: Madhavendra Rathore Co-authored-by: Isaac Signed-off-by: Madhavendra Rathore * Add debug logging for SPOG x-databricks-org-id header extraction Mirrors the JDBC driver's logging pattern. Emits at DEBUG level in three code paths of _extract_spog_headers: 1. http_path has a query string but no ?o= param — log and skip. 2. x-databricks-org-id already set by the caller (via http_headers) — log and skip (don't override explicit user header). 3. Injection happens — log the extracted workspace ID so customers diagnosing SPOG routing can confirm the header was added. Helps with customer support: when a customer reports "SPOG isn't routing correctly", they can enable DEBUG logging and immediately see whether the connector saw their ?o= value. 
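For example, assuming only the standard library re module, the effect of the
warehouse-ID regex fix described above can be seen directly (standalone sketch,
not a connector code path; the hostname and org id are made-up values):

    import re

    http_path = "/sql/1.0/warehouses/abc123?o=6051921418418893"

    old_pattern = re.compile(r".*/warehouses/(.+)")      # previous pattern, greedy
    new_pattern = re.compile(r".*/warehouses/([^?&]+)")  # stops at ? or &

    # Old pattern swallows the SPOG query string into the warehouse ID:
    print(old_pattern.match(http_path).group(1))  # abc123?o=6051921418418893
    # New pattern extracts only the warehouse ID:
    print(new_pattern.match(http_path).group(1))  # abc123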
Signed-off-by: Madhavendra Rathore Signed-off-by: Madhavendra Rathore --------- Signed-off-by: Madhavendra Rathore Signed-off-by: Madhavendra Rathore --- src/databricks/sql/backend/sea/backend.py | 5 +- src/databricks/sql/client.py | 1 + src/databricks/sql/common/feature_flag.py | 1 + src/databricks/sql/session.py | 46 +++++++++++++++++++ .../sql/telemetry/telemetry_client.py | 6 +++ tests/unit/test_sea_backend.py | 33 +++++++++++++ tests/unit/test_session.py | 44 ++++++++++++++++++ 7 files changed, 134 insertions(+), 2 deletions(-) diff --git a/src/databricks/sql/backend/sea/backend.py b/src/databricks/sql/backend/sea/backend.py index ff130cd39..04c79a18b 100644 --- a/src/databricks/sql/backend/sea/backend.py +++ b/src/databricks/sql/backend/sea/backend.py @@ -188,8 +188,9 @@ def _extract_warehouse_id(self, http_path: str) -> str: ValueError: If the warehouse ID cannot be extracted from the path """ - warehouse_pattern = re.compile(r".*/warehouses/(.+)") - endpoint_pattern = re.compile(r".*/endpoints/(.+)") + # [^?&]+ stops at query params (e.g. ?o= for SPOG routing) + warehouse_pattern = re.compile(r".*/warehouses/([^?&]+)") + endpoint_pattern = re.compile(r".*/endpoints/([^?&]+)") for pattern in [warehouse_pattern, endpoint_pattern]: match = pattern.match(http_path) diff --git a/src/databricks/sql/client.py b/src/databricks/sql/client.py index 2aeea175e..fe52f0c79 100755 --- a/src/databricks/sql/client.py +++ b/src/databricks/sql/client.py @@ -353,6 +353,7 @@ def read(self) -> Optional[OAuthToken]: host_url=self.session.host, batch_size=self.telemetry_batch_size, client_context=client_context, + extra_headers=self.session.get_spog_headers(), ) self._telemetry_client = TelemetryClientFactory.get_telemetry_client( diff --git a/src/databricks/sql/common/feature_flag.py b/src/databricks/sql/common/feature_flag.py index 36e4b8a02..0b2c7490b 100644 --- a/src/databricks/sql/common/feature_flag.py +++ b/src/databricks/sql/common/feature_flag.py @@ -113,6 +113,7 @@ def _refresh_flags(self): # Authenticate the request self._connection.session.auth_provider.add_headers(headers) headers["User-Agent"] = self._connection.session.useragent_header + headers.update(self._connection.session.get_spog_headers()) response = self._http_client.request( HttpMethod.GET, self._feature_flag_endpoint, headers=headers, timeout=30 diff --git a/src/databricks/sql/session.py b/src/databricks/sql/session.py index 1588d9f79..65c0d6aca 100644 --- a/src/databricks/sql/session.py +++ b/src/databricks/sql/session.py @@ -72,6 +72,14 @@ def __init__( base_headers = [("User-Agent", self.useragent_header)] all_headers = (http_headers or []) + base_headers + # Extract ?o= from http_path for SPOG routing. + # On SPOG hosts, the httpPath contains ?o= which routes Thrift + # requests via the URL. For SEA, telemetry, and feature flags (which use + # separate endpoints), we inject x-databricks-org-id as an HTTP header. + self._spog_headers = self._extract_spog_headers(http_path, all_headers) + if self._spog_headers: + all_headers = all_headers + list(self._spog_headers.items()) + self.ssl_options = SSLOptions( # Double negation is generally a bad thing, but we have to keep backward compatibility tls_verify=not kwargs.get( @@ -136,6 +144,44 @@ def _create_backend( } return databricks_client_class(**common_args) + @staticmethod + def _extract_spog_headers(http_path, existing_headers): + """Extract ?o= from http_path and return as a header dict for SPOG routing.""" + if not http_path or "?" 
not in http_path: + return {} + + from urllib.parse import parse_qs + + query_string = http_path.split("?", 1)[1] + params = parse_qs(query_string) + org_id = params.get("o", [None])[0] + if not org_id: + logger.debug( + "SPOG header extraction: http_path has query string but no ?o= param, " + "skipping x-databricks-org-id injection" + ) + return {} + + # Don't override if explicitly set + if any(k == "x-databricks-org-id" for k, _ in existing_headers): + logger.debug( + "SPOG header extraction: x-databricks-org-id already set by caller, " + "not overriding with ?o=%s from http_path", + org_id, + ) + return {} + + logger.debug( + "SPOG header extraction: injecting x-databricks-org-id=%s " + "(extracted from ?o= in http_path)", + org_id, + ) + return {"x-databricks-org-id": org_id} + + def get_spog_headers(self): + """Returns SPOG routing headers (x-databricks-org-id) if ?o= was in http_path.""" + return dict(self._spog_headers) + def open(self): self._session_id = self.backend.open_session( session_configuration=self.session_configuration, diff --git a/src/databricks/sql/telemetry/telemetry_client.py b/src/databricks/sql/telemetry/telemetry_client.py index 408162400..55d845e46 100644 --- a/src/databricks/sql/telemetry/telemetry_client.py +++ b/src/databricks/sql/telemetry/telemetry_client.py @@ -188,6 +188,7 @@ def __init__( executor, batch_size: int, client_context, + extra_headers: Optional[Dict[str, str]] = None, ) -> None: logger.debug("Initializing TelemetryClient for connection: %s", session_id_hex) self._telemetry_enabled = telemetry_enabled @@ -195,6 +196,7 @@ def __init__( self._session_id_hex = session_id_hex self._auth_provider = auth_provider self._user_agent = None + self._extra_headers = extra_headers or {} # OPTIMIZATION: Use lock-free Queue instead of list + lock # Queue is thread-safe internally and has better performance under concurrency @@ -287,6 +289,8 @@ def _send_telemetry(self, events): if self._auth_provider: self._auth_provider.add_headers(headers) + headers.update(self._extra_headers) + try: logger.debug("Submitting telemetry request to thread pool") @@ -587,6 +591,7 @@ def initialize_telemetry_client( host_url, batch_size, client_context, + extra_headers=None, ): """ Initialize a telemetry client for a specific connection if telemetry is enabled. 
@@ -627,6 +632,7 @@ def initialize_telemetry_client( executor=TelemetryClientFactory._executor, batch_size=batch_size, client_context=client_context, + extra_headers=extra_headers, ) TelemetryClientFactory._clients[ host_url diff --git a/tests/unit/test_sea_backend.py b/tests/unit/test_sea_backend.py index f71e60943..24a5e8242 100644 --- a/tests/unit/test_sea_backend.py +++ b/tests/unit/test_sea_backend.py @@ -143,6 +143,39 @@ def test_initialization(self, mock_http_client): ) assert client2.warehouse_id == "def456" + # Test with SPOG query param ?o= in http_path + client_spog = SeaDatabricksClient( + server_hostname="test-server.databricks.com", + port=443, + http_path="/sql/1.0/warehouses/abc123?o=6051921418418893", + http_headers=[], + auth_provider=AuthProvider(), + ssl_options=SSLOptions(), + ) + assert client_spog.warehouse_id == "abc123" + + # Test with SPOG query param on endpoints path + client_spog_ep = SeaDatabricksClient( + server_hostname="test-server.databricks.com", + port=443, + http_path="/sql/1.0/endpoints/def456?o=6051921418418893", + http_headers=[], + auth_provider=AuthProvider(), + ssl_options=SSLOptions(), + ) + assert client_spog_ep.warehouse_id == "def456" + + # Test with multiple query params + client_spog_multi = SeaDatabricksClient( + server_hostname="test-server.databricks.com", + port=443, + http_path="/sql/1.0/warehouses/abc123?o=123&extra=val", + http_headers=[], + auth_provider=AuthProvider(), + ssl_options=SSLOptions(), + ) + assert client_spog_multi.warehouse_id == "abc123" + # Test with custom max_download_threads client3 = SeaDatabricksClient( server_hostname="test-server.databricks.com", diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index aa7e7f02b..136c99e53 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -8,6 +8,7 @@ THandleIdentifier, ) from databricks.sql.backend.types import SessionId, BackendType +from databricks.sql.session import Session import databricks.sql @@ -226,3 +227,46 @@ def test_query_tags_dict_takes_precedence_over_session_config(self, mock_client_ call_kwargs = mock_client_class.return_value.open_session.call_args[1] assert call_kwargs["session_configuration"]["QUERY_TAGS"] == "team:new-team" + + +class TestSpogHeaders: + """Unit tests for SPOG header extraction from http_path.""" + + def test_extracts_org_id_from_query_param(self): + result = Session._extract_spog_headers( + "/sql/1.0/warehouses/abc123?o=6051921418418893", [] + ) + assert result == {"x-databricks-org-id": "6051921418418893"} + + def test_no_query_param_returns_empty(self): + result = Session._extract_spog_headers( + "/sql/1.0/warehouses/abc123", [] + ) + assert result == {} + + def test_no_o_param_returns_empty(self): + result = Session._extract_spog_headers( + "/sql/1.0/warehouses/abc123?other=value", [] + ) + assert result == {} + + def test_empty_http_path_returns_empty(self): + result = Session._extract_spog_headers("", []) + assert result == {} + + def test_none_http_path_returns_empty(self): + result = Session._extract_spog_headers(None, []) + assert result == {} + + def test_explicit_header_takes_precedence(self): + existing = [("x-databricks-org-id", "explicit-value")] + result = Session._extract_spog_headers( + "/sql/1.0/warehouses/abc123?o=6051921418418893", existing + ) + assert result == {} + + def test_multiple_query_params(self): + result = Session._extract_spog_headers( + "/sql/1.0/warehouses/abc123?o=12345&extra=val", [] + ) + assert result == {"x-databricks-org-id": "12345"} From 
2926daab0803304c47c5a9a850a021d0fa67bd52 Mon Sep 17 00:00:00 2001 From: Madhavendra Rathore Date: Thu, 23 Apr 2026 11:55:39 +0530 Subject: [PATCH 23/24] Bump to version 4.2.6 (#777) Signed-off-by: Madhavendra Rathore --- CHANGELOG.md | 11 +++++++++++ pyproject.toml | 2 +- src/databricks/sql/__init__.py | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ba3bb1a8..fc89750d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Release History +# 4.2.6 (2026-04-22) +- Add SPOG routing support for account-level vanity URLs (databricks/databricks-sql-python#767 by @msrathore-db) +- Fix dependency_manager: handle PEP 440 ~= compatible release syntax (databricks/databricks-sql-python#776 by @vikrantpuppala) +- Bump thrift to fix deprecation warning (databricks/databricks-sql-python#733 by @Korijn) +- Add AI coding agent detection to User-Agent header (databricks/databricks-sql-python#740 by @vikrantpuppala) +- Add statement-level query_tags support for SEA backend (databricks/databricks-sql-python#754 by @sreekanth-db) +- Update PyArrow concatenation of tables to use promote_options as default (databricks/databricks-sql-python#751 by @jprakash-db) +- Fix float inference to use DoubleParameter (64-bit) instead of FloatParameter (databricks/databricks-sql-python#742 by @Shubhambhusate) +- Allow specifying query_tags as a dict upon connection creation (databricks/databricks-sql-python#749 by @jiabin-hu) +- Add query_tags parameter support for execute methods (databricks/databricks-sql-python#736 by @jiabin-hu) + # 4.2.5 (2026-02-09) - Fix feature-flag endpoint retries in gov region (databricks/databricks-sql-python#735 by @samikshya-db) - Improve telemetry lifecycle management (databricks/databricks-sql-python#734 by @msrathore-db) diff --git a/pyproject.toml b/pyproject.toml index e1ce3b73f..5e9f7f0ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "databricks-sql-connector" -version = "4.2.5" +version = "4.2.6" description = "Databricks SQL Connector for Python" authors = ["Databricks "] license = "Apache-2.0" diff --git a/src/databricks/sql/__init__.py b/src/databricks/sql/__init__.py index c9195b89f..493ffe3a2 100644 --- a/src/databricks/sql/__init__.py +++ b/src/databricks/sql/__init__.py @@ -71,7 +71,7 @@ def __repr__(self): DATE = DBAPITypeObject("date") ROWID = DBAPITypeObject() -__version__ = "4.2.5" +__version__ = "4.2.6" USER_AGENT_NAME = "PyDatabricksSqlConnector" # These two functions are pyhive legacy From ee63b811a3c71961495450ffee0a3bd5ad856466 Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Thu, 23 Apr 2026 15:37:14 +0530 Subject: [PATCH 24/24] [PECOBLR-2461] Fix test_show_columns_blocked: SHOW COLUMNS now allowed in MST (#778) The server's MSTCheckRule allowlist has been broadened to include SHOW COLUMNS (ShowDeltaTableColumnsCommand). Flip the test to assert SHOW COLUMNS succeeds inside an MST transaction, matching the pattern already used by test_describe_table_not_blocked. Other SHOW variants (SHOW SCHEMAS/TABLES/CATALOGS/FUNCTIONS), DESCRIBE QUERY, DESCRIBE TABLE EXTENDED, and information_schema remain blocked as expected. 
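A minimal probe, using the same connection API the e2e tests rely on, confirms
which behavior a given warehouse exhibits inside an open transaction
(conn_params and fq_table are placeholder names, not part of this change; the
table is assumed to have (id INT, value STRING) columns like the test fixture):

    from databricks import sql

    with sql.connect(**conn_params) as conn:
        conn.autocommit = False  # opens a multi-statement transaction
        with conn.cursor() as cursor:
            cursor.execute(f"INSERT INTO {fq_table} VALUES (1, 'probe')")
            try:
                cursor.execute(f"SHOW COLUMNS IN {fq_table}")
                print("SHOW COLUMNS allowed, rows:", len(cursor.fetchall()))
            except Exception as exc:
                print("SHOW COLUMNS blocked:", exc)
        try:
            conn.rollback()
        except Exception:
            pass  # txn may already be aborted if the statement was blocked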
Co-authored-by: Isaac Signed-off-by: Vikrant Puppala --- tests/e2e/test_transactions.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/e2e/test_transactions.py b/tests/e2e/test_transactions.py index 9bf74baad..4fb7918b9 100644 --- a/tests/e2e/test_transactions.py +++ b/tests/e2e/test_transactions.py @@ -624,17 +624,21 @@ def test_cursor_tables_non_transactional_after_concurrent_create( class TestMstBlockedSql: """SQL introspection statements inside active transactions. - The server restricts MST to a specific allowlist of commands. The error - message from TRANSACTION_NOT_SUPPORTED.COMMAND is explicit: + The server restricts MST to an allowlist enforced by MSTCheckRule. The + TRANSACTION_NOT_SUPPORTED.COMMAND error originally advertised only: "Only SELECT / INSERT / MERGE / UPDATE / DELETE / DESCRIBE TABLE are supported." + The server has since broadened the allowlist to include SHOW COLUMNS + (ShowDeltaTableColumnsCommand), observed on current DBSQL warehouses. + Blocked (throw + abort txn): - - SHOW COLUMNS, SHOW TABLES, SHOW SCHEMAS, SHOW CATALOGS, SHOW FUNCTIONS + - SHOW TABLES, SHOW SCHEMAS, SHOW CATALOGS, SHOW FUNCTIONS - DESCRIBE QUERY, DESCRIBE TABLE EXTENDED - SELECT FROM information_schema Allowed: - - DESCRIBE TABLE (basic form — explicitly listed in server's allowlist) + - DESCRIBE TABLE (basic form) + - SHOW COLUMNS """ def _assert_blocked_and_txn_aborted(self, mst_conn_params, fq_table, blocked_sql): @@ -704,10 +708,10 @@ def test_information_schema_blocked(self, mst_conn_params, mst_table, mst_catalo f"SELECT * FROM {mst_catalog}.information_schema.columns LIMIT 1", ) - def test_show_columns_blocked(self, mst_conn_params, mst_table): - """SHOW COLUMNS is blocked in MST (ShowDeltaTableColumnsCommand).""" + def test_show_columns_not_blocked(self, mst_conn_params, mst_table): + """SHOW COLUMNS succeeds in MST — allowed by the server's MSTCheckRule allowlist.""" fq_table, _ = mst_table - self._assert_blocked_and_txn_aborted( + self._assert_not_blocked( mst_conn_params, fq_table, f"SHOW COLUMNS IN {fq_table}" )