fix(bigquery): Enable list inference for parquet loads in offline_write_batch

Jwrede · rpathade · commit de050abdba5e · 2026-05-20T20:37:09.000-07:00
When pushing features with array/list types (e.g. STRING_LIST) to BigQuery via offline_write_batch, the data arrives as empty arrays because BigQuery's Parquet loader does not infer list structure by default. Set parquet_options.enable_list_inference = True on the LoadJobConfig so array columns are written correctly.

Fixes feast-dev#5845

Signed-off-by: Jonathan Wrede <wrede.jonathan00@gmail.com>
Signed-off-by: RutujaPathade <73137503+RutujaPathade@users.noreply.github.com>
diff --git a/sdk/python/feast/infra/offline_stores/bigquery.py b/sdk/python/feast/infra/offline_stores/bigquery.py
@@ -434,11 +434,15 @@ def offline_write_batch(
             location=config.offline_store.location,
         )
 
+        parquet_options = bigquery.ParquetOptions()
+        parquet_options.enable_list_inference = True
+
         job_config = bigquery.LoadJobConfig(
             source_format=bigquery.SourceFormat.PARQUET,
             schema=arrow_schema_to_bq_schema(pa_schema),
             create_disposition=config.offline_store.table_create_disposition,
             write_disposition="WRITE_APPEND",  # Default but included for clarity
+            parquet_options=parquet_options,
         )
 
         with tempfile.TemporaryFile() as parquet_temp_file:
diff --git a/sdk/python/tests/unit/infra/offline_stores/test_bigquery.py b/sdk/python/tests/unit/infra/offline_stores/test_bigquery.py
@@ -200,3 +200,64 @@ def test_table_property_unaffected_by_query_priority(self):
             timestamp_field="ts",
         )
         assert source.table == "project.dataset.write_target"
+
+
+class TestOfflineWriteBatch:
+    @patch("feast.infra.offline_stores.bigquery._get_bigquery_client")  # injected below as mock_get_client
+    def test_offline_write_batch_enables_list_inference(self, mock_get_client):
+        """LoadJobConfig must set parquet_options.enable_list_inference = True
+        so that BigQuery correctly interprets PyArrow list columns from parquet.
+        """
+        from unittest.mock import MagicMock
+
+        source = BigQuerySource(
+            name="test",
+            table="project.dataset.table",
+            timestamp_field="ts",
+        )
+        fv = MagicMock()  # stand-in feature view; only batch_source is set explicitly
+        fv.batch_source = source
+
+        pa_schema = pyarrow.schema(
+            [
+                pyarrow.field("entity_id", pyarrow.string()),
+                pyarrow.field("tags", pyarrow.list_(pyarrow.string())),  # list column: the case under test
+                pyarrow.field("ts", pyarrow.timestamp("us", tz="UTC")),
+            ]
+        )
+        pa_table = pyarrow.table(
+            {
+                "entity_id": ["e1"],
+                "tags": [["a", "b"]],
+                "ts": [datetime(2024, 1, 1, tzinfo=timezone.utc)],
+            },
+            schema=pa_schema,
+        )
+
+        mock_client = MagicMock()
+        mock_get_client.return_value = mock_client  # the patched client factory hands back this mock
+        mock_client.load_table_from_file.return_value = MagicMock()  # explicit stand-in for the call's return value
+
+        config = RepoConfig(
+            registry="gs://test/registry.db",
+            project="test",
+            provider="gcp",
+            offline_store=BigQueryOfflineStoreConfig(project_id="test-project"),
+            online_store=SqliteOnlineStoreConfig(),
+        )
+
+        with patch(  # stub the schema lookup with the schema and column names built above
+            "feast.infra.offline_stores.offline_utils.get_pyarrow_schema_from_batch_source",
+            return_value=(pa_schema, pa_table.column_names),
+        ):
+            BigQueryOfflineStore.offline_write_batch(
+                config=config,
+                feature_view=fv,
+                table=pa_table,
+                progress=None,
+            )
+
+        call_kwargs = mock_client.load_table_from_file.call_args  # (args, kwargs) of the most recent call
+        job_config = call_kwargs[1]["job_config"]  # index 1 is kwargs; requires job_config passed by keyword
+        assert job_config.parquet_options is not None
+        assert job_config.parquet_options.enable_list_inference is True

Original file line number	Diff line number	Diff line change
`@@ -434,11 +434,15 @@ def offline_write_batch(`
`434`	`434`	`location=config.offline_store.location,`
`435`	`435`	`)`
`436`	`436`
	`437`	`+ parquet_options = bigquery.ParquetOptions()`
	`438`	`+ parquet_options.enable_list_inference = True`
	`439`	`+`
`437`	`440`	`job_config = bigquery.LoadJobConfig(`
`438`	`441`	`source_format=bigquery.SourceFormat.PARQUET,`
`439`	`442`	`schema=arrow_schema_to_bq_schema(pa_schema),`
`440`	`443`	`create_disposition=config.offline_store.table_create_disposition,`
`441`	`444`	`write_disposition="WRITE_APPEND", # Default but included for clarity`
	`445`	`+ parquet_options=parquet_options,`
`442`	`446`	`)`
`443`	`447`
`444`	`448`	`with tempfile.TemporaryFile() as parquet_temp_file:`