From 3137caf0d671e34fcd69b27f091642125d2f3a62 Mon Sep 17 00:00:00 2001 From: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> Date: Mon, 16 Mar 2026 13:53:23 +0000 Subject: [PATCH 01/19] types: resolve env vars in connector cfg Fixes: #5799 Follows the same scheme as the kubernetes secret resolver. To resolve an env var successfully, the string must start with: `${env:` and end with `}`. The env var name then has to start from a valid alphabet, or underscore. The name then may contain numbers. Example: `"table": "${env:TABLE_NAME}"` Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> --- crates/feldera-types/src/secret_ref.rs | 148 +++++++++++++++++++- crates/feldera-types/src/secret_resolver.rs | 131 ++++++++++++++++- 2 files changed, 269 insertions(+), 10 deletions(-) diff --git a/crates/feldera-types/src/secret_ref.rs b/crates/feldera-types/src/secret_ref.rs index 6c9bcbaf12f..b06ddd2b4e7 100644 --- a/crates/feldera-types/src/secret_ref.rs +++ b/crates/feldera-types/src/secret_ref.rs @@ -6,6 +6,33 @@ use thiserror::Error as ThisError; /// RFC 1123 specification for a DNS label, which is also used by Kubernetes. pub const PATTERN_RFC_1123_DNS_LABEL: &str = r"^[a-z0-9]+(-[a-z0-9]+)*$"; +/// POSIX pattern for an environment variable name. +pub const PATTERN_ENV_VAR_NAME: &str = r"^[a-zA-Z_][a-zA-Z0-9_]*$"; + +#[derive(Debug, Clone, PartialEq, Eq, ThisError)] +pub enum EnvVarNameParseError { + #[error("cannot be empty")] + Empty, + #[error( + "must only contain alphanumeric characters and underscores (_), and start with a letter or underscore" + )] + InvalidFormat, +} + +/// Validates it is a valid POSIX environment variable name. 
+pub fn validate_env_var_name(name: &str) -> Result<(), EnvVarNameParseError> { + if name.is_empty() { + Err(EnvVarNameParseError::Empty) + } else { + let re = Regex::new(PATTERN_ENV_VAR_NAME).expect("valid regular expression"); + if re.is_match(name) { + Ok(()) + } else { + Err(EnvVarNameParseError::InvalidFormat) + } + } +} + #[derive(Debug, Clone, PartialEq, Eq, ThisError)] pub enum KubernetesSecretNameParseError { #[error("cannot be empty")] @@ -90,6 +117,11 @@ pub enum SecretRef { /// Key inside the `data:` section of the `Secret` object. data_key: String, }, + /// Reference to a process environment variable. + EnvVar { + /// Name of the environment variable. + name: String, + }, } impl Display for SecretRef { @@ -98,6 +130,9 @@ impl Display for SecretRef { SecretRef::Kubernetes { name, data_key } => { write!(f, "${{secret:kubernetes:{name}/{data_key}}}") } + SecretRef::EnvVar { name } => { + write!(f, "${{env:{name}}}") + } } } } @@ -134,26 +169,45 @@ pub enum MaybeSecretRefParseError { data_key: String, e: KubernetesSecretDataKeyParseError, }, + #[error( + "environment variable reference '{env_ref_str}' has name '{name}' which is not valid: {e}" + )] + InvalidEnvVarName { + env_ref_str: String, + name: String, + e: EnvVarNameParseError, + }, + #[error( + "environment variable reference '{env_ref_str}' is not valid: name cannot be empty" + )] + EmptyEnvVarName { env_ref_str: String }, } impl MaybeSecretRef { - /// Determines whether a string is just a plain string or a reference to a secret. + /// Determines whether a string is just a plain string, a reference to a secret, + /// or a reference to a process environment variable. 
/// /// - Secret reference: any string which starts with `${secret:` and ends with `}` /// is regarded as an attempt to declare a secret reference + /// - Environment variable reference: any string which starts with `${env:` and ends with `}` + /// is regarded as an attempt to declare an environment variable reference /// - Plain string: any other string /// /// A secret reference must follow the following pattern: /// `${secret::}` /// - /// An error is returned if a string is regarded as a secret reference (see above), but: - /// - Specifies a `` which does not exist - /// - Specifies a `` which does not meet the provider-specific requirements + /// An environment variable reference must follow the following pattern: + /// `${env:}` + /// + /// An error is returned if a string is regarded as a secret or env var reference (see above), but: + /// - Specifies a `` which does not exist (for secret refs) + /// - Specifies a `` which does not meet the requirements /// /// Supported providers and their identifier expectations: /// - `${secret:kubernetes:/}` + /// - `${env:}` where `` follows POSIX env var naming rules /// - /// Note that here is not checked whether the secret reference can actually be resolved. + /// Note that here is not checked whether the reference can actually be resolved. pub fn new(value: String) -> Result { if value.starts_with("${secret:") && value.ends_with('}') { // Because the pattern only has ASCII characters, they are encoded as single bytes. @@ -191,6 +245,25 @@ impl MaybeSecretRef { secret_ref_str: value, }) } + } else if value.starts_with("${env:") && value.ends_with('}') { + // Environment variable reference: `${env:}` + // The content is extracted by slicing away the first 6 bytes ("${env:") and the last byte ("}"). 
+ let from_idx_incl = 6; + let till_idx_excl = value.len() - 1; + let name = value[from_idx_incl..till_idx_excl].to_string(); + if name.is_empty() { + Err(MaybeSecretRefParseError::EmptyEnvVarName { + env_ref_str: value, + }) + } else if let Err(e) = validate_env_var_name(&name) { + Err(MaybeSecretRefParseError::InvalidEnvVarName { + env_ref_str: value, + name, + e, + }) + } else { + Ok(MaybeSecretRef::SecretRef(SecretRef::EnvVar { name })) + } } else { Ok(MaybeSecretRef::String(value)) } @@ -213,8 +286,9 @@ impl Display for MaybeSecretRef { #[cfg(test)] mod tests { use super::{ - KubernetesSecretDataKeyParseError, KubernetesSecretNameParseError, MaybeSecretRef, - validate_kubernetes_secret_data_key, validate_kubernetes_secret_name, + EnvVarNameParseError, KubernetesSecretDataKeyParseError, KubernetesSecretNameParseError, + MaybeSecretRef, validate_env_var_name, validate_kubernetes_secret_data_key, + validate_kubernetes_secret_name, }; use super::{MaybeSecretRefParseError, SecretRef}; @@ -228,6 +302,12 @@ mod tests { }), "${secret:kubernetes:example/value}" ); + assert_eq!( + format!("{}", SecretRef::EnvVar { + name: "MY_VAR".to_string(), + }), + "${env:MY_VAR}" + ); } #[test] @@ -453,4 +533,58 @@ mod tests { assert_eq!(validate_kubernetes_secret_data_key(value), expectation); } } + + #[test] + #[rustfmt::skip] // Skip formatting to keep it short + fn env_var_name_validation() { + for (value, expectation) in vec![ + ("A", Ok(())), + ("a", Ok(())), + ("_", Ok(())), + ("A1", Ok(())), + ("MY_VAR", Ok(())), + ("_MY_VAR", Ok(())), + ("MY_VAR_123", Ok(())), + ("", Err(EnvVarNameParseError::Empty)), + ("1A", Err(EnvVarNameParseError::InvalidFormat)), + ("MY-VAR", Err(EnvVarNameParseError::InvalidFormat)), + ("MY VAR", Err(EnvVarNameParseError::InvalidFormat)), + ("MY.VAR", Err(EnvVarNameParseError::InvalidFormat)), + ] { + assert_eq!(validate_env_var_name(value), expectation); + } + } + + #[test] + #[rustfmt::skip] // Skip formatting to keep it short + fn 
maybe_secret_ref_parse_env_var() { + let values_and_expectations = vec![ + // Valid env var references + ("${env:A}", Ok(MaybeSecretRef::SecretRef(SecretRef::EnvVar { name: "A".to_string() }))), + ("${env:MY_VAR}", Ok(MaybeSecretRef::SecretRef(SecretRef::EnvVar { name: "MY_VAR".to_string() }))), + ("${env:_MY_VAR}", Ok(MaybeSecretRef::SecretRef(SecretRef::EnvVar { name: "_MY_VAR".to_string() }))), + ("${env:MY_VAR_123}", Ok(MaybeSecretRef::SecretRef(SecretRef::EnvVar { name: "MY_VAR_123".to_string() }))), + // Empty name + ("${env:}", Err(MaybeSecretRefParseError::EmptyEnvVarName { + env_ref_str: "${env:}".to_string() + })), + // Invalid name: starts with digit + ("${env:1VAR}", Err(MaybeSecretRefParseError::InvalidEnvVarName { + env_ref_str: "${env:1VAR}".to_string(), + name: "1VAR".to_string(), + e: EnvVarNameParseError::InvalidFormat + })), + // Invalid name: contains hyphen + ("${env:MY-VAR}", Err(MaybeSecretRefParseError::InvalidEnvVarName { + env_ref_str: "${env:MY-VAR}".to_string(), + name: "MY-VAR".to_string(), + e: EnvVarNameParseError::InvalidFormat + })), + // Not an env var reference (no closing brace match for opening pattern) + ("${env:", Ok(MaybeSecretRef::String("${env:".to_string()))), + // Plain strings that look similar but are not env var references + ("$env:MY_VAR}", Ok(MaybeSecretRef::String("$env:MY_VAR}".to_string()))), + ]; + test_values_and_expectations(values_and_expectations); + } } diff --git a/crates/feldera-types/src/secret_resolver.rs b/crates/feldera-types/src/secret_resolver.rs index cda301c194b..66cf07d9870 100644 --- a/crates/feldera-types/src/secret_resolver.rs +++ b/crates/feldera-types/src/secret_resolver.rs @@ -4,6 +4,7 @@ use serde::Serialize; use serde::de::DeserializeOwned; use serde_json::{Map, Value}; use std::collections::BTreeSet; +use std::env; use std::fmt::Debug; use std::fs; use std::io::ErrorKind; @@ -102,6 +103,10 @@ pub enum SecretRefResolutionError { path: String, error_kind: ErrorKind, }, + #[error( + 
"environment variable reference '{env_ref}' resolution failed: environment variable '{name}' is not set" + )] + EnvVarNotSet { env_ref: SecretRef, name: String }, #[error("secret resolution led to a duplicate key in the mapping, which should not happen")] DuplicateKeyInMapping, #[error("unable to serialize connector configuration: {error}")] @@ -171,7 +176,7 @@ fn resolve_secret_references_in_json( }) } -/// Resolves a string which can potentially be a secret reference. +/// Resolves a string which can potentially be a secret reference or an environment variable reference. fn resolve_potential_secret_reference_string( secrets_dir: &Path, s: String, @@ -179,8 +184,11 @@ fn resolve_potential_secret_reference_string( match MaybeSecretRef::new(s) { Ok(maybe_secret_ref) => match maybe_secret_ref { MaybeSecretRef::String(plain_str) => Ok(plain_str), - MaybeSecretRef::SecretRef(secret_ref) => match &secret_ref { - SecretRef::Kubernetes { name, data_key } => { + MaybeSecretRef::SecretRef(secret_ref) => match secret_ref { + SecretRef::Kubernetes { + ref name, + ref data_key, + } => { // Secret reference: `${secret:kubernetes:/}` // File location: `/kubernetes//` let path = Path::new(secrets_dir) @@ -224,6 +232,20 @@ fn resolve_potential_secret_reference_string( } } } + SecretRef::EnvVar { ref name } => { + // Environment variable reference: `${env:}` + // Resolved by reading the named environment variable from the process. 
+ let name = name.clone(); + match env::var(&name) { + Ok(value) => Ok(value), + Err(env::VarError::NotPresent) | Err(env::VarError::NotUnicode(_)) => { + Err(SecretRefResolutionError::EnvVarNotSet { + env_ref: secret_ref, + name, + }) + } + } + } }, }, Err(e) => Err(SecretRefResolutionError::MaybeSecretRefParseFailed { e }), @@ -565,4 +587,107 @@ mod tests { Some("${secret:kubernetes:e/f}".to_string()) ); } + + #[test] + fn resolve_env_var_success() { + // Set the environment variable + unsafe { + std::env::set_var("FELDERA_TEST_ENV_VAR_ABC123", "my_value"); + } + + let dir = tempfile::tempdir().unwrap(); + assert_eq!( + resolve_potential_secret_reference_string( + dir.path(), + "${env:FELDERA_TEST_ENV_VAR_ABC123}".to_string() + ) + .unwrap(), + "my_value" + ); + + unsafe { + std::env::remove_var("FELDERA_TEST_ENV_VAR_ABC123"); + } + } + + #[test] + fn resolve_env_var_not_set() { + let dir = tempfile::tempdir().unwrap(); + let env_ref_str = "${env:FELDERA_TEST_ENV_VAR_NOT_SET_XYZ}"; + unsafe { + std::env::remove_var("FELDERA_TEST_ENV_VAR_NOT_SET_XYZ"); + } + + let MaybeSecretRef::SecretRef(expected_ref) = + crate::secret_ref::MaybeSecretRef::new(env_ref_str.to_string()).unwrap() + else { + unreachable!(); + }; + + assert_eq!( + resolve_potential_secret_reference_string(dir.path(), env_ref_str.to_string()) + .unwrap_err(), + SecretRefResolutionError::EnvVarNotSet { + env_ref: expected_ref, + name: "FELDERA_TEST_ENV_VAR_NOT_SET_XYZ".to_string(), + } + ); + } + + #[test] + fn resolve_env_var_in_connector_config() { + unsafe { + std::env::set_var("FELDERA_TEST_CONN_VAR_A", "resolved_value_a"); + std::env::set_var("FELDERA_TEST_CONN_VAR_B", "resolved_value_b"); + } + + let connector_config_json = json!({ + "transport": { + "name": "datagen", + "config": { + "plan": [{ + "limit": 2, + "fields": { + "col1": { "values": [1, 2] }, + "col2": { "values": ["${env:FELDERA_TEST_CONN_VAR_A}", "${env:FELDERA_TEST_CONN_VAR_B}"] } + } + }] + } + }, + "format": { + "name": "json", + 
"config": { + "example": "${env:FELDERA_TEST_CONN_VAR_A}" + } + } + }); + + let connector_config: ConnectorConfig = + serde_json::from_value(connector_config_json).unwrap(); + + let dir = tempfile::tempdir().unwrap(); + let resolved = + resolve_secret_references_in_connector_config(dir.path(), &connector_config).unwrap(); + + let TransportConfig::Datagen(datagen_input_config) = resolved.transport else { + unreachable!(); + }; + assert_eq!( + datagen_input_config.plan[0].fields["col2"] + .values + .as_ref() + .unwrap(), + &vec![json!("resolved_value_a"), json!("resolved_value_b")] + ); + + let Some(format_config) = resolved.format else { + unreachable!(); + }; + assert_eq!(format_config.config, json!({"example": "resolved_value_a"})); + + unsafe { + std::env::remove_var("FELDERA_TEST_CONN_VAR_A"); + std::env::remove_var("FELDERA_TEST_CONN_VAR_B"); + } + } } From 8432965370562e208ea67742612b2cc51f4e4450 Mon Sep 17 00:00:00 2001 From: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> Date: Mon, 16 Mar 2026 14:29:51 +0000 Subject: [PATCH 02/19] docs: document environment variables resolver Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> [ci] apply automatic fixes Signed-off-by: feldera-bot apply suggestions from code review Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> --- crates/feldera-types/src/secret_ref.rs | 18 +-- .../docs/connectors/secret-references.md | 153 ++++++++++++------ 2 files changed, 109 insertions(+), 62 deletions(-) diff --git a/crates/feldera-types/src/secret_ref.rs b/crates/feldera-types/src/secret_ref.rs index b06ddd2b4e7..d7c082abb37 100644 --- a/crates/feldera-types/src/secret_ref.rs +++ b/crates/feldera-types/src/secret_ref.rs @@ -177,9 +177,7 @@ pub enum MaybeSecretRefParseError { name: String, e: EnvVarNameParseError, }, - #[error( - "environment variable reference '{env_ref_str}' is not valid: name cannot be empty" - )] + #[error("environment variable reference 
'{env_ref_str}' is not valid: name cannot be empty")] EmptyEnvVarName { env_ref_str: String }, } @@ -209,6 +207,7 @@ impl MaybeSecretRef { /// /// Note that here is not checked whether the reference can actually be resolved. pub fn new(value: String) -> Result { + let env_prefix = "${env:"; if value.starts_with("${secret:") && value.ends_with('}') { // Because the pattern only has ASCII characters, they are encoded as single bytes. // The secret reference is extracted by slicing away the first 9 bytes and the last byte. @@ -245,16 +244,15 @@ impl MaybeSecretRef { secret_ref_str: value, }) } - } else if value.starts_with("${env:") && value.ends_with('}') { + } else if value.starts_with(env_prefix) && value.ends_with('}') { // Environment variable reference: `${env:}` // The content is extracted by slicing away the first 6 bytes ("${env:") and the last byte ("}"). - let from_idx_incl = 6; - let till_idx_excl = value.len() - 1; - let name = value[from_idx_incl..till_idx_excl].to_string(); + let name = value + .trim_start_matches(env_prefix) + .trim_end_matches("}") + .to_string(); if name.is_empty() { - Err(MaybeSecretRefParseError::EmptyEnvVarName { - env_ref_str: value, - }) + Err(MaybeSecretRefParseError::EmptyEnvVarName { env_ref_str: value }) } else if let Err(e) = validate_env_var_name(&name) { Err(MaybeSecretRefParseError::InvalidEnvVarName { env_ref_str: value, diff --git a/docs.feldera.com/docs/connectors/secret-references.md b/docs.feldera.com/docs/connectors/secret-references.md index 420efa38a64..9a4d98299a9 100644 --- a/docs.feldera.com/docs/connectors/secret-references.md +++ b/docs.feldera.com/docs/connectors/secret-references.md @@ -4,19 +4,28 @@ Rather than directly supplying a secret (e.g., passwords, PEM, etc.) in the conn configuration as a string, it is possible to refer to (externalize) them. This mechanism in Feldera is called a **secret reference**. 
-A secret reference is a string in the connector configuration JSON which takes a specific format: +Feldera supports two types of references in connector configuration strings: -``` -${secret::} -``` +- **Secret references** — resolved from an external secret provider (e.g., Kubernetes): + ``` + ${secret::} + ``` It refers to an identifiable secret provided by a provider. Feldera's control plane mounts the secret into the pipeline. When the pipeline initializes, it will replace the secret references in the configuration with their values. We currently only support a single secret provider, Kubernetes. -Feldera resolves secrets when a pipeline starts, as well as each time -it resumes. Feldera does not write resolved values of secrets to -checkpoints or journals. +- **Environment variable references** — resolved from the pipeline process environment: + ``` + ${env:} + ``` + +When the pipeline initializes, it replaces all references in the connector configuration +with their resolved values. Feldera resolves references when a pipeline starts, as well as +each time it resumes. Feldera does not write resolved values to checkpoints or journals. + +Use environment variables for non-sensitive deployment configuration only. +Storing secrets in environment variables is generally discouraged; use a dedicated secret manager or secret store instead. ## Kubernetes @@ -80,7 +89,47 @@ We can then specify a connector configuration that refers to it using `${secret: } ``` -## Restrictions +## Environment variables + +### Usage + +``` +${env:} +``` + +Here, `` is the name of an environment variable following POSIX naming +rules (letters, digits, and underscores, must start with a letter or underscore). + +The reference is resolved at pipeline startup by reading the named variable from the pipeline +process environment. 
This is useful for injecting configuration values (e.g., hostnames, +credentials) via environment variables set in the `env` field of +[`RuntimeConfig`](/api/patch-pipeline#body-runtime_config) or through the deployment environment. + +### Example + +```json +{ + "transport": { + "name": "kafka_input", + "config": { + "bootstrap.servers": "${env:KAFKA_BOOTSTRAP_SERVERS}", + "sasl.password": "${env:KAFKA_SASL_PASSWORD}" + } + }, + "format": ... +} +``` + +### Restrictions + +- The environment variable name must follow POSIX rules: only letters (`a`–`z`, `A`–`Z`), + digits (`0`–`9`), and underscores (`_`), and must start with a letter or underscore +- If the referenced environment variable is not set when the pipeline starts, the pipeline + will fail to initialize with an error +- It is not possible to have string values starting with `${env:` and ending with `}` + without them being identified as an environment variable reference + +## Restrictions (secret references) - The secret name may only contain lowercase alphanumeric characters or hyphens, must start and end with a lowercase alphanumeric character and can be at most 63 characters long @@ -89,51 +138,51 @@ We can then specify a connector configuration that refers to it using `${secret: - It is not possible to have any plain string value which starts with `${secret:` and ends with `}` without it being identified to be a secret reference. 
- Only string values in the connector configuration JSON under `transport.config` and `format.config` - can be identified to be secret references (this excludes keys), for example (secret named `a` at - data key `b` has value `value`): - ``` - { - "transport": { - "name": "some_transport", - "config": { - "${secret:kubernetes:a/b}": "${secret:kubernetes:a/b}", - "v1": "${secret:kubernetes:a/b}", - "v2": [ "${secret:kubernetes:a/b}" ] - } - }, - "format": { - "name": "some_format", - "config": { - "v3": "${secret:kubernetes:a/b}" - } - }, - "index": "${secret:kubernetes:a/b}" - } - ``` - ... will be resolved to: - ``` - { - "transport": { - "name": "some_transport", - "config": { - "${secret:kubernetes:a/b}": "value", - "v1": "value", - "v2": [ "value" ] - } - }, - "format": { - "name": "some_format", - "config": { - "v3": "value" - } - }, - "index": "${secret:kubernetes:a/b}" - } - ``` + can be identified to be secret or environment variable references (this excludes keys), for example + (secret named `a` at data key `b` has value `value`): + ``` + { + "transport": { + "name": "some_transport", + "config": { + "${secret:kubernetes:a/b}": "${secret:kubernetes:a/b}", + "v1": "${secret:kubernetes:a/b}", + "v2": [ "${secret:kubernetes:a/b}" ] + } + }, + "format": { + "name": "some_format", + "config": { + "v3": "${secret:kubernetes:a/b}" + } + }, + "index": "${secret:kubernetes:a/b}" + } + ``` + ... 
will be resolved to: + ``` + { + "transport": { + "name": "some_transport", + "config": { + "${secret:kubernetes:a/b}": "value", + "v1": "value", + "v2": [ "value" ] + } + }, + "format": { + "name": "some_format", + "config": { + "v3": "value" + } + }, + "index": "${secret:kubernetes:a/b}" + } + ``` - Because connector configuration is validated during SQL compilation without secret resolution, string values that require certain format for the connector configuration to be valid will not allow secret references (enumerations in particular, such as for the datagen connector `strategy` field) -- It is not possible to specify a secret value type other than string -- It is not possible to specify a secret as a substring, for example - `abc${secret:kubernetes:a/b}def` does not work +- It is not possible to specify a reference value type other than string +- It is not possible to specify a reference as a substring, for example + `abc${secret:kubernetes:a/b}def` and `abc${env:MY_VAR}def` do not work From 6af3f1b3dc9c1cbdda0ac0025d2745a8dcb6bb83 Mon Sep 17 00:00:00 2001 From: feldera-bot Date: Wed, 8 Apr 2026 17:09:43 +0000 Subject: [PATCH 03/19] [ci] apply automatic fixes Signed-off-by: feldera-bot --- docs.feldera.com/docs/connectors/secret-references.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs.feldera.com/docs/connectors/secret-references.md b/docs.feldera.com/docs/connectors/secret-references.md index 9a4d98299a9..d73d06ea365 100644 --- a/docs.feldera.com/docs/connectors/secret-references.md +++ b/docs.feldera.com/docs/connectors/secret-references.md @@ -24,7 +24,7 @@ When the pipeline initializes, it replaces all references in the connector confi with their resolved values. Feldera resolves references when a pipeline starts, as well as each time it resumes. Feldera does not write resolved values to checkpoints or journals. -Use environment variables for non-sensitive deployment configuration only. 
+Use environment variables for non-sensitive deployment configuration only. Storing secrets in environment variables is generally discouraged; use a dedicated secret manager or secret store instead. ## Kubernetes From 65eef448db06ca32bf68ca2740116a1ff366004e Mon Sep 17 00:00:00 2001 From: "release-feldera-feldera[bot]" Date: Thu, 9 Apr 2026 08:07:45 +0000 Subject: [PATCH 04/19] ci: Prepare for v0.287.0 --- Cargo.lock | 40 ++++++++++++++++++++-------------------- Cargo.toml | 24 ++++++++++++------------ openapi.json | 2 +- python/pyproject.toml | 2 +- python/uv.lock | 4 ++-- 5 files changed, 36 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4a43b6b1e66..b4ca54aa7f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3750,7 +3750,7 @@ dependencies = [ [[package]] name = "dbsp" -version = "0.286.0" +version = "0.287.0" dependencies = [ "anyhow", "arc-swap", @@ -3838,7 +3838,7 @@ dependencies = [ [[package]] name = "dbsp_adapters" -version = "0.286.0" +version = "0.287.0" dependencies = [ "actix", "actix-codec", @@ -3975,7 +3975,7 @@ dependencies = [ [[package]] name = "dbsp_nexmark" -version = "0.286.0" +version = "0.287.0" dependencies = [ "anyhow", "ascii_table", @@ -4850,7 +4850,7 @@ dependencies = [ [[package]] name = "fda" -version = "0.286.0" +version = "0.287.0" dependencies = [ "anyhow", "arrow", @@ -4902,7 +4902,7 @@ dependencies = [ [[package]] name = "feldera-adapterlib" -version = "0.286.0" +version = "0.287.0" dependencies = [ "actix-web", "anyhow", @@ -4933,7 +4933,7 @@ dependencies = [ [[package]] name = "feldera-buffer-cache" -version = "0.286.0" +version = "0.287.0" dependencies = [ "crossbeam-utils", "enum-map", @@ -4961,7 +4961,7 @@ dependencies = [ [[package]] name = "feldera-datagen" -version = "0.286.0" +version = "0.287.0" dependencies = [ "anyhow", "async-channel 2.5.0", @@ -4987,7 +4987,7 @@ dependencies = [ [[package]] name = "feldera-fxp" -version = "0.286.0" +version = "0.287.0" dependencies = [ "bytecheck", "dbsp", @@ 
-5007,7 +5007,7 @@ dependencies = [ [[package]] name = "feldera-iceberg" -version = "0.286.0" +version = "0.287.0" dependencies = [ "anyhow", "chrono", @@ -5027,7 +5027,7 @@ dependencies = [ [[package]] name = "feldera-ir" -version = "0.286.0" +version = "0.287.0" dependencies = [ "proptest", "proptest-derive", @@ -5039,7 +5039,7 @@ dependencies = [ [[package]] name = "feldera-macros" -version = "0.286.0" +version = "0.287.0" dependencies = [ "prettyplease", "proc-macro2", @@ -5049,7 +5049,7 @@ dependencies = [ [[package]] name = "feldera-observability" -version = "0.286.0" +version = "0.287.0" dependencies = [ "actix-http", "awc", @@ -5064,7 +5064,7 @@ dependencies = [ [[package]] name = "feldera-rest-api" -version = "0.286.0" +version = "0.287.0" dependencies = [ "chrono", "feldera-observability", @@ -5098,7 +5098,7 @@ dependencies = [ [[package]] name = "feldera-sqllib" -version = "0.286.0" +version = "0.287.0" dependencies = [ "arcstr", "base58", @@ -5141,7 +5141,7 @@ dependencies = [ [[package]] name = "feldera-storage" -version = "0.286.0" +version = "0.287.0" dependencies = [ "anyhow", "crossbeam", @@ -5164,7 +5164,7 @@ dependencies = [ [[package]] name = "feldera-types" -version = "0.286.0" +version = "0.287.0" dependencies = [ "actix-web", "anyhow", @@ -8074,7 +8074,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pipeline-manager" -version = "0.286.0" +version = "0.287.0" dependencies = [ "actix-cors", "actix-files", @@ -9168,7 +9168,7 @@ dependencies = [ [[package]] name = "readers" -version = "0.286.0" +version = "0.287.0" dependencies = [ "async-std", "csv", @@ -10744,7 +10744,7 @@ dependencies = [ [[package]] name = "sltsqlvalue" -version = "0.286.0" +version = "0.287.0" dependencies = [ "dbsp", "feldera-sqllib", @@ -11047,7 +11047,7 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "storage-test-compat" -version = "0.286.0" +version = "0.287.0" 
dependencies = [ "dbsp", "derive_more 1.0.0", diff --git a/Cargo.toml b/Cargo.toml index 233ee490bf4..6ba7adb024a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [workspace.package] authors = ["Feldera Team "] -version = "0.286.0" +version = "0.287.0" license = "MIT OR Apache-2.0" homepage = "https://github.com/feldera/feldera" repository = "https://github.com/feldera/feldera" @@ -102,7 +102,7 @@ csv = "1.2.2" csv-core = "0.1.10" dashmap = "6.1.0" datafusion = "51.0" -dbsp = { path = "crates/dbsp", version = "0.286.0" } +dbsp = { path = "crates/dbsp", version = "0.287.0" } dbsp_nexmark = { path = "crates/nexmark" } deadpool-postgres = "0.14.1" #deltalake = "0.30.2" @@ -122,19 +122,19 @@ erased-serde = "0.3.31" fake = "2.10" fastbloom = "0.14.0" fdlimit = "0.3.0" -feldera-buffer-cache = { version = "0.286.0", path = "crates/buffer-cache" } +feldera-buffer-cache = { version = "0.287.0", path = "crates/buffer-cache" } feldera-cloud1-client = "0.1.2" feldera-datagen = { path = "crates/datagen" } -feldera-fxp = { version = "0.286.0", path = "crates/fxp", features = ["dbsp"] } +feldera-fxp = { version = "0.287.0", path = "crates/fxp", features = ["dbsp"] } feldera-iceberg = { path = "crates/iceberg" } -feldera-observability = { version = "0.286.0", path = "crates/feldera-observability" } -feldera-macros = { version = "0.286.0", path = "crates/feldera-macros" } -feldera-sqllib = { version = "0.286.0", path = "crates/sqllib" } -feldera-storage = { version = "0.286.0", path = "crates/storage" } -feldera-types = { version = "0.286.0", path = "crates/feldera-types" } -feldera-rest-api = { version = "0.286.0", path = "crates/rest-api" } -feldera-ir = { version = "0.286.0", path = "crates/ir" } -feldera-adapterlib = { version = "0.286.0", path = "crates/adapterlib" } +feldera-observability = { version = "0.287.0", path = "crates/feldera-observability" } +feldera-macros = { version = "0.287.0", path = "crates/feldera-macros" } +feldera-sqllib = { version = "0.287.0", 
path = "crates/sqllib" } +feldera-storage = { version = "0.287.0", path = "crates/storage" } +feldera-types = { version = "0.287.0", path = "crates/feldera-types" } +feldera-rest-api = { version = "0.287.0", path = "crates/rest-api" } +feldera-ir = { version = "0.287.0", path = "crates/ir" } +feldera-adapterlib = { version = "0.287.0", path = "crates/adapterlib" } flate2 = "1.1.0" form_urlencoded = "1.2.0" futures = "0.3.30" diff --git a/openapi.json b/openapi.json index b571ec3be9f..f5407f2a25e 100644 --- a/openapi.json +++ b/openapi.json @@ -10,7 +10,7 @@ "license": { "name": "MIT OR Apache-2.0" }, - "version": "0.286.0" + "version": "0.287.0" }, "paths": { "/config/authentication": { diff --git a/python/pyproject.toml b/python/pyproject.toml index 09a9b7f06ab..d36c73d36e7 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "feldera" readme = "README.md" description = "The feldera python client" -version = "0.286.0" +version = "0.287.0" license = "MIT" requires-python = ">=3.10" authors = [ diff --git a/python/uv.lock b/python/uv.lock index 4ddba81204f..cb5547411af 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -12,7 +12,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-04-01T21:26:07.053428Z" +exclude-newer = "2026-04-02T08:07:18.404331674Z" exclude-newer-span = "P1W" [[package]] @@ -221,7 +221,7 @@ wheels = [ [[package]] name = "feldera" -version = "0.286.0" +version = "0.287.0" source = { editable = "." 
} dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, From d23c62753bf5f054622cbd1568bd495cf6acff77 Mon Sep 17 00:00:00 2001 From: Mattias Matthiesen Date: Fri, 13 Mar 2026 07:50:10 +0100 Subject: [PATCH 05/19] [python] Add optional arrow dependency and installation docs Signed-off-by: Mattias Matthiesen --- python/README.md | 6 ++++ python/pyproject.toml | 9 +++++- python/uv.lock | 66 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 1 deletion(-) diff --git a/python/README.md b/python/README.md index 152765dd2e6..0693a254194 100644 --- a/python/README.md +++ b/python/README.md @@ -12,6 +12,12 @@ The Python SDK documentation is available at: https://docs.feldera.com/python uv pip install feldera ``` +For Arrow IPC query support, install the optional Arrow extra: + +```bash +uv pip install 'feldera[arrow]' +``` + ### Example usage The Python client interacts with the API server of the Feldera instance. 
diff --git a/python/pyproject.toml b/python/pyproject.toml index d36c73d36e7..bdea1193c4f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -29,6 +29,12 @@ dependencies = [ "ruff>=0.6.9", "PyJWT>=2.12.0", ] + +[project.optional-dependencies] +arrow = [ + "pyarrow>=14.0", +] + [project.urls] Homepage = "https://www.feldera.com" Documentation = "https://docs.feldera.com/python" @@ -43,7 +49,8 @@ dev = [ "sphinx-rtd-theme==2.0.0", "sphinx==7.3.7", "simplejson==3.20.1", - "confluent-kafka>=2.2.0" + "confluent-kafka>=2.2.0", + "pyarrow>=14.0", ] [tool.pytest.ini_options] diff --git a/python/uv.lock b/python/uv.lock index cb5547411af..aeb987e2d87 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -235,9 +235,15 @@ dependencies = [ { name = "typing-extensions" }, ] +[package.optional-dependencies] +arrow = [ + { name = "pyarrow" }, +] + [package.dev-dependencies] dev = [ { name = "confluent-kafka" }, + { name = "pyarrow" }, { name = "pytest" }, { name = "pytest-timeout" }, { name = "pytest-xdist" }, @@ -251,15 +257,18 @@ requires-dist = [ { name = "numpy", specifier = ">=2.2.4" }, { name = "pandas", specifier = ">=2.1.2" }, { name = "pretty-errors" }, + { name = "pyarrow", marker = "extra == 'arrow'", specifier = ">=14.0" }, { name = "pyjwt", specifier = ">=2.12.0" }, { name = "requests" }, { name = "ruff", specifier = ">=0.6.9" }, { name = "typing-extensions" }, ] +provides-extras = ["arrow"] [package.metadata.requires-dev] dev = [ { name = "confluent-kafka", specifier = ">=2.2.0" }, + { name = "pyarrow", specifier = ">=14.0" }, { name = "pytest", specifier = ">=8.3.5" }, { name = "pytest-timeout", specifier = ">=2.3.1" }, { name = "pytest-xdist", specifier = ">=3.8.0" }, @@ -706,6 +715,63 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6d/8e/2df7467a15eae40e26c476683962fdb810cd1b36676603e2f139b4abbeaf/pretty_errors-1.2.25-py3-none-any.whl", hash = "sha256:8ce68ccd99e0f2a099265c8c1f1c23b7c60a15d69bb08816cb336e237d5dc983", size = 17195, 
upload-time = "2021-11-24T14:32:07.762Z" }, ] +[[package]] +name = "pyarrow" +version = "23.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/a8/24e5dc6855f50a62936ceb004e6e9645e4219a8065f304145d7fb8a79d5d/pyarrow-23.0.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:3fab8f82571844eb3c460f90a75583801d14ca0cc32b1acc8c361650e006fd56", size = 34307390, upload-time = "2026-02-16T10:08:08.654Z" }, + { url = "https://files.pythonhosted.org/packages/bc/8e/4be5617b4aaae0287f621ad31c6036e5f63118cfca0dc57d42121ff49b51/pyarrow-23.0.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:3f91c038b95f71ddfc865f11d5876c42f343b4495535bd262c7b321b0b94507c", size = 35853761, upload-time = "2026-02-16T10:08:17.811Z" }, + { url = "https://files.pythonhosted.org/packages/2e/08/3e56a18819462210432ae37d10f5c8eed3828be1d6c751b6e6a2e93c286a/pyarrow-23.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:d0744403adabef53c985a7f8a082b502a368510c40d184df349a0a8754533258", size = 44493116, upload-time = "2026-02-16T10:08:25.792Z" }, + { url = "https://files.pythonhosted.org/packages/f8/82/c40b68001dbec8a3faa4c08cd8c200798ac732d2854537c5449dc859f55a/pyarrow-23.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:c33b5bf406284fd0bba436ed6f6c3ebe8e311722b441d89397c54f871c6863a2", size = 47564532, upload-time = "2026-02-16T10:08:34.27Z" }, + { url = "https://files.pythonhosted.org/packages/20/bc/73f611989116b6f53347581b02177f9f620efdf3cd3f405d0e83cdf53a83/pyarrow-23.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ddf743e82f69dcd6dbbcb63628895d7161e04e56794ef80550ac6f3315eeb1d5", size = 48183685, upload-time = 
"2026-02-16T10:08:42.889Z" }, + { url = "https://files.pythonhosted.org/packages/b0/cc/6c6b3ecdae2a8c3aced99956187e8302fc954cc2cca2a37cf2111dad16ce/pyarrow-23.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e052a211c5ac9848ae15d5ec875ed0943c0221e2fcfe69eee80b604b4e703222", size = 50605582, upload-time = "2026-02-16T10:08:51.641Z" }, + { url = "https://files.pythonhosted.org/packages/8d/94/d359e708672878d7638a04a0448edf7c707f9e5606cee11e15aaa5c7535a/pyarrow-23.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:5abde149bb3ce524782d838eb67ac095cd3fd6090eba051130589793f1a7f76d", size = 27521148, upload-time = "2026-02-16T10:08:58.077Z" }, + { url = "https://files.pythonhosted.org/packages/b0/41/8e6b6ef7e225d4ceead8459427a52afdc23379768f54dd3566014d7618c1/pyarrow-23.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6f0147ee9e0386f519c952cc670eb4a8b05caa594eeffe01af0e25f699e4e9bb", size = 34302230, upload-time = "2026-02-16T10:09:03.859Z" }, + { url = "https://files.pythonhosted.org/packages/bf/4a/1472c00392f521fea03ae93408bf445cc7bfa1ab81683faf9bc188e36629/pyarrow-23.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:0ae6e17c828455b6265d590100c295193f93cc5675eb0af59e49dbd00d2de350", size = 35850050, upload-time = "2026-02-16T10:09:11.877Z" }, + { url = "https://files.pythonhosted.org/packages/0c/b2/bd1f2f05ded56af7f54d702c8364c9c43cd6abb91b0e9933f3d77b4f4132/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fed7020203e9ef273360b9e45be52a2a47d3103caf156a30ace5247ffb51bdbd", size = 44491918, upload-time = "2026-02-16T10:09:18.144Z" }, + { url = "https://files.pythonhosted.org/packages/0b/62/96459ef5b67957eac38a90f541d1c28833d1b367f014a482cb63f3b7cd2d/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:26d50dee49d741ac0e82185033488d28d35be4d763ae6f321f97d1140eb7a0e9", size = 47562811, upload-time = "2026-02-16T10:09:25.792Z" }, + { url = 
"https://files.pythonhosted.org/packages/7d/94/1170e235add1f5f45a954e26cd0e906e7e74e23392dcb560de471f7366ec/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c30143b17161310f151f4a2bcfe41b5ff744238c1039338779424e38579d701", size = 48183766, upload-time = "2026-02-16T10:09:34.645Z" }, + { url = "https://files.pythonhosted.org/packages/0e/2d/39a42af4570377b99774cdb47f63ee6c7da7616bd55b3d5001aa18edfe4f/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db2190fa79c80a23fdd29fef4b8992893f024ae7c17d2f5f4db7171fa30c2c78", size = 50607669, upload-time = "2026-02-16T10:09:44.153Z" }, + { url = "https://files.pythonhosted.org/packages/00/ca/db94101c187f3df742133ac837e93b1f269ebdac49427f8310ee40b6a58f/pyarrow-23.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:f00f993a8179e0e1c9713bcc0baf6d6c01326a406a9c23495ec1ba9c9ebf2919", size = 27527698, upload-time = "2026-02-16T10:09:50.263Z" }, + { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" }, + { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" }, + { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" }, + { url = 
"https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" }, + { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = "2026-02-16T10:10:25.363Z" }, + { url = "https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" }, + { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" }, + { url = "https://files.pythonhosted.org/packages/47/10/2cbe4c6f0fb83d2de37249567373d64327a5e4d8db72f486db42875b08f6/pyarrow-23.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6b8fda694640b00e8af3c824f99f789e836720aa8c9379fb435d4c4953a756b8", size = 34210066, upload-time = "2026-02-16T10:10:45.487Z" }, + { url = "https://files.pythonhosted.org/packages/cb/4f/679fa7e84dadbaca7a65f7cdba8d6c83febbd93ca12fa4adf40ba3b6362b/pyarrow-23.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:8ff51b1addc469b9444b7c6f3548e19dc931b172ab234e995a60aea9f6e6025f", size = 35825526, upload-time = "2026-02-16T10:10:52.266Z" }, + { url = 
"https://files.pythonhosted.org/packages/f9/63/d2747d930882c9d661e9398eefc54f15696547b8983aaaf11d4a2e8b5426/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:71c5be5cbf1e1cb6169d2a0980850bccb558ddc9b747b6206435313c47c37677", size = 44473279, upload-time = "2026-02-16T10:11:01.557Z" }, + { url = "https://files.pythonhosted.org/packages/b3/93/10a48b5e238de6d562a411af6467e71e7aedbc9b87f8d3a35f1560ae30fb/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9b6f4f17b43bc39d56fec96e53fe89d94bac3eb134137964371b45352d40d0c2", size = 47585798, upload-time = "2026-02-16T10:11:09.401Z" }, + { url = "https://files.pythonhosted.org/packages/5c/20/476943001c54ef078dbf9542280e22741219a184a0632862bca4feccd666/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fc13fc6c403d1337acab46a2c4346ca6c9dec5780c3c697cf8abfd5e19b6b37", size = 48179446, upload-time = "2026-02-16T10:11:17.781Z" }, + { url = "https://files.pythonhosted.org/packages/4b/b6/5dd0c47b335fcd8edba9bfab78ad961bd0fd55ebe53468cc393f45e0be60/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5c16ed4f53247fa3ffb12a14d236de4213a4415d127fe9cebed33d51671113e2", size = 50623972, upload-time = "2026-02-16T10:11:26.185Z" }, + { url = "https://files.pythonhosted.org/packages/d5/09/a532297c9591a727d67760e2e756b83905dd89adb365a7f6e9c72578bcc1/pyarrow-23.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:cecfb12ef629cf6be0b1887f9f86463b0dd3dc3195ae6224e74006be4736035a", size = 27540749, upload-time = "2026-02-16T10:12:23.297Z" }, + { url = "https://files.pythonhosted.org/packages/a5/8e/38749c4b1303e6ae76b3c80618f84861ae0c55dd3c2273842ea6f8258233/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:29f7f7419a0e30264ea261fdc0e5fe63ce5a6095003db2945d7cd78df391a7e1", size = 34471544, upload-time = "2026-02-16T10:11:32.535Z" }, + { url = 
"https://files.pythonhosted.org/packages/a3/73/f237b2bc8c669212f842bcfd842b04fc8d936bfc9d471630569132dc920d/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:33d648dc25b51fd8055c19e4261e813dfc4d2427f068bcecc8b53d01b81b0500", size = 35949911, upload-time = "2026-02-16T10:11:39.813Z" }, + { url = "https://files.pythonhosted.org/packages/0c/86/b912195eee0903b5611bf596833def7d146ab2d301afeb4b722c57ffc966/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd395abf8f91c673dd3589cadc8cc1ee4e8674fa61b2e923c8dd215d9c7d1f41", size = 44520337, upload-time = "2026-02-16T10:11:47.764Z" }, + { url = "https://files.pythonhosted.org/packages/69/c2/f2a717fb824f62d0be952ea724b4f6f9372a17eed6f704b5c9526f12f2f1/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:00be9576d970c31defb5c32eb72ef585bf600ef6d0a82d5eccaae96639cf9d07", size = 47548944, upload-time = "2026-02-16T10:11:56.607Z" }, + { url = "https://files.pythonhosted.org/packages/84/a7/90007d476b9f0dc308e3bc57b832d004f848fd6c0da601375d20d92d1519/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c2139549494445609f35a5cda4eb94e2c9e4d704ce60a095b342f82460c73a83", size = 48236269, upload-time = "2026-02-16T10:12:04.47Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3f/b16fab3e77709856eb6ac328ce35f57a6d4a18462c7ca5186ef31b45e0e0/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7044b442f184d84e2351e5084600f0d7343d6117aabcbc1ac78eb1ae11eb4125", size = 50604794, upload-time = "2026-02-16T10:12:11.797Z" }, + { url = "https://files.pythonhosted.org/packages/e9/a1/22df0620a9fac31d68397a75465c344e83c3dfe521f7612aea33e27ab6c0/pyarrow-23.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a35581e856a2fafa12f3f54fce4331862b1cfb0bef5758347a858a4aa9d6bae8", size = 27660642, upload-time = "2026-02-16T10:12:17.746Z" }, + { url = 
"https://files.pythonhosted.org/packages/8d/1b/6da9a89583ce7b23ac611f183ae4843cd3a6cf54f079549b0e8c14031e73/pyarrow-23.0.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:5df1161da23636a70838099d4aaa65142777185cc0cdba4037a18cee7d8db9ca", size = 34238755, upload-time = "2026-02-16T10:12:32.819Z" }, + { url = "https://files.pythonhosted.org/packages/ae/b5/d58a241fbe324dbaeb8df07be6af8752c846192d78d2272e551098f74e88/pyarrow-23.0.1-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:fa8e51cb04b9f8c9c5ace6bab63af9a1f88d35c0d6cbf53e8c17c098552285e1", size = 35847826, upload-time = "2026-02-16T10:12:38.949Z" }, + { url = "https://files.pythonhosted.org/packages/54/a5/8cbc83f04aba433ca7b331b38f39e000efd9f0c7ce47128670e737542996/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:0b95a3994f015be13c63148fef8832e8a23938128c185ee951c98908a696e0eb", size = 44536859, upload-time = "2026-02-16T10:12:45.467Z" }, + { url = "https://files.pythonhosted.org/packages/36/2e/c0f017c405fcdc252dbccafbe05e36b0d0eb1ea9a958f081e01c6972927f/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:4982d71350b1a6e5cfe1af742c53dfb759b11ce14141870d05d9e540d13bc5d1", size = 47614443, upload-time = "2026-02-16T10:12:55.525Z" }, + { url = "https://files.pythonhosted.org/packages/af/6b/2314a78057912f5627afa13ba43809d9d653e6630859618b0fd81a4e0759/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c250248f1fe266db627921c89b47b7c06fee0489ad95b04d50353537d74d6886", size = 48232991, upload-time = "2026-02-16T10:13:04.729Z" }, + { url = "https://files.pythonhosted.org/packages/40/f2/1bcb1d3be3460832ef3370d621142216e15a2c7c62602a4ea19ec240dd64/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5f4763b83c11c16e5f4c15601ba6dfa849e20723b46aa2617cb4bffe8768479f", size = 50645077, upload-time = "2026-02-16T10:13:14.147Z" }, + { url = 
"https://files.pythonhosted.org/packages/eb/3f/b1da7b61cd66566a4d4c8383d376c606d1c34a906c3f1cb35c479f59d1aa/pyarrow-23.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:3a4c85ef66c134161987c17b147d6bffdca4566f9a4c1d81a0a01cdf08414ea5", size = 28234271, upload-time = "2026-02-16T10:14:09.397Z" }, + { url = "https://files.pythonhosted.org/packages/b5/78/07f67434e910a0f7323269be7bfbf58699bd0c1d080b18a1ab49ba943fe8/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:17cd28e906c18af486a499422740298c52d7c6795344ea5002a7720b4eadf16d", size = 34488692, upload-time = "2026-02-16T10:13:21.541Z" }, + { url = "https://files.pythonhosted.org/packages/50/76/34cf7ae93ece1f740a04910d9f7e80ba166b9b4ab9596a953e9e62b90fe1/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:76e823d0e86b4fb5e1cf4a58d293036e678b5a4b03539be933d3b31f9406859f", size = 35964383, upload-time = "2026-02-16T10:13:28.63Z" }, + { url = "https://files.pythonhosted.org/packages/46/90/459b827238936d4244214be7c684e1b366a63f8c78c380807ae25ed92199/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a62e1899e3078bf65943078b3ad2a6ddcacf2373bc06379aac61b1e548a75814", size = 44538119, upload-time = "2026-02-16T10:13:35.506Z" }, + { url = "https://files.pythonhosted.org/packages/28/a1/93a71ae5881e99d1f9de1d4554a87be37da11cd6b152239fb5bd924fdc64/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:df088e8f640c9fae3b1f495b3c64755c4e719091caf250f3a74d095ddf3c836d", size = 47571199, upload-time = "2026-02-16T10:13:42.504Z" }, + { url = "https://files.pythonhosted.org/packages/88/a3/d2c462d4ef313521eaf2eff04d204ac60775263f1fb08c374b543f79f610/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:46718a220d64677c93bc243af1d44b55998255427588e400677d7192671845c7", size = 48259435, upload-time = "2026-02-16T10:13:49.226Z" }, + { url = 
"https://files.pythonhosted.org/packages/cc/f1/11a544b8c3d38a759eb3fbb022039117fd633e9a7b19e4841cc3da091915/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a09f3876e87f48bc2f13583ab551f0379e5dfb83210391e68ace404181a20690", size = 50629149, upload-time = "2026-02-16T10:13:57.238Z" }, + { url = "https://files.pythonhosted.org/packages/50/f2/c0e76a0b451ffdf0cf788932e182758eb7558953f4f27f1aff8e2518b653/pyarrow-23.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:527e8d899f14bd15b740cd5a54ad56b7f98044955373a17179d5956ddb93d9ce", size = 28365807, upload-time = "2026-02-16T10:14:03.892Z" }, +] + [[package]] name = "pygments" version = "2.19.2" From c4c0fe26fb4b7b5da088045f50d20fa5b0351a48 Mon Sep 17 00:00:00 2001 From: Mattias Matthiesen Date: Fri, 13 Mar 2026 07:50:10 +0100 Subject: [PATCH 06/19] [python] Add Arrow IPC query API to client and pipeline Signed-off-by: Mattias Matthiesen --- python/feldera/pipeline.py | 33 ++++++++++++++++++- python/feldera/rest/feldera_client.py | 47 ++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/python/feldera/pipeline.py b/python/feldera/pipeline.py index 19ce9eb46a4..0f5ef098043 100644 --- a/python/feldera/pipeline.py +++ b/python/feldera/pipeline.py @@ -4,7 +4,16 @@ from collections import deque from datetime import datetime from threading import Event -from typing import Any, Callable, Dict, Generator, List, Mapping, Optional +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Generator, + List, + Mapping, + Optional, +) from uuid import UUID import pandas @@ -36,6 +45,9 @@ from feldera.stats import InputEndpointStatus, OutputEndpointStatus, PipelineStatistics from feldera.types import CheckpointMetadata +if TYPE_CHECKING: + import pyarrow as pa + class Pipeline: def __init__(self, client: FelderaClient): @@ -977,6 +989,25 @@ def query_parquet(self, query: str, path: str): self.client.query_as_parquet(self.name, query, path) + def query_arrow(self, query: 
str) -> Generator["pa.RecordBatch", None, None]: + """ + Executes an ad-hoc SQL query on this pipeline and returns a generator + that yields the result as PyArrow RecordBatches. + + Note: + You can only ``SELECT`` from materialized tables and views. + + :param query: The SQL query to be executed. + + :raises FelderaAPIError: If the pipeline is not in a RUNNING or PAUSED + state. + :raises FelderaAPIError: If querying a non materialized table or view. + :raises FelderaAPIError: If the query is invalid. + + :return: A generator that yields ``pyarrow.RecordBatch`` objects. + """ + return self.client.query_as_arrow(self.name, query) + def query_tabular(self, query: str) -> Generator[str, None, None]: """ Executes a SQL query on this pipeline and returns the result as a diff --git a/python/feldera/rest/feldera_client.py b/python/feldera/rest/feldera_client.py index 7d20e9c878c..e9761fed485 100644 --- a/python/feldera/rest/feldera_client.py +++ b/python/feldera/rest/feldera_client.py @@ -3,7 +3,7 @@ import pathlib import time from decimal import Decimal -from typing import Any, Dict, Generator, Mapping, Optional +from typing import TYPE_CHECKING, Any, Dict, Generator, Mapping, Optional from urllib.parse import quote import requests @@ -18,6 +18,9 @@ logger = logging.getLogger(__name__) +if TYPE_CHECKING: + import pyarrow as pa + def _validate_no_none_keys_in_map(data): def validate_no_none_keys(d: Dict[Any, Any]) -> None: @@ -38,6 +41,17 @@ def _prepare_boolean_input(value: bool) -> str: return "true" if value else "false" +def _import_pyarrow_ipc(): + try: + import pyarrow.ipc as ipc + except ImportError as exc: + raise ImportError( + "pyarrow is required for Arrow IPC queries. Install it with `pip install feldera[arrow]`." + ) from exc + + return ipc + + class FelderaClient: """ A client for the Feldera HTTP API. 
@@ -1217,6 +1231,37 @@ def query_as_parquet(self, pipeline_name: str, query: str, path: str): file.write(chunk) file.close() + def query_as_arrow( + self, pipeline_name: str, query: str + ) -> Generator["pa.RecordBatch", None, None]: + """ + Executes an ad-hoc query on the specified pipeline and returns the result + as a generator that yields PyArrow RecordBatches. + + :param pipeline_name: The name of the pipeline to query. + :param query: The SQL query to be executed. + :return: A generator that yields each query batch as a ``pyarrow.RecordBatch``. + """ + ipc = _import_pyarrow_ipc() + + params = { + "pipeline_name": pipeline_name, + "sql": query, + "format": "arrow_ipc", + } + resp: requests.Response = self.http.get( + path=f"/pipelines/{pipeline_name}/query", + params=params, + stream=True, + ) + + try: + with ipc.open_stream(resp.raw) as reader: + for batch in reader: + yield batch + finally: + resp.close() + def query_as_json( self, pipeline_name: str, query: str ) -> Generator[Mapping[str, Any], None, None]: From 0be980450ad5ce67622ad79ba3dbee5c26813f7e Mon Sep 17 00:00:00 2001 From: Mattias Matthiesen Date: Fri, 13 Mar 2026 07:50:10 +0100 Subject: [PATCH 07/19] [python] Add tests for Arrow IPC query results Signed-off-by: Mattias Matthiesen --- python/tests/platform/test_shared_pipeline.py | 39 ++++- python/tests/unit/test_query_as_arrow.py | 145 ++++++++++++++++++ 2 files changed, 181 insertions(+), 3 deletions(-) create mode 100644 python/tests/unit/test_query_as_arrow.py diff --git a/python/tests/platform/test_shared_pipeline.py b/python/tests/platform/test_shared_pipeline.py index 4a06db367be..6b9cf86cbba 100644 --- a/python/tests/platform/test_shared_pipeline.py +++ b/python/tests/platform/test_shared_pipeline.py @@ -1,3 +1,4 @@ +import gzip import io import json import os @@ -6,9 +7,9 @@ import time import unittest import zipfile -import gzip import pandas as pd +import pytest from feldera import Pipeline from feldera.enums import 
CompletionTokenStatus, PipelineFieldSelector, PipelineStatus @@ -167,6 +168,36 @@ def test_adhoc_query_json(self): got = list(resp) self.assertCountEqual(got, expected) + def test_adhoc_query_arrow(self): + pa = pytest.importorskip("pyarrow") + + data = "1\n2\n" + self.pipeline.start() + TEST_CLIENT.push_to_pipeline(self.pipeline.name, "tbl", "csv", data) + + expected_rows = list( + TEST_CLIENT.query_as_json( + self.pipeline.name, + "SELECT * FROM tbl ORDER BY id", + ) + ) + expected_ids = [row["id"] for row in expected_rows] + + batches_client = list( + TEST_CLIENT.query_as_arrow( + self.pipeline.name, + "SELECT * FROM tbl ORDER BY id", + ) + ) + table_client = pa.Table.from_batches(batches_client) + assert table_client.column("id").to_pylist() == expected_ids + + batches_pipeline = list( + self.pipeline.query_arrow("SELECT * FROM tbl ORDER BY id") + ) + table_pipeline = pa.Table.from_batches(batches_pipeline) + assert table_pipeline.column("id").to_pylist() == expected_ids + def test_local(self): """ CREATE TABLE students ( @@ -347,8 +378,10 @@ def test_failed_pipeline_stop(self): self.pipeline.input_json("tbl", data, wait=False) wait_for_condition( "pipeline stops with deployment error after worker panic", - lambda: self.pipeline.status() == PipelineStatus.STOPPED - and len(self.pipeline.deployment_error()) > 0, + lambda: ( + self.pipeline.status() == PipelineStatus.STOPPED + and len(self.pipeline.deployment_error()) > 0 + ), timeout_s=20.0, poll_interval_s=1.0, ) diff --git a/python/tests/unit/test_query_as_arrow.py b/python/tests/unit/test_query_as_arrow.py new file mode 100644 index 00000000000..b90d228c12a --- /dev/null +++ b/python/tests/unit/test_query_as_arrow.py @@ -0,0 +1,145 @@ +"""Unit tests for FelderaClient.query_as_arrow and Pipeline.query_arrow.""" + +import builtins +import io +import sys +from unittest.mock import MagicMock + +import pytest + +from feldera.rest.feldera_client import FelderaClient + + +def _import_arrow_modules(): + pa = 
pytest.importorskip("pyarrow") + ipc = pytest.importorskip("pyarrow.ipc") + return pa, ipc + + +def _make_ipc_bytes(table) -> bytes: + """Serialise a ``pyarrow.Table`` to Arrow IPC stream bytes.""" + _, ipc = _import_arrow_modules() + buf = io.BytesIO() + with ipc.new_stream(buf, table.schema) as writer: + if table.num_rows > 0: + writer.write_table(table) + return buf.getvalue() + + +def _mock_response(ipc_bytes: bytes) -> MagicMock: + """Return a mock response whose ``raw`` is an Arrow IPC byte stream.""" + resp = MagicMock() + resp.raw = io.BytesIO(ipc_bytes) + return resp + + +@pytest.fixture() +def client() -> FelderaClient: + """A ``FelderaClient`` with a mocked HTTP layer (no real network calls).""" + c = FelderaClient.__new__(FelderaClient) + c.http = MagicMock() + return c + + +class TestQueryAsArrow: + def test_non_empty_result_yields_correct_data(self, client: FelderaClient): + pa, _ = _import_arrow_modules() + schema = pa.schema([("id", pa.int64()), ("name", pa.utf8())]) + expected = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]}, schema=schema) + client.http.get.return_value = _mock_response(_make_ipc_bytes(expected)) + + batches = list(client.query_as_arrow("my_pipeline", "SELECT id, name FROM t")) + result = pa.Table.from_batches(batches, schema=schema) + + assert len(batches) > 0 + assert result.schema == schema + assert result.num_rows == 3 + assert result.column("id").to_pylist() == [1, 2, 3] + assert result.column("name").to_pylist() == ["a", "b", "c"] + + def test_http_called_with_correct_params(self, client: FelderaClient): + pa, _ = _import_arrow_modules() + schema = pa.schema([("id", pa.int64())]) + table = pa.table({"id": [42]}, schema=schema) + client.http.get.return_value = _mock_response(_make_ipc_bytes(table)) + + list(client.query_as_arrow("my_pipeline", "SELECT id FROM t")) + + client.http.get.assert_called_once_with( + path="/pipelines/my_pipeline/query", + params={ + "pipeline_name": "my_pipeline", + "sql": "SELECT id FROM t", + 
"format": "arrow_ipc", + }, + stream=True, + ) + + def test_empty_result_yields_no_batches(self, client: FelderaClient): + pa, _ = _import_arrow_modules() + schema = pa.schema([("id", pa.int64()), ("value", pa.float64())]) + empty = pa.table( + { + "id": pa.array([], type=pa.int64()), + "value": pa.array([], type=pa.float64()), + }, + schema=schema, + ) + client.http.get.return_value = _mock_response(_make_ipc_bytes(empty)) + + result_batches = list( + client.query_as_arrow("my_pipeline", "SELECT id, value FROM t WHERE false") + ) + + assert result_batches == [] + + def test_missing_pyarrow_raises_helpful_import_error( + self, client: FelderaClient, monkeypatch + ): + real_import = builtins.__import__ + + def _import(name, globals=None, locals=None, fromlist=(), level=0): + if name == "pyarrow" or name.startswith("pyarrow."): + raise ImportError("No module named 'pyarrow'") + return real_import(name, globals, locals, fromlist, level) + + monkeypatch.delitem(sys.modules, "pyarrow", raising=False) + monkeypatch.delitem(sys.modules, "pyarrow.ipc", raising=False) + monkeypatch.setattr(builtins, "__import__", _import) + + with pytest.raises(ImportError, match="pip install feldera\\[arrow\\]"): + next(client.query_as_arrow("my_pipeline", "SELECT 1")) + + client.http.get.assert_not_called() + + def test_response_closed_after_full_consumption(self, client: FelderaClient): + pa, _ = _import_arrow_modules() + schema = pa.schema([("id", pa.int64())]) + table = pa.table({"id": [1, 2]}, schema=schema) + resp = _mock_response(_make_ipc_bytes(table)) + client.http.get.return_value = resp + + list(client.query_as_arrow("my_pipeline", "SELECT id FROM t")) + + resp.close.assert_called_once() + + +class TestPipelineQueryArrow: + def test_query_arrow_delegates_to_client(self): + """Pipeline.query_arrow must forward to client.query_as_arrow.""" + from feldera.pipeline import Pipeline + + pipeline = Pipeline.__new__(Pipeline) + pipeline._inner = MagicMock() + pipeline._inner.name = 
"pipe1" + pipeline.client = MagicMock() + + expected = object() + pipeline.client.query_as_arrow.return_value = expected + + result = pipeline.query_arrow("SELECT x FROM v") + + pipeline.client.query_as_arrow.assert_called_once_with( + "pipe1", "SELECT x FROM v" + ) + assert result is expected From 315c769f528725eb1b91bd253d0b8e52a263f1b4 Mon Sep 17 00:00:00 2001 From: Gerd Zellweger Date: Fri, 10 Apr 2026 21:55:47 -0700 Subject: [PATCH 08/19] [types] Fix f64 deserialization in DevTweaks inside flattened PipelineConfig DevTweaks Option fields (bloom_false_positive_rate, balancer_balance_tax, etc.) fail to deserialize when nested inside PipelineConfig, which uses triggered by the workspace-wide arbitrary_precision feature: the Content buffer represents numbers as maps, producing "invalid type: map, expected f64". Apply the existing serde_via_value::deserialize workaround (already used by ResourceConfig.cpu_cores_min/max) to all four Option fields in DevTweaks. Also add the chrono "alloc" feature to fix a pre-existing compile error in adapter_stats.rs (to_rfc3339_opts returns String, which requires alloc). 
Signed-off-by: Gerd Zellweger --- crates/feldera-types/Cargo.toml | 2 +- crates/feldera-types/src/config/dev_tweaks.rs | 66 +++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/crates/feldera-types/Cargo.toml b/crates/feldera-types/Cargo.toml index 54fea8c7763..b5c291dea00 100644 --- a/crates/feldera-types/Cargo.toml +++ b/crates/feldera-types/Cargo.toml @@ -31,7 +31,7 @@ erased-serde = { workspace = true } uuid = { workspace = true, features = ["v7"] } thiserror = { workspace = true } regex = { workspace = true } -chrono = { workspace = true, features = ["serde"] } +chrono = { workspace = true, features = ["serde", "alloc"] } feldera-ir = { workspace = true } time = { workspace = true } bytemuck = { workspace = true } diff --git a/crates/feldera-types/src/config/dev_tweaks.rs b/crates/feldera-types/src/config/dev_tweaks.rs index 057daa4c3cb..2f5e18732d8 100644 --- a/crates/feldera-types/src/config/dev_tweaks.rs +++ b/crates/feldera-types/src/config/dev_tweaks.rs @@ -107,6 +107,7 @@ pub struct DevTweaks { /// /// The default value is 1.2. #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default, deserialize_with = "crate::serde_via_value::deserialize")] pub balancer_min_relative_improvement_threshold: Option, /// The minimum absolute improvement threshold for the balancer. @@ -138,6 +139,7 @@ pub struct DevTweaks { /// /// The default value is 1.1. #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default, deserialize_with = "crate::serde_via_value::deserialize")] pub balancer_balance_tax: Option, /// The balancer threshold for checking for an improved partitioning policy for a stream. @@ -151,6 +153,7 @@ pub struct DevTweaks { /// /// The default value is 0.1. 
#[serde(skip_serializing_if = "Option::is_none")] + #[serde(default, deserialize_with = "crate::serde_via_value::deserialize")] pub balancer_key_distribution_refresh_threshold: Option, /// False-positive rate for Bloom filters on batches on storage, as a @@ -168,6 +171,7 @@ pub struct DevTweaks { /// /// Values outside the valid range, such as 0.0, disable Bloom filters. #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default, deserialize_with = "crate::serde_via_value::deserialize")] pub bloom_false_positive_rate: Option, /// Maximum batch size in records for level 0 merges. @@ -284,3 +288,65 @@ pub enum MergerType { #[default] ListMerger, } + +#[cfg(test)] +mod tests { + use crate::config::{PipelineConfig, RuntimeConfig}; + + use super::*; + + /// Regression test: `Option` fields inside `DevTweaks` must + /// survive a JSON-string round-trip through `PipelineConfig`, which + /// uses `#[serde(flatten)]` on `RuntimeConfig`. With `serde_json`'s + /// `arbitrary_precision` feature enabled, the serde `Content` buffer + /// represents numbers as maps, which breaks plain `f64` + /// deserialization (serde-rs/json#1157). The `serde_via_value` + /// workaround on each `Option` field fixes this. + #[test] + fn dev_tweaks_f64_roundtrip_through_pipeline_config() { + let mut rc = RuntimeConfig::default(); + rc.dev_tweaks = DevTweaks { + bloom_false_positive_rate: Some(0.0), + balancer_balance_tax: Some(1.1), + balancer_min_relative_improvement_threshold: Some(1.2), + balancer_key_distribution_refresh_threshold: Some(0.1), + ..Default::default() + }; + let pc = PipelineConfig { + global: rc, + multihost: None, + name: Some("test-pipeline".into()), + given_name: None, + storage_config: None, + secrets_dir: None, + inputs: Default::default(), + outputs: Default::default(), + program_ir: None, + }; + + // JSON string round-trip (the path the pipeline process takes). 
+ let json = serde_json::to_string_pretty(&pc).unwrap(); + let pc2: PipelineConfig = serde_json::from_str(&json) + .expect("JSON string round-trip of PipelineConfig with f64 dev_tweaks must succeed"); + assert_eq!(pc2.global.dev_tweaks.bloom_false_positive_rate, Some(0.0)); + assert_eq!(pc2.global.dev_tweaks.balancer_balance_tax, Some(1.1)); + assert_eq!( + pc2.global + .dev_tweaks + .balancer_min_relative_improvement_threshold, + Some(1.2) + ); + assert_eq!( + pc2.global + .dev_tweaks + .balancer_key_distribution_refresh_threshold, + Some(0.1) + ); + + // serde_json::Value round-trip (the path the pipeline manager takes). + let value = serde_json::to_value(&pc).unwrap(); + let pc3: PipelineConfig = serde_json::from_value(value) + .expect("Value round-trip of PipelineConfig with f64 dev_tweaks must succeed"); + assert_eq!(pc3.global.dev_tweaks.bloom_false_positive_rate, Some(0.0)); + } +} From d5343c2fa24acff78b732670b4f27e80d92cf520 Mon Sep 17 00:00:00 2001 From: Mihai Budiu Date: Sat, 11 Apr 2026 07:18:20 -0700 Subject: [PATCH 09/19] [SQL] Fix incorrect documentation for TO_HEX function Signed-off-by: Mihai Budiu --- docs.feldera.com/docs/sql/binary.md | 4 ++-- .../sql/postgres/PostgresStringTests.java | 1 - .../compiler/sql/simple/Regression2Tests.java | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/docs.feldera.com/docs/sql/binary.md b/docs.feldera.com/docs/sql/binary.md index 5677e88dfd7..c5696cca759 100644 --- a/docs.feldera.com/docs/sql/binary.md +++ b/docs.feldera.com/docs/sql/binary.md @@ -111,8 +111,8 @@ aggregation functions `BIT_AND`, `BIT_OR`, and `BIT_XOR`. 
TO_HEX(binary) - Generate a `VARCHAR` string describing the value in hexadecimal - TO_HEX(x'0abc') => '0ABC' + Generate a `VARCHAR` string describing the value in hexadecimal (lowercase) + TO_HEX(x'0abc') => '0abc' TO_INT(binary) diff --git a/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/postgres/PostgresStringTests.java b/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/postgres/PostgresStringTests.java index 650882dd671..1051c576d19 100644 --- a/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/postgres/PostgresStringTests.java +++ b/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/postgres/PostgresStringTests.java @@ -1117,7 +1117,6 @@ public void testSplitPart() { } - // TODO: to_hex // TODO: sha, encode, decode @Test diff --git a/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/simple/Regression2Tests.java b/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/simple/Regression2Tests.java index c35ffb16445..cb7470213f8 100644 --- a/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/simple/Regression2Tests.java +++ b/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/simple/Regression2Tests.java @@ -903,4 +903,20 @@ h DECIMAL(38, 10), l TIMESTAMP ) WITH ('materialized' = 'true');"""); } + + @Test + public void issue5981() { + this.qs(""" + SELECT TO_HEX(x'48656c6c6f'); + r + --- + 48656c6c6f + (1 row) + + SELECT TO_HEX(x'0ABC'); + r + --- + 0abc + (1 row)"""); + } } From c6e024788d982f32d52b0674f4bfaf3e0b0c42aa Mon Sep 17 00:00:00 2001 From: Gerd Zellweger Date: Fri, 10 Apr 2026 21:05:44 -0700 Subject: [PATCH 10/19] Add output reset API skeleton --- crates/adapters/src/server.rs | 8 ++ .../src/api/endpoints/pipeline_interaction.rs | 79 +++++++++++++++++++ crates/pipeline-manager/src/api/main.rs | 2 + 3 files 
changed, 89 insertions(+) diff --git a/crates/adapters/src/server.rs b/crates/adapters/src/server.rs index 58d886153c8..5c16cd35717 100644 --- a/crates/adapters/src/server.rs +++ b/crates/adapters/src/server.rs @@ -1226,6 +1226,7 @@ where .service(start_input_endpoint) .service(input_endpoint_status) .service(output_endpoint_status) + .service(reset_output_endpoint) .service(rebalance) .service(coordination_activate_handler) .service(coordination_status) @@ -2290,6 +2291,13 @@ async fn output_endpoint_status( Ok(HttpResponse::Ok().json(state.controller()?.output_endpoint_status(&path)?)) } +#[post("/output_endpoints/{endpoint_name}/reset")] +async fn reset_output_endpoint(path: web::Path) -> Result { + Err(PipelineError::from(ControllerError::not_supported( + &format!("output endpoint '{}' does not support reset", path.as_str()), + ))) +} + /// This service journals the paused state, but it does not wait for the journal /// record to commit before it returns success, so there is a small race. #[get("/input_endpoints/{endpoint_name}/start")] diff --git a/crates/pipeline-manager/src/api/endpoints/pipeline_interaction.rs b/crates/pipeline-manager/src/api/endpoints/pipeline_interaction.rs index c4bd3ef4fef..5ebb8840b35 100644 --- a/crates/pipeline-manager/src/api/endpoints/pipeline_interaction.rs +++ b/crates/pipeline-manager/src/api/endpoints/pipeline_interaction.rs @@ -460,6 +460,85 @@ pub(crate) async fn get_pipeline_output_connector_status( Ok(response) } +/// Reset Output Connector +/// +/// Reset an output connector configured in `snapshot_and_follow` mode. +/// +/// This clears buffered output, asks the sink to reset itself, and then replays +/// a full snapshot before resuming incremental updates. 
+#[utoipa::path( + context_path = "/v0", + security(("JSON web token (JWT) or API key" = [])), + params( + ("pipeline_name" = String, Path, description = "Unique pipeline name"), + ("view_name" = String, Path, description = "SQL view name"), + ("connector_name" = String, Path, description = "Output connector name"), + ), + responses( + (status = OK + , description = "Output connector reset request has been processed"), + (status = NOT_FOUND + , body = ErrorResponse + , description = "Pipeline, view and/or output connector with that name does not exist" + , examples( + ("Pipeline with that name does not exist" = (value = json!(examples::error_unknown_pipeline_name()))), + ) + ), + (status = BAD_REQUEST + , body = ErrorResponse + , description = "The output connector does not support reset"), + (status = SERVICE_UNAVAILABLE + , body = ErrorResponse + , examples( + ("Pipeline is not deployed" = (value = json!(examples::error_pipeline_interaction_not_deployed()))), + ("Pipeline is currently unavailable" = (value = json!(examples::error_pipeline_interaction_currently_unavailable()))), + ("Disconnected during response" = (value = json!(examples::error_pipeline_interaction_disconnected()))), + ("Response timeout" = (value = json!(examples::error_pipeline_interaction_timeout()))) + ) + ), + (status = INTERNAL_SERVER_ERROR, body = ErrorResponse), + ), + tag = "Output Connectors" +)] +#[post("/pipelines/{pipeline_name}/views/{view_name}/connectors/{connector_name}/reset")] +pub(crate) async fn post_pipeline_output_connector_reset( + state: WebData, + client: WebData, + tenant_id: ReqData, + path: web::Path<(String, String, String)>, +) -> Result { + let (pipeline_name, view_name, connector_name) = path.into_inner(); + + let actual_view_name = SqlIdentifier::from(&view_name).name(); + let endpoint_name = format!("{actual_view_name}.{connector_name}"); + let encoded_endpoint_name = urlencoding::encode(&endpoint_name).to_string(); + + let response = state + .runner + 
.forward_http_request_to_pipeline_by_name( + client.as_ref(), + *tenant_id, + &pipeline_name, + Method::POST, + &format!("output_endpoints/{encoded_endpoint_name}/reset"), + "", + None, + None, + ) + .await?; + + if response.status() == StatusCode::OK { + info!( + pipeline = %pipeline_name, + pipeline_id = "N/A", + tenant = %tenant_id.0, + "Connector action: reset on view '{view_name}' on connector '{connector_name}'" + ); + } + + Ok(response) +} + /// Get Pipeline Stats /// /// Retrieve statistics (e.g., performance counters) of a running or paused pipeline. diff --git a/crates/pipeline-manager/src/api/main.rs b/crates/pipeline-manager/src/api/main.rs index a70b6e0151a..95c2e9b440c 100644 --- a/crates/pipeline-manager/src/api/main.rs +++ b/crates/pipeline-manager/src/api/main.rs @@ -204,6 +204,7 @@ It contains the following fields: endpoints::pipeline_interaction::post_pipeline_input_connector_action, endpoints::pipeline_interaction::get_pipeline_input_connector_status, endpoints::pipeline_interaction::get_pipeline_output_connector_status, + endpoints::pipeline_interaction::post_pipeline_output_connector_reset, endpoints::pipeline_interaction::get_pipeline_stats, endpoints::pipeline_interaction::get_pipeline_metrics, endpoints::pipeline_interaction::get_pipeline_circuit_profile, @@ -571,6 +572,7 @@ fn api_scope() -> Scope { .service(endpoints::pipeline_interaction::post_pipeline_input_connector_action) .service(endpoints::pipeline_interaction::get_pipeline_input_connector_status) .service(endpoints::pipeline_interaction::get_pipeline_output_connector_status) + .service(endpoints::pipeline_interaction::post_pipeline_output_connector_reset) .service(endpoints::pipeline_interaction::get_pipeline_stats) .service(endpoints::pipeline_interaction::get_pipeline_metrics) .service(endpoints::pipeline_interaction::get_pipeline_time_series) From 46a2b6d7b838268c4244148e73d69cd724be35bb Mon Sep 17 00:00:00 2001 From: feldera-bot Date: Sat, 11 Apr 2026 06:23:17 +0000 Subject: 
[PATCH 11/19] [ci] apply automatic fixes Signed-off-by: feldera-bot --- openapi.json | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) diff --git a/openapi.json b/openapi.json index f5407f2a25e..daf2d49cb75 100644 --- a/openapi.json +++ b/openapi.json @@ -6312,6 +6312,152 @@ ] } }, + "/v0/pipelines/{pipeline_name}/views/{view_name}/connectors/{connector_name}/reset": { + "post": { + "tags": [ + "Output Connectors" + ], + "summary": "Reset Output Connector", + "description": "Reset an output connector configured in `snapshot_and_follow` mode.\n\nThis clears buffered output, asks the sink to reset itself, and then replays\na full snapshot before resuming incremental updates.", + "operationId": "post_pipeline_output_connector_reset", + "parameters": [ + { + "name": "pipeline_name", + "in": "path", + "description": "Unique pipeline name", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "view_name", + "in": "path", + "description": "SQL view name", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "connector_name", + "in": "path", + "description": "Output connector name", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Output connector reset request has been processed" + }, + "400": { + "description": "The output connector does not support reset", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "404": { + "description": "Pipeline, view and/or output connector with that name does not exist", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "examples": { + "Pipeline with that name does not exist": { + "value": { + "message": "Unknown pipeline name 'non-existent-pipeline'", + "error_code": "UnknownPipelineName", + "details": { + "pipeline_name": "non-existent-pipeline" + } + } + } + } + } 
+ } + }, + "500": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "503": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "examples": { + "Disconnected during response": { + "value": { + "message": "Error sending HTTP request to pipeline: the pipeline disconnected while it was processing this HTTP request. This could be because the pipeline either (a) encountered a fatal error or panic, (b) was stopped, or (c) experienced network issues -- retrying might help in the last case. Alternatively, check the pipeline logs. Failed request: /pause pipeline-id=N/A pipeline-name=\"my_pipeline\"", + "error_code": "PipelineInteractionUnreachable", + "details": { + "pipeline_name": "my_pipeline", + "request": "/pause", + "error": "the pipeline disconnected while it was processing this HTTP request. This could be because the pipeline either (a) encountered a fatal error or panic, (b) was stopped, or (c) experienced network issues -- retrying might help in the last case. Alternatively, check the pipeline logs." 
+ } + } + }, + "Pipeline is currently unavailable": { + "value": { + "message": "Error sending HTTP request to pipeline: deployment status is currently 'unavailable' -- wait for it to become 'running' or 'paused' again Failed request: /pause pipeline-id=N/A pipeline-name=\"my_pipeline\"", + "error_code": "PipelineInteractionUnreachable", + "details": { + "pipeline_name": "my_pipeline", + "request": "/pause", + "error": "deployment status is currently 'unavailable' -- wait for it to become 'running' or 'paused' again" + } + } + }, + "Pipeline is not deployed": { + "value": { + "message": "Unable to interact with pipeline because the deployment status (stopped) indicates it is not (yet) fully provisioned pipeline-id=N/A pipeline-name=\"my_pipeline\"", + "error_code": "PipelineInteractionNotDeployed", + "details": { + "pipeline_name": "my_pipeline", + "status": "Stopped", + "desired_status": "Provisioned" + } + } + }, + "Response timeout": { + "value": { + "message": "Error sending HTTP request to pipeline: timeout (10s) was reached: this means the pipeline took too long to respond -- this can simply be because the request was too difficult to process in time, or other reasons (e.g., deadlock): the pipeline logs might contain additional information (original send request error: Timeout while waiting for response) Failed request: /pause pipeline-id=N/A pipeline-name=\"my_pipeline\"", + "error_code": "PipelineInteractionUnreachable", + "details": { + "pipeline_name": "my_pipeline", + "request": "/pause", + "error": "timeout (10s) was reached: this means the pipeline took too long to respond -- this can simply be because the request was too difficult to process in time, or other reasons (e.g., deadlock): the pipeline logs might contain additional information (original send request error: Timeout while waiting for response)" + } + } + } + } + } + } + } + }, + "security": [ + { + "JSON web token (JWT) or API key": [] + } + ] + } + }, 
"/v0/pipelines/{pipeline_name}/views/{view_name}/connectors/{connector_name}/stats": { "get": { "tags": [ From 5e5b2b016b8772db0ed8c076edcf09b45bbc55d4 Mon Sep 17 00:00:00 2001 From: Mihai Budiu Date: Sat, 11 Apr 2026 08:06:15 -0700 Subject: [PATCH 12/19] [PROFILER] Allow searching by a substring of a persistent ID Signed-off-by: Mihai Budiu --- .../src/lib/components/ProfilerLayout.svelte | 4 ++-- js-packages/profiler-lib/src/cytograph.ts | 9 ++++++--- js-packages/profiler-lib/src/profiler.ts | 18 ++++++++++++++++-- js-packages/profiler-lib/src/util.ts | 4 ++++ 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/js-packages/profiler-layout/src/lib/components/ProfilerLayout.svelte b/js-packages/profiler-layout/src/lib/components/ProfilerLayout.svelte index bd63756cc39..af78ae21492 100644 --- a/js-packages/profiler-layout/src/lib/components/ProfilerLayout.svelte +++ b/js-packages/profiler-layout/src/lib/components/ProfilerLayout.svelte @@ -231,8 +231,8 @@ e.key === 'Enter' && handleSearch()} class="input w-32 text-sm" /> diff --git a/js-packages/profiler-lib/src/cytograph.ts b/js-packages/profiler-lib/src/cytograph.ts index 63446905763..5f8e06176d1 100644 --- a/js-packages/profiler-lib/src/cytograph.ts +++ b/js-packages/profiler-lib/src/cytograph.ts @@ -543,12 +543,15 @@ export class CytographRendering { return this.metadataSelection.metric; } - search(value: string) { + /** Search a node by ID, return 'true' if found. 
*/ + search(value: string): boolean { let el = this.cy.getElementById(value); - if (el === null) { - return; + // el may be an empty collection + if (!el.nonempty()) { + return false; } this.center(Option.some(value)); + return true; } // Layout to use for the first graph rendering diff --git a/js-packages/profiler-lib/src/profiler.ts b/js-packages/profiler-lib/src/profiler.ts index 65b8401c558..39938fa256b 100644 --- a/js-packages/profiler-lib/src/profiler.ts +++ b/js-packages/profiler-lib/src/profiler.ts @@ -281,10 +281,24 @@ export class Visualizer { } /** - * Search for a node by ID + * Search for a node by ID or a substring of the persistent ID. */ search(query: string): void { - this.rendering?.search(query); + // First search by ID + let success = this.rendering?.search(query); + if (success) { + return; + } + if (!this.profile) { + return; + } + + // Find ID of node with given persistent ID + for (const [pid, node] of this.profile.byPersistentId) { + if (pid.includes(query)) { + this.rendering?.search(node.id); + } + } } /** diff --git a/js-packages/profiler-lib/src/util.ts b/js-packages/profiler-lib/src/util.ts index 1394bf90c2d..c318abd5831 100644 --- a/js-packages/profiler-lib/src/util.ts +++ b/js-packages/profiler-lib/src/util.ts @@ -130,6 +130,10 @@ export class OMap { clear(): void { this.map.clear(); } + + [Symbol.iterator](): IterableIterator<[K, V]> { + return this.map[Symbol.iterator](); + } } /** A sublist which includes only specific elements from a list, identified by their indexes. */ From 9dfbb3d090123d388f0663f1abefe6526d87b84a Mon Sep 17 00:00:00 2001 From: Gerd Zellweger Date: Sat, 11 Apr 2026 21:56:26 -0700 Subject: [PATCH 13/19] Revert "[python] Add optional arrow dependency and installation docs" This reverts commit d23c62753bf5f054622cbd1568bd495cf6acff77. 
--- python/README.md | 6 ---- python/pyproject.toml | 9 +----- python/uv.lock | 66 ------------------------------------------- 3 files changed, 1 insertion(+), 80 deletions(-) diff --git a/python/README.md b/python/README.md index 0693a254194..152765dd2e6 100644 --- a/python/README.md +++ b/python/README.md @@ -12,12 +12,6 @@ The Python SDK documentation is available at: https://docs.feldera.com/python uv pip install feldera ``` -For Arrow IPC query support, install the optional Arrow extra: - -```bash -uv pip install 'feldera[arrow]' -``` - ### Example usage The Python client interacts with the API server of the Feldera instance. diff --git a/python/pyproject.toml b/python/pyproject.toml index bdea1193c4f..d36c73d36e7 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -29,12 +29,6 @@ dependencies = [ "ruff>=0.6.9", "PyJWT>=2.12.0", ] - -[project.optional-dependencies] -arrow = [ - "pyarrow>=14.0", -] - [project.urls] Homepage = "https://www.feldera.com" Documentation = "https://docs.feldera.com/python" @@ -49,8 +43,7 @@ dev = [ "sphinx-rtd-theme==2.0.0", "sphinx==7.3.7", "simplejson==3.20.1", - "confluent-kafka>=2.2.0", - "pyarrow>=14.0", + "confluent-kafka>=2.2.0" ] [tool.pytest.ini_options] diff --git a/python/uv.lock b/python/uv.lock index aeb987e2d87..cb5547411af 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -235,15 +235,9 @@ dependencies = [ { name = "typing-extensions" }, ] -[package.optional-dependencies] -arrow = [ - { name = "pyarrow" }, -] - [package.dev-dependencies] dev = [ { name = "confluent-kafka" }, - { name = "pyarrow" }, { name = "pytest" }, { name = "pytest-timeout" }, { name = "pytest-xdist" }, @@ -257,18 +251,15 @@ requires-dist = [ { name = "numpy", specifier = ">=2.2.4" }, { name = "pandas", specifier = ">=2.1.2" }, { name = "pretty-errors" }, - { name = "pyarrow", marker = "extra == 'arrow'", specifier = ">=14.0" }, { name = "pyjwt", specifier = ">=2.12.0" }, { name = "requests" }, { name = "ruff", specifier = ">=0.6.9" 
}, { name = "typing-extensions" }, ] -provides-extras = ["arrow"] [package.metadata.requires-dev] dev = [ { name = "confluent-kafka", specifier = ">=2.2.0" }, - { name = "pyarrow", specifier = ">=14.0" }, { name = "pytest", specifier = ">=8.3.5" }, { name = "pytest-timeout", specifier = ">=2.3.1" }, { name = "pytest-xdist", specifier = ">=3.8.0" }, @@ -715,63 +706,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6d/8e/2df7467a15eae40e26c476683962fdb810cd1b36676603e2f139b4abbeaf/pretty_errors-1.2.25-py3-none-any.whl", hash = "sha256:8ce68ccd99e0f2a099265c8c1f1c23b7c60a15d69bb08816cb336e237d5dc983", size = 17195, upload-time = "2021-11-24T14:32:07.762Z" }, ] -[[package]] -name = "pyarrow" -version = "23.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/bc/a8/24e5dc6855f50a62936ceb004e6e9645e4219a8065f304145d7fb8a79d5d/pyarrow-23.0.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:3fab8f82571844eb3c460f90a75583801d14ca0cc32b1acc8c361650e006fd56", size = 34307390, upload-time = "2026-02-16T10:08:08.654Z" }, - { url = "https://files.pythonhosted.org/packages/bc/8e/4be5617b4aaae0287f621ad31c6036e5f63118cfca0dc57d42121ff49b51/pyarrow-23.0.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:3f91c038b95f71ddfc865f11d5876c42f343b4495535bd262c7b321b0b94507c", size = 35853761, upload-time = "2026-02-16T10:08:17.811Z" }, - { url = "https://files.pythonhosted.org/packages/2e/08/3e56a18819462210432ae37d10f5c8eed3828be1d6c751b6e6a2e93c286a/pyarrow-23.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:d0744403adabef53c985a7f8a082b502a368510c40d184df349a0a8754533258", size = 44493116, upload-time = 
"2026-02-16T10:08:25.792Z" }, - { url = "https://files.pythonhosted.org/packages/f8/82/c40b68001dbec8a3faa4c08cd8c200798ac732d2854537c5449dc859f55a/pyarrow-23.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:c33b5bf406284fd0bba436ed6f6c3ebe8e311722b441d89397c54f871c6863a2", size = 47564532, upload-time = "2026-02-16T10:08:34.27Z" }, - { url = "https://files.pythonhosted.org/packages/20/bc/73f611989116b6f53347581b02177f9f620efdf3cd3f405d0e83cdf53a83/pyarrow-23.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ddf743e82f69dcd6dbbcb63628895d7161e04e56794ef80550ac6f3315eeb1d5", size = 48183685, upload-time = "2026-02-16T10:08:42.889Z" }, - { url = "https://files.pythonhosted.org/packages/b0/cc/6c6b3ecdae2a8c3aced99956187e8302fc954cc2cca2a37cf2111dad16ce/pyarrow-23.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e052a211c5ac9848ae15d5ec875ed0943c0221e2fcfe69eee80b604b4e703222", size = 50605582, upload-time = "2026-02-16T10:08:51.641Z" }, - { url = "https://files.pythonhosted.org/packages/8d/94/d359e708672878d7638a04a0448edf7c707f9e5606cee11e15aaa5c7535a/pyarrow-23.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:5abde149bb3ce524782d838eb67ac095cd3fd6090eba051130589793f1a7f76d", size = 27521148, upload-time = "2026-02-16T10:08:58.077Z" }, - { url = "https://files.pythonhosted.org/packages/b0/41/8e6b6ef7e225d4ceead8459427a52afdc23379768f54dd3566014d7618c1/pyarrow-23.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6f0147ee9e0386f519c952cc670eb4a8b05caa594eeffe01af0e25f699e4e9bb", size = 34302230, upload-time = "2026-02-16T10:09:03.859Z" }, - { url = "https://files.pythonhosted.org/packages/bf/4a/1472c00392f521fea03ae93408bf445cc7bfa1ab81683faf9bc188e36629/pyarrow-23.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:0ae6e17c828455b6265d590100c295193f93cc5675eb0af59e49dbd00d2de350", size = 35850050, upload-time = "2026-02-16T10:09:11.877Z" }, - { url = 
"https://files.pythonhosted.org/packages/0c/b2/bd1f2f05ded56af7f54d702c8364c9c43cd6abb91b0e9933f3d77b4f4132/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fed7020203e9ef273360b9e45be52a2a47d3103caf156a30ace5247ffb51bdbd", size = 44491918, upload-time = "2026-02-16T10:09:18.144Z" }, - { url = "https://files.pythonhosted.org/packages/0b/62/96459ef5b67957eac38a90f541d1c28833d1b367f014a482cb63f3b7cd2d/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:26d50dee49d741ac0e82185033488d28d35be4d763ae6f321f97d1140eb7a0e9", size = 47562811, upload-time = "2026-02-16T10:09:25.792Z" }, - { url = "https://files.pythonhosted.org/packages/7d/94/1170e235add1f5f45a954e26cd0e906e7e74e23392dcb560de471f7366ec/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c30143b17161310f151f4a2bcfe41b5ff744238c1039338779424e38579d701", size = 48183766, upload-time = "2026-02-16T10:09:34.645Z" }, - { url = "https://files.pythonhosted.org/packages/0e/2d/39a42af4570377b99774cdb47f63ee6c7da7616bd55b3d5001aa18edfe4f/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db2190fa79c80a23fdd29fef4b8992893f024ae7c17d2f5f4db7171fa30c2c78", size = 50607669, upload-time = "2026-02-16T10:09:44.153Z" }, - { url = "https://files.pythonhosted.org/packages/00/ca/db94101c187f3df742133ac837e93b1f269ebdac49427f8310ee40b6a58f/pyarrow-23.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:f00f993a8179e0e1c9713bcc0baf6d6c01326a406a9c23495ec1ba9c9ebf2919", size = 27527698, upload-time = "2026-02-16T10:09:50.263Z" }, - { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" }, - { url = 
"https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" }, - { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" }, - { url = "https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" }, - { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = "2026-02-16T10:10:25.363Z" }, - { url = "https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" }, - { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" }, - { url = 
"https://files.pythonhosted.org/packages/47/10/2cbe4c6f0fb83d2de37249567373d64327a5e4d8db72f486db42875b08f6/pyarrow-23.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6b8fda694640b00e8af3c824f99f789e836720aa8c9379fb435d4c4953a756b8", size = 34210066, upload-time = "2026-02-16T10:10:45.487Z" }, - { url = "https://files.pythonhosted.org/packages/cb/4f/679fa7e84dadbaca7a65f7cdba8d6c83febbd93ca12fa4adf40ba3b6362b/pyarrow-23.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:8ff51b1addc469b9444b7c6f3548e19dc931b172ab234e995a60aea9f6e6025f", size = 35825526, upload-time = "2026-02-16T10:10:52.266Z" }, - { url = "https://files.pythonhosted.org/packages/f9/63/d2747d930882c9d661e9398eefc54f15696547b8983aaaf11d4a2e8b5426/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:71c5be5cbf1e1cb6169d2a0980850bccb558ddc9b747b6206435313c47c37677", size = 44473279, upload-time = "2026-02-16T10:11:01.557Z" }, - { url = "https://files.pythonhosted.org/packages/b3/93/10a48b5e238de6d562a411af6467e71e7aedbc9b87f8d3a35f1560ae30fb/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9b6f4f17b43bc39d56fec96e53fe89d94bac3eb134137964371b45352d40d0c2", size = 47585798, upload-time = "2026-02-16T10:11:09.401Z" }, - { url = "https://files.pythonhosted.org/packages/5c/20/476943001c54ef078dbf9542280e22741219a184a0632862bca4feccd666/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fc13fc6c403d1337acab46a2c4346ca6c9dec5780c3c697cf8abfd5e19b6b37", size = 48179446, upload-time = "2026-02-16T10:11:17.781Z" }, - { url = "https://files.pythonhosted.org/packages/4b/b6/5dd0c47b335fcd8edba9bfab78ad961bd0fd55ebe53468cc393f45e0be60/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5c16ed4f53247fa3ffb12a14d236de4213a4415d127fe9cebed33d51671113e2", size = 50623972, upload-time = "2026-02-16T10:11:26.185Z" }, - { url = 
"https://files.pythonhosted.org/packages/d5/09/a532297c9591a727d67760e2e756b83905dd89adb365a7f6e9c72578bcc1/pyarrow-23.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:cecfb12ef629cf6be0b1887f9f86463b0dd3dc3195ae6224e74006be4736035a", size = 27540749, upload-time = "2026-02-16T10:12:23.297Z" }, - { url = "https://files.pythonhosted.org/packages/a5/8e/38749c4b1303e6ae76b3c80618f84861ae0c55dd3c2273842ea6f8258233/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:29f7f7419a0e30264ea261fdc0e5fe63ce5a6095003db2945d7cd78df391a7e1", size = 34471544, upload-time = "2026-02-16T10:11:32.535Z" }, - { url = "https://files.pythonhosted.org/packages/a3/73/f237b2bc8c669212f842bcfd842b04fc8d936bfc9d471630569132dc920d/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:33d648dc25b51fd8055c19e4261e813dfc4d2427f068bcecc8b53d01b81b0500", size = 35949911, upload-time = "2026-02-16T10:11:39.813Z" }, - { url = "https://files.pythonhosted.org/packages/0c/86/b912195eee0903b5611bf596833def7d146ab2d301afeb4b722c57ffc966/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd395abf8f91c673dd3589cadc8cc1ee4e8674fa61b2e923c8dd215d9c7d1f41", size = 44520337, upload-time = "2026-02-16T10:11:47.764Z" }, - { url = "https://files.pythonhosted.org/packages/69/c2/f2a717fb824f62d0be952ea724b4f6f9372a17eed6f704b5c9526f12f2f1/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:00be9576d970c31defb5c32eb72ef585bf600ef6d0a82d5eccaae96639cf9d07", size = 47548944, upload-time = "2026-02-16T10:11:56.607Z" }, - { url = "https://files.pythonhosted.org/packages/84/a7/90007d476b9f0dc308e3bc57b832d004f848fd6c0da601375d20d92d1519/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c2139549494445609f35a5cda4eb94e2c9e4d704ce60a095b342f82460c73a83", size = 48236269, upload-time = "2026-02-16T10:12:04.47Z" }, - { url = 
"https://files.pythonhosted.org/packages/b0/3f/b16fab3e77709856eb6ac328ce35f57a6d4a18462c7ca5186ef31b45e0e0/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7044b442f184d84e2351e5084600f0d7343d6117aabcbc1ac78eb1ae11eb4125", size = 50604794, upload-time = "2026-02-16T10:12:11.797Z" }, - { url = "https://files.pythonhosted.org/packages/e9/a1/22df0620a9fac31d68397a75465c344e83c3dfe521f7612aea33e27ab6c0/pyarrow-23.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a35581e856a2fafa12f3f54fce4331862b1cfb0bef5758347a858a4aa9d6bae8", size = 27660642, upload-time = "2026-02-16T10:12:17.746Z" }, - { url = "https://files.pythonhosted.org/packages/8d/1b/6da9a89583ce7b23ac611f183ae4843cd3a6cf54f079549b0e8c14031e73/pyarrow-23.0.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:5df1161da23636a70838099d4aaa65142777185cc0cdba4037a18cee7d8db9ca", size = 34238755, upload-time = "2026-02-16T10:12:32.819Z" }, - { url = "https://files.pythonhosted.org/packages/ae/b5/d58a241fbe324dbaeb8df07be6af8752c846192d78d2272e551098f74e88/pyarrow-23.0.1-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:fa8e51cb04b9f8c9c5ace6bab63af9a1f88d35c0d6cbf53e8c17c098552285e1", size = 35847826, upload-time = "2026-02-16T10:12:38.949Z" }, - { url = "https://files.pythonhosted.org/packages/54/a5/8cbc83f04aba433ca7b331b38f39e000efd9f0c7ce47128670e737542996/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:0b95a3994f015be13c63148fef8832e8a23938128c185ee951c98908a696e0eb", size = 44536859, upload-time = "2026-02-16T10:12:45.467Z" }, - { url = "https://files.pythonhosted.org/packages/36/2e/c0f017c405fcdc252dbccafbe05e36b0d0eb1ea9a958f081e01c6972927f/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:4982d71350b1a6e5cfe1af742c53dfb759b11ce14141870d05d9e540d13bc5d1", size = 47614443, upload-time = "2026-02-16T10:12:55.525Z" }, - { url = 
"https://files.pythonhosted.org/packages/af/6b/2314a78057912f5627afa13ba43809d9d653e6630859618b0fd81a4e0759/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c250248f1fe266db627921c89b47b7c06fee0489ad95b04d50353537d74d6886", size = 48232991, upload-time = "2026-02-16T10:13:04.729Z" }, - { url = "https://files.pythonhosted.org/packages/40/f2/1bcb1d3be3460832ef3370d621142216e15a2c7c62602a4ea19ec240dd64/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5f4763b83c11c16e5f4c15601ba6dfa849e20723b46aa2617cb4bffe8768479f", size = 50645077, upload-time = "2026-02-16T10:13:14.147Z" }, - { url = "https://files.pythonhosted.org/packages/eb/3f/b1da7b61cd66566a4d4c8383d376c606d1c34a906c3f1cb35c479f59d1aa/pyarrow-23.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:3a4c85ef66c134161987c17b147d6bffdca4566f9a4c1d81a0a01cdf08414ea5", size = 28234271, upload-time = "2026-02-16T10:14:09.397Z" }, - { url = "https://files.pythonhosted.org/packages/b5/78/07f67434e910a0f7323269be7bfbf58699bd0c1d080b18a1ab49ba943fe8/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:17cd28e906c18af486a499422740298c52d7c6795344ea5002a7720b4eadf16d", size = 34488692, upload-time = "2026-02-16T10:13:21.541Z" }, - { url = "https://files.pythonhosted.org/packages/50/76/34cf7ae93ece1f740a04910d9f7e80ba166b9b4ab9596a953e9e62b90fe1/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:76e823d0e86b4fb5e1cf4a58d293036e678b5a4b03539be933d3b31f9406859f", size = 35964383, upload-time = "2026-02-16T10:13:28.63Z" }, - { url = "https://files.pythonhosted.org/packages/46/90/459b827238936d4244214be7c684e1b366a63f8c78c380807ae25ed92199/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a62e1899e3078bf65943078b3ad2a6ddcacf2373bc06379aac61b1e548a75814", size = 44538119, upload-time = "2026-02-16T10:13:35.506Z" }, - { url = 
"https://files.pythonhosted.org/packages/28/a1/93a71ae5881e99d1f9de1d4554a87be37da11cd6b152239fb5bd924fdc64/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:df088e8f640c9fae3b1f495b3c64755c4e719091caf250f3a74d095ddf3c836d", size = 47571199, upload-time = "2026-02-16T10:13:42.504Z" }, - { url = "https://files.pythonhosted.org/packages/88/a3/d2c462d4ef313521eaf2eff04d204ac60775263f1fb08c374b543f79f610/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:46718a220d64677c93bc243af1d44b55998255427588e400677d7192671845c7", size = 48259435, upload-time = "2026-02-16T10:13:49.226Z" }, - { url = "https://files.pythonhosted.org/packages/cc/f1/11a544b8c3d38a759eb3fbb022039117fd633e9a7b19e4841cc3da091915/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a09f3876e87f48bc2f13583ab551f0379e5dfb83210391e68ace404181a20690", size = 50629149, upload-time = "2026-02-16T10:13:57.238Z" }, - { url = "https://files.pythonhosted.org/packages/50/f2/c0e76a0b451ffdf0cf788932e182758eb7558953f4f27f1aff8e2518b653/pyarrow-23.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:527e8d899f14bd15b740cd5a54ad56b7f98044955373a17179d5956ddb93d9ce", size = 28365807, upload-time = "2026-02-16T10:14:03.892Z" }, -] - [[package]] name = "pygments" version = "2.19.2" From 611b3bb1a752ee18f19f26c019bce282c1168d50 Mon Sep 17 00:00:00 2001 From: Gerd Zellweger Date: Sat, 11 Apr 2026 21:56:33 -0700 Subject: [PATCH 14/19] Revert "[python] Add Arrow IPC query API to client and pipeline" This reverts commit c4c0fe26fb4b7b5da088045f50d20fa5b0351a48. 
--- python/feldera/pipeline.py | 33 +------------------ python/feldera/rest/feldera_client.py | 47 +-------------------------- 2 files changed, 2 insertions(+), 78 deletions(-) diff --git a/python/feldera/pipeline.py b/python/feldera/pipeline.py index 0f5ef098043..19ce9eb46a4 100644 --- a/python/feldera/pipeline.py +++ b/python/feldera/pipeline.py @@ -4,16 +4,7 @@ from collections import deque from datetime import datetime from threading import Event -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - Generator, - List, - Mapping, - Optional, -) +from typing import Any, Callable, Dict, Generator, List, Mapping, Optional from uuid import UUID import pandas @@ -45,9 +36,6 @@ from feldera.stats import InputEndpointStatus, OutputEndpointStatus, PipelineStatistics from feldera.types import CheckpointMetadata -if TYPE_CHECKING: - import pyarrow as pa - class Pipeline: def __init__(self, client: FelderaClient): @@ -989,25 +977,6 @@ def query_parquet(self, query: str, path: str): self.client.query_as_parquet(self.name, query, path) - def query_arrow(self, query: str) -> Generator["pa.RecordBatch", None, None]: - """ - Executes an ad-hoc SQL query on this pipeline and returns a generator - that yields the result as PyArrow RecordBatches. - - Note: - You can only ``SELECT`` from materialized tables and views. - - :param query: The SQL query to be executed. - - :raises FelderaAPIError: If the pipeline is not in a RUNNING or PAUSED - state. - :raises FelderaAPIError: If querying a non materialized table or view. - :raises FelderaAPIError: If the query is invalid. - - :return: A generator that yields ``pyarrow.RecordBatch`` objects. 
- """ - return self.client.query_as_arrow(self.name, query) - def query_tabular(self, query: str) -> Generator[str, None, None]: """ Executes a SQL query on this pipeline and returns the result as a diff --git a/python/feldera/rest/feldera_client.py b/python/feldera/rest/feldera_client.py index e9761fed485..7d20e9c878c 100644 --- a/python/feldera/rest/feldera_client.py +++ b/python/feldera/rest/feldera_client.py @@ -3,7 +3,7 @@ import pathlib import time from decimal import Decimal -from typing import TYPE_CHECKING, Any, Dict, Generator, Mapping, Optional +from typing import Any, Dict, Generator, Mapping, Optional from urllib.parse import quote import requests @@ -18,9 +18,6 @@ logger = logging.getLogger(__name__) -if TYPE_CHECKING: - import pyarrow as pa - def _validate_no_none_keys_in_map(data): def validate_no_none_keys(d: Dict[Any, Any]) -> None: @@ -41,17 +38,6 @@ def _prepare_boolean_input(value: bool) -> str: return "true" if value else "false" -def _import_pyarrow_ipc(): - try: - import pyarrow.ipc as ipc - except ImportError as exc: - raise ImportError( - "pyarrow is required for Arrow IPC queries. Install it with `pip install feldera[arrow]`." - ) from exc - - return ipc - - class FelderaClient: """ A client for the Feldera HTTP API. @@ -1231,37 +1217,6 @@ def query_as_parquet(self, pipeline_name: str, query: str, path: str): file.write(chunk) file.close() - def query_as_arrow( - self, pipeline_name: str, query: str - ) -> Generator["pa.RecordBatch", None, None]: - """ - Executes an ad-hoc query on the specified pipeline and returns the result - as a generator that yields PyArrow RecordBatches. - - :param pipeline_name: The name of the pipeline to query. - :param query: The SQL query to be executed. - :return: A generator that yields each query batch as a ``pyarrow.RecordBatch``. 
- """ - ipc = _import_pyarrow_ipc() - - params = { - "pipeline_name": pipeline_name, - "sql": query, - "format": "arrow_ipc", - } - resp: requests.Response = self.http.get( - path=f"/pipelines/{pipeline_name}/query", - params=params, - stream=True, - ) - - try: - with ipc.open_stream(resp.raw) as reader: - for batch in reader: - yield batch - finally: - resp.close() - def query_as_json( self, pipeline_name: str, query: str ) -> Generator[Mapping[str, Any], None, None]: From 7b150e504e6e45b268a853e939f3fa86cfcde5d4 Mon Sep 17 00:00:00 2001 From: Gerd Zellweger Date: Sat, 11 Apr 2026 21:56:34 -0700 Subject: [PATCH 15/19] Revert "[python] Add tests for Arrow IPC query results" This reverts commit 0be980450ad5ce67622ad79ba3dbee5c26813f7e. --- python/tests/platform/test_shared_pipeline.py | 39 +---- python/tests/unit/test_query_as_arrow.py | 145 ------------------ 2 files changed, 3 insertions(+), 181 deletions(-) delete mode 100644 python/tests/unit/test_query_as_arrow.py diff --git a/python/tests/platform/test_shared_pipeline.py b/python/tests/platform/test_shared_pipeline.py index 6b9cf86cbba..4a06db367be 100644 --- a/python/tests/platform/test_shared_pipeline.py +++ b/python/tests/platform/test_shared_pipeline.py @@ -1,4 +1,3 @@ -import gzip import io import json import os @@ -7,9 +6,9 @@ import time import unittest import zipfile +import gzip import pandas as pd -import pytest from feldera import Pipeline from feldera.enums import CompletionTokenStatus, PipelineFieldSelector, PipelineStatus @@ -168,36 +167,6 @@ def test_adhoc_query_json(self): got = list(resp) self.assertCountEqual(got, expected) - def test_adhoc_query_arrow(self): - pa = pytest.importorskip("pyarrow") - - data = "1\n2\n" - self.pipeline.start() - TEST_CLIENT.push_to_pipeline(self.pipeline.name, "tbl", "csv", data) - - expected_rows = list( - TEST_CLIENT.query_as_json( - self.pipeline.name, - "SELECT * FROM tbl ORDER BY id", - ) - ) - expected_ids = [row["id"] for row in expected_rows] - - 
batches_client = list( - TEST_CLIENT.query_as_arrow( - self.pipeline.name, - "SELECT * FROM tbl ORDER BY id", - ) - ) - table_client = pa.Table.from_batches(batches_client) - assert table_client.column("id").to_pylist() == expected_ids - - batches_pipeline = list( - self.pipeline.query_arrow("SELECT * FROM tbl ORDER BY id") - ) - table_pipeline = pa.Table.from_batches(batches_pipeline) - assert table_pipeline.column("id").to_pylist() == expected_ids - def test_local(self): """ CREATE TABLE students ( @@ -378,10 +347,8 @@ def test_failed_pipeline_stop(self): self.pipeline.input_json("tbl", data, wait=False) wait_for_condition( "pipeline stops with deployment error after worker panic", - lambda: ( - self.pipeline.status() == PipelineStatus.STOPPED - and len(self.pipeline.deployment_error()) > 0 - ), + lambda: self.pipeline.status() == PipelineStatus.STOPPED + and len(self.pipeline.deployment_error()) > 0, timeout_s=20.0, poll_interval_s=1.0, ) diff --git a/python/tests/unit/test_query_as_arrow.py b/python/tests/unit/test_query_as_arrow.py deleted file mode 100644 index b90d228c12a..00000000000 --- a/python/tests/unit/test_query_as_arrow.py +++ /dev/null @@ -1,145 +0,0 @@ -"""Unit tests for FelderaClient.query_as_arrow and Pipeline.query_arrow.""" - -import builtins -import io -import sys -from unittest.mock import MagicMock - -import pytest - -from feldera.rest.feldera_client import FelderaClient - - -def _import_arrow_modules(): - pa = pytest.importorskip("pyarrow") - ipc = pytest.importorskip("pyarrow.ipc") - return pa, ipc - - -def _make_ipc_bytes(table) -> bytes: - """Serialise a ``pyarrow.Table`` to Arrow IPC stream bytes.""" - _, ipc = _import_arrow_modules() - buf = io.BytesIO() - with ipc.new_stream(buf, table.schema) as writer: - if table.num_rows > 0: - writer.write_table(table) - return buf.getvalue() - - -def _mock_response(ipc_bytes: bytes) -> MagicMock: - """Return a mock response whose ``raw`` is an Arrow IPC byte stream.""" - resp = MagicMock() - 
resp.raw = io.BytesIO(ipc_bytes) - return resp - - -@pytest.fixture() -def client() -> FelderaClient: - """A ``FelderaClient`` with a mocked HTTP layer (no real network calls).""" - c = FelderaClient.__new__(FelderaClient) - c.http = MagicMock() - return c - - -class TestQueryAsArrow: - def test_non_empty_result_yields_correct_data(self, client: FelderaClient): - pa, _ = _import_arrow_modules() - schema = pa.schema([("id", pa.int64()), ("name", pa.utf8())]) - expected = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]}, schema=schema) - client.http.get.return_value = _mock_response(_make_ipc_bytes(expected)) - - batches = list(client.query_as_arrow("my_pipeline", "SELECT id, name FROM t")) - result = pa.Table.from_batches(batches, schema=schema) - - assert len(batches) > 0 - assert result.schema == schema - assert result.num_rows == 3 - assert result.column("id").to_pylist() == [1, 2, 3] - assert result.column("name").to_pylist() == ["a", "b", "c"] - - def test_http_called_with_correct_params(self, client: FelderaClient): - pa, _ = _import_arrow_modules() - schema = pa.schema([("id", pa.int64())]) - table = pa.table({"id": [42]}, schema=schema) - client.http.get.return_value = _mock_response(_make_ipc_bytes(table)) - - list(client.query_as_arrow("my_pipeline", "SELECT id FROM t")) - - client.http.get.assert_called_once_with( - path="/pipelines/my_pipeline/query", - params={ - "pipeline_name": "my_pipeline", - "sql": "SELECT id FROM t", - "format": "arrow_ipc", - }, - stream=True, - ) - - def test_empty_result_yields_no_batches(self, client: FelderaClient): - pa, _ = _import_arrow_modules() - schema = pa.schema([("id", pa.int64()), ("value", pa.float64())]) - empty = pa.table( - { - "id": pa.array([], type=pa.int64()), - "value": pa.array([], type=pa.float64()), - }, - schema=schema, - ) - client.http.get.return_value = _mock_response(_make_ipc_bytes(empty)) - - result_batches = list( - client.query_as_arrow("my_pipeline", "SELECT id, value FROM t WHERE false") - 
) - - assert result_batches == [] - - def test_missing_pyarrow_raises_helpful_import_error( - self, client: FelderaClient, monkeypatch - ): - real_import = builtins.__import__ - - def _import(name, globals=None, locals=None, fromlist=(), level=0): - if name == "pyarrow" or name.startswith("pyarrow."): - raise ImportError("No module named 'pyarrow'") - return real_import(name, globals, locals, fromlist, level) - - monkeypatch.delitem(sys.modules, "pyarrow", raising=False) - monkeypatch.delitem(sys.modules, "pyarrow.ipc", raising=False) - monkeypatch.setattr(builtins, "__import__", _import) - - with pytest.raises(ImportError, match="pip install feldera\\[arrow\\]"): - next(client.query_as_arrow("my_pipeline", "SELECT 1")) - - client.http.get.assert_not_called() - - def test_response_closed_after_full_consumption(self, client: FelderaClient): - pa, _ = _import_arrow_modules() - schema = pa.schema([("id", pa.int64())]) - table = pa.table({"id": [1, 2]}, schema=schema) - resp = _mock_response(_make_ipc_bytes(table)) - client.http.get.return_value = resp - - list(client.query_as_arrow("my_pipeline", "SELECT id FROM t")) - - resp.close.assert_called_once() - - -class TestPipelineQueryArrow: - def test_query_arrow_delegates_to_client(self): - """Pipeline.query_arrow must forward to client.query_as_arrow.""" - from feldera.pipeline import Pipeline - - pipeline = Pipeline.__new__(Pipeline) - pipeline._inner = MagicMock() - pipeline._inner.name = "pipe1" - pipeline.client = MagicMock() - - expected = object() - pipeline.client.query_as_arrow.return_value = expected - - result = pipeline.query_arrow("SELECT x FROM v") - - pipeline.client.query_as_arrow.assert_called_once_with( - "pipe1", "SELECT x FROM v" - ) - assert result is expected From dbb8fbad4deed94983360226236dcb54e6a2ca4e Mon Sep 17 00:00:00 2001 From: Mihai Budiu Date: Sat, 11 Apr 2026 05:55:40 -0700 Subject: [PATCH 16/19] [SQL] Fix crash in waterline computation Signed-off-by: Mihai Budiu --- 
.../outer/monotonicity/InsertLimiters.java | 7 ++- .../simple/IncrementalRegression2Tests.java | 49 +++++++++++++++++++ 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/sql-to-dbsp-compiler/SQL-compiler/src/main/java/org/dbsp/sqlCompiler/compiler/visitors/outer/monotonicity/InsertLimiters.java b/sql-to-dbsp-compiler/SQL-compiler/src/main/java/org/dbsp/sqlCompiler/compiler/visitors/outer/monotonicity/InsertLimiters.java index 60f51a0bbea..33680492eaa 100644 --- a/sql-to-dbsp-compiler/SQL-compiler/src/main/java/org/dbsp/sqlCompiler/compiler/visitors/outer/monotonicity/InsertLimiters.java +++ b/sql-to-dbsp-compiler/SQL-compiler/src/main/java/org/dbsp/sqlCompiler/compiler/visitors/outer/monotonicity/InsertLimiters.java @@ -1280,7 +1280,7 @@ public void postorder(DBSPJoinFilterMapOperator join) { List monotoneFields = new ArrayList<>(); for (int varIndex = 0, field = 0; field < leftValueSize; field++) { int firstOutputField = iomap.firstOutputField(1, field); - Utilities.enforce(firstOutputField >= 0); + if (firstOutputField < 0) continue; // Field not used in the output IMaybeMonotoneType compareField = filterTuple.getField(firstOutputField); value.add(compareField); if (compareField.mayBeMonotone()) { @@ -1305,7 +1305,6 @@ public void postorder(DBSPJoinFilterMapOperator join) { } } - // Exact same procedure on the right hand side OutputPort rightLimiter = null; DBSPSimpleOperator rightMonotone = null; if (expansion.rightMap != null) { @@ -1330,7 +1329,7 @@ public void postorder(DBSPJoinFilterMapOperator join) { int varIndex = 0; for (int field = 0; field < leftValueSize; field++) { int firstOutputField = iomap.firstOutputField(1, field); - Utilities.enforce(firstOutputField >= 0); + if (firstOutputField < 0) continue; // field not used in the output IMaybeMonotoneType compareField = filterTuple.getField(firstOutputField); if (compareField.mayBeMonotone()) { varIndex++; @@ -1339,7 +1338,7 @@ public void postorder(DBSPJoinFilterMapOperator join) { for (int 
field = 0; field < rightValueSize; field++) { int firstOutputField = iomap.firstOutputField(2, field); - Utilities.enforce(firstOutputField >= 0); + if (firstOutputField < 0) continue; // field not used in the output IMaybeMonotoneType compareField = filterTuple.getField(firstOutputField); value.add(compareField); if (compareField.mayBeMonotone()) { diff --git a/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/simple/IncrementalRegression2Tests.java b/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/simple/IncrementalRegression2Tests.java index 045f495cac5..9e50d0c07f9 100644 --- a/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/simple/IncrementalRegression2Tests.java +++ b/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/simple/IncrementalRegression2Tests.java @@ -17,6 +17,7 @@ public CompilerOptions testOptions() { options.languageOptions.incrementalize = true; options.languageOptions.optimizationLevel = 2; options.languageOptions.ignoreOrderBy = true; + options.ioOptions.quiet = false; return options; } @@ -563,4 +564,52 @@ public void endVisit() { } }); } + + @Test + public void failInLateness() { + this.getCC(""" + CREATE TABLE ID_0 ( + ID_1 VARCHAR(90) NOT NULL PRIMARY KEY, + ID_2 INTEGER NOT NULL, + ID_3 VARCHAR(60) NOT NULL, + ID_6 TIMESTAMP NOT NULL, + ID_7 VARCHAR(60) NOT NULL + ); + CREATE TABLE ID_20 ( + ID_1 VARCHAR(110) NOT NULL PRIMARY KEY, + ID_2 SMALLINT NOT NULL, + ID_21 VARCHAR(64), + ID_22 DECIMAL(38, 0), + ID_23 INTEGER, + ID_24 BIGINT NOT NULL PRIMARY KEY LATENESS 216000 :: BIGINT, + ID_25 TIMESTAMP NOT NULL + ); + + CREATE LOCAL VIEW ID_26 AS + SELECT ID_2, ID_3 AS ID_21, CAST(ID_7 AS BIGINT UNSIGNED) AS ID_27, ID_6 AS ID_28 + FROM ID_0; + + CREATE LOCAL VIEW ID_36 AS + SELECT + ID_30.ID_2, + ID_30.ID_21, + SUM(CASE WHEN ID_30.ID_23 = 1 THEN ID_30.ID_22 ELSE 0 END) AS ID_37, + SUM(CASE WHEN ID_30.ID_23 = 2 THEN 
ID_30.ID_22 ELSE 0 END) AS ID_38, + MAX(ID_30.ID_24) AS ID_33, MAX(ID_30.ID_25) AS ID_34 + FROM ID_20 AS ID_30 + INNER JOIN ID_26 AS ID_35 ON ID_30.ID_2 = ID_35.ID_2 AND ID_30.ID_21 = ID_35.ID_21 + WHERE ID_30.ID_25 > ID_35.ID_28 + GROUP BY ID_30.ID_2, ID_30.ID_21; + + CREATE VIEW ID_39 AS + SELECT + ID_35.ID_2, + ID_35.ID_21, + COALESCE(ID_41.ID_37, 0) AS ID_31, + COALESCE(ID_41.ID_38, 0) AS ID_32, + COALESCE(ID_41.ID_33, 0) AS ID_33, + ID_35.ID_27 + COALESCE(ID_41.ID_37, 0) - COALESCE(ID_41.ID_38, 0) AS ID_40 + FROM ID_26 AS ID_35 + LEFT JOIN ID_36 AS ID_41 ON ID_35.ID_2 = ID_41.ID_2 AND ID_35.ID_21 = ID_41.ID_21;"""); + } } From 77083a511fd095157422099aae8136bd326b6737 Mon Sep 17 00:00:00 2001 From: Mihai Budiu Date: Sat, 11 Apr 2026 07:18:20 -0700 Subject: [PATCH 17/19] [SQL] Fix incorrect documentation for TO_HEX function Signed-off-by: Mihai Budiu --- .../frontend/calciteCompiler/CalciteFunctions.java | 3 ++- .../frontend/calciteCompiler/CustomFunctions.java | 11 +++++++++-- .../compiler/sql/simple/Regression2Tests.java | 9 +++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/sql-to-dbsp-compiler/SQL-compiler/src/main/java/org/dbsp/sqlCompiler/compiler/frontend/calciteCompiler/CalciteFunctions.java b/sql-to-dbsp-compiler/SQL-compiler/src/main/java/org/dbsp/sqlCompiler/compiler/frontend/calciteCompiler/CalciteFunctions.java index 67cea5a32fd..dcccee05ad6 100644 --- a/sql-to-dbsp-compiler/SQL-compiler/src/main/java/org/dbsp/sqlCompiler/compiler/frontend/calciteCompiler/CalciteFunctions.java +++ b/sql-to-dbsp-compiler/SQL-compiler/src/main/java/org/dbsp/sqlCompiler/compiler/frontend/calciteCompiler/CalciteFunctions.java @@ -370,7 +370,8 @@ record Func(SqlOperator function, String functionName, SqlLibrary library, new Func(SqlLibraryOperators.LENGTH, "LENGTH", SqlLibrary.POSTGRESQL, "string#char_length", FunctionDocumentation.NO_FILE, false), new Func(SqlLibraryOperators.SUBSTR_BIG_QUERY, "SUBSTR", SqlLibrary.BIG_QUERY, "string#substr", 
FunctionDocumentation.NO_FILE, false), new Func(SqlLibraryOperators.SPLIT, "SPLIT", SqlLibrary.BIG_QUERY, "string#split", FunctionDocumentation.NO_FILE, false), - new Func(SqlLibraryOperators.SPLIT_PART, "SPLIT_PART", SqlLibrary.POSTGRESQL, "string#split_part", FunctionDocumentation.NO_FILE, false), + // https://issues.apache.org/jira/browse/CALCITE-7468 + // new Func(SqlLibraryOperators.SPLIT_PART, "SPLIT_PART", SqlLibrary.POSTGRESQL, "string#split_part", FunctionDocumentation.NO_FILE, false), new Func(SqlLibraryOperators.GREATEST, "GREATEST", SqlLibrary.BIG_QUERY, "comparisons#greatest", FunctionDocumentation.NO_FILE, false), new Func(SqlLibraryOperators.LEAST, "LEAST", SqlLibrary.BIG_QUERY, "comparisons#least", FunctionDocumentation.NO_FILE, false), new Func(SqlLibraryOperators.SAFE_CAST, "SAFE_CAST", SqlLibrary.BIG_QUERY, "casts#safe-casts", FunctionDocumentation.NO_FILE, false), diff --git a/sql-to-dbsp-compiler/SQL-compiler/src/main/java/org/dbsp/sqlCompiler/compiler/frontend/calciteCompiler/CustomFunctions.java b/sql-to-dbsp-compiler/SQL-compiler/src/main/java/org/dbsp/sqlCompiler/compiler/frontend/calciteCompiler/CustomFunctions.java index 84fb9ebc512..5490000aa23 100644 --- a/sql-to-dbsp-compiler/SQL-compiler/src/main/java/org/dbsp/sqlCompiler/compiler/frontend/calciteCompiler/CustomFunctions.java +++ b/sql-to-dbsp-compiler/SQL-compiler/src/main/java/org/dbsp/sqlCompiler/compiler/frontend/calciteCompiler/CustomFunctions.java @@ -66,8 +66,10 @@ public CustomFunctions() { this.functions.add(new ArrayTransformFunction()); this.functions.add(new ArrayUnion()); this.functions.add(new ArraysOverlapFunction()); + this.functions.add(new Bin2Utf8Function()); this.functions.add(new BlackboxFunction()); this.functions.add(new BroundFunction()); + this.functions.add(new ConnectorMetadataFunction()); this.functions.add(new ConvertTimezoneFunction()); this.functions.add(new FormatDateFunction()); this.functions.add(new FormatTimestampFunction()); @@ -81,12 +83,11 @@ 
public CustomFunctions() { this.functions.add(new ParseJsonFunction()); this.functions.add(new ParseTimeFunction()); this.functions.add(new ParseTimestampFunction()); - this.functions.add(new Bin2Utf8Function()); this.functions.add(new RlikeFunction()); this.functions.add(new SequenceFunction()); + this.functions.add(new SplitPartFunction()); this.functions.add(new ToIntFunction()); this.functions.add(new ToJsonFunction()); - this.functions.add(new ConnectorMetadataFunction()); this.functions.add(new WriteLogFunction()); this.udf = new HashMap<>(); this.aggregates = new HashMap<>(); @@ -194,6 +195,12 @@ private FormatTimestampFunction() { } } + static class SplitPartFunction extends CalciteFunctionClone { + private SplitPartFunction() { + super(SqlLibraryOperators.SPLIT_PART, "string#split_part", FunctionDocumentation.NO_FILE); + } + } + static class FormatTimeFunction extends CalciteFunctionClone { private FormatTimeFunction() { super(SqlLibraryOperators.FORMAT_TIME, "datetime#format_time", FunctionDocumentation.NO_FILE); diff --git a/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/simple/Regression2Tests.java b/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/simple/Regression2Tests.java index cb7470213f8..fc8df9e2e0a 100644 --- a/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/simple/Regression2Tests.java +++ b/sql-to-dbsp-compiler/SQL-compiler/src/test/java/org/dbsp/sqlCompiler/compiler/sql/simple/Regression2Tests.java @@ -919,4 +919,13 @@ public void issue5981() { 0abc (1 row)"""); } + + @Test + public void issue5982() { + this.q(""" + SELECT SPLIT_PART('11.12.13', '.', 2); + r + ---- + 12"""); + } } From ec739d981ddd2c7c67cb229a0dac49373f61e549 Mon Sep 17 00:00:00 2001 From: Jyotshna Yaparla Date: Sun, 12 Apr 2026 12:09:23 -0400 Subject: [PATCH 18/19] ci: abort Docker build job when workflow is cancelled --- .github/workflows/ci.yml | 5 ----- 1 file changed, 5 
deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7165ec1a008..466724291ca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -56,11 +56,6 @@ jobs: invoke-build-docker: name: Build Docker needs: [invoke-build-rust, invoke-build-java, invoke-tests-web-console-unit] - if: | - always() && - (needs.invoke-build-rust.result == 'success' || needs.invoke-build-rust.result == 'skipped') && - (needs.invoke-build-java.result == 'success' || needs.invoke-build-java.result == 'skipped') && - (needs.invoke-tests-web-console-unit.result == 'success' || needs.invoke-tests-web-console-unit.result == 'skipped') uses: ./.github/workflows/build-docker.yml secrets: inherit From c0f7eae9bde9b0856aa871c8fac719b8872e30a6 Mon Sep 17 00:00:00 2001 From: Leonid Ryzhyk Date: Wed, 8 Apr 2026 17:33:44 -0700 Subject: [PATCH 19/19] [adapters] Delta input: revamp error handling and retry logic. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The connector already had retry logic in some places, but mostly relied on delta-rs for retries. This wasn't always enough and we saw timeouts and expired token errors bubbling up. This commit adds retry loops around all object store accesses. The loops are controlled by the new `max_retries` setting, similar to the output connector. By default, it will retry forever. The retry loops set health status to UNHEALTHY while retrying. If the pipeline is stopped and restarted during a retry, the connector resumes from the last successfully ingested table version. After exhausting retry attempts the connector fails permanently with a fatal error, which eliminates the possibility of data loss. There is an important caveat: Because retries may occur after partial progress (e.g., after partially processing a Delta log entry), the same data may be ingested more than once. This is consistent with the connector’s at-least-once delivery guarantee. 
Signed-off-by: Leonid Ryzhyk --- crates/adapterlib/src/transport.rs | 4 + crates/adapters/src/controller.rs | 5 + .../src/integrated/delta_table/input.rs | 487 +++++++++++++----- .../src/integrated/delta_table/test.rs | 152 +++++- .../adapters/src/test/mock_input_consumer.rs | 3 + .../adapters/src/transport/kafka/ft/test.rs | 3 + .../src/transport/delta_table.rs | 20 + docs.feldera.com/docs/changelog.md | 15 +- .../docs/connectors/sources/delta.md | 36 ++ openapi.json | 7 + 10 files changed, 598 insertions(+), 134 deletions(-) diff --git a/crates/adapterlib/src/transport.rs b/crates/adapterlib/src/transport.rs index d2cc203a96a..1c357dd6ad5 100644 --- a/crates/adapterlib/src/transport.rs +++ b/crates/adapterlib/src/transport.rs @@ -1,6 +1,7 @@ use anyhow::{Error as AnyError, Result as AnyResult}; use chrono::{DateTime, Utc}; use dyn_clone::DynClone; +use feldera_types::adapter_stats::ConnectorHealth; use feldera_types::config::FtModel; use feldera_types::program_schema::Relation; use rmpv::{Value as RmpValue, ext::Error as RmpDecodeError}; @@ -773,6 +774,9 @@ pub trait InputConsumer: Send + Sync + DynClone { /// Optional tag that can be used for additional context /// e.g. for rate limiting fn error(&self, fatal: bool, error: AnyError, tag: Option<&'static str>); + + /// Updates the health status of the connector. + fn update_connector_health(&self, health: ConnectorHealth); } /// Information needed to restart after or replay input. 
diff --git a/crates/adapters/src/controller.rs b/crates/adapters/src/controller.rs index 33575fec75b..fc6e4399357 100644 --- a/crates/adapters/src/controller.rs +++ b/crates/adapters/src/controller.rs @@ -7096,6 +7096,11 @@ impl InputConsumer for InputProbe { .status .set_custom_metrics(self.endpoint_id, metrics); } + + fn update_connector_health(&self, health: ConnectorHealth) { + self.controller + .update_input_connector_health(self.endpoint_id, health); + } } /// An output probe inserted between the encoder and the output transport diff --git a/crates/adapters/src/integrated/delta_table/input.rs b/crates/adapters/src/integrated/delta_table/input.rs index 0d276f6fe24..d5132ed7c24 100644 --- a/crates/adapters/src/integrated/delta_table/input.rs +++ b/crates/adapters/src/integrated/delta_table/input.rs @@ -33,14 +33,18 @@ use feldera_adapterlib::utils::datafusion::{ validate_sql_expression, validate_timestamp_column, }; use feldera_storage::tokio::TOKIO_DEDICATED_IO; +use feldera_types::adapter_stats::ConnectorHealth; use feldera_types::config::FtModel; use feldera_types::program_schema::Relation; use feldera_types::transport::delta_table::{DeltaTableReaderConfig, DeltaTableTransactionMode}; use futures_util::StreamExt; +use rand::Rng; use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; use std::cmp::min; use std::collections::{BTreeSet, HashMap}; +use std::fmt::Display; +use std::future::Future; use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; use std::thread; @@ -56,12 +60,18 @@ use url::Url; const POLL_INTERVAL: Duration = Duration::from_millis(1000); /// Calculate exponential backoff delay for retrying delta log reads. -/// Starts at 0.5s, doubles each retry, caps at 32s. +/// Starts at 0.5s, doubles each retry, caps at 32s, plus uniform jitter up to 25% of that delay +/// (capped at `max_delay_ms`) to reduce synchronized retries. 
fn calculate_backoff_delay(retry_count: u32) -> Duration { - let base_delay_ms = 500; // 0.5 seconds - let max_delay_ms = 32_000; // 32 seconds - let delay_ms = std::cmp::min(base_delay_ms << retry_count, max_delay_ms); - Duration::from_millis(delay_ms) + let base_delay_ms: u64 = 500; // 0.5 seconds + let max_delay_ms: u64 = 32_000; // 32 seconds + let delay_ms = min( + base_delay_ms.checked_shl(retry_count).unwrap_or(u64::MAX), + max_delay_ms, + ); + let jitter_span = (delay_ms / 4).max(1); + let jitter_ms = rand::thread_rng().gen_range(0..jitter_span); + Duration::from_millis(min(delay_ms + jitter_ms, max_delay_ms)) } /// Default object store timeout. When not explicitly set by the user, @@ -83,7 +93,7 @@ static DELTA_READER_SEMAPHORE: std::sync::LazyLock = /// Used to detect conflicting values of `max_concurrent_readers`. static MAX_CONCURRENT_READERS_SET: AtomicBool = AtomicBool::new(false); -/// Takes a column name from a DeltaLake schema and returns a qouted string +/// Takes a column name from a DeltaLake schema and returns a quoted string /// that can be used in datafusion queries like `select "foo""bar" from my_table`. fn quote_sql_identifier>(ident: S) -> String { format!("\"{}\"", ident.as_ref().replace("\"", "\"\"")) @@ -274,9 +284,11 @@ impl DeltaTableInputReader { // status for the frontier will be set to the current time instead of whenever the table version in resume_info // was actually processed. The right solution is to checkpoint the frontier with the connector. if resume_info.is_some() { - endpoint - .queue - .push_with_aux((None, Vec::new()), Utc::now(), resume_info); + endpoint.queue.push_with_aux( + (None, Vec::new()), + Utc::now(), + QueueEntry::ResumeInfo(resume_info), + ); } if eoi { @@ -347,18 +359,34 @@ impl InputReader for DeltaTableInputReader { checkpoint_requested, } => { // When initiating a checkpoint, try to stop at a delta table transaction boundary. 
- let stop_at: &dyn Fn(&Option) -> bool = if checkpoint_requested { - &|resume_info: &Option| resume_info.is_some() + let stop_at: &dyn Fn(&QueueEntry) -> bool = if checkpoint_requested { + &|entry: &QueueEntry| { + matches!( + entry, + QueueEntry::ResumeInfo(Some(_)) | QueueEntry::Rollback + ) + } } else { - &|_: &Option| false + &|_: &QueueEntry| false }; let (total, _, resume_info) = self.inner.queue.flush_with_aux_until(stop_at); - let resume_status = resume_info - .last() - .map(|(_ts, resume_info)| resume_info.clone()) - .unwrap_or_else(|| self.inner.last_resume_status.lock().unwrap().clone()); + let resume_status = match resume_info.last() { + None => self.inner.last_resume_status.lock().unwrap().clone(), + Some((_ts, QueueEntry::ResumeInfo(resume_info))) => resume_info.clone(), + Some((_ts, QueueEntry::Rollback)) => Some( + self.inner + .last_checkpointable_status + .lock() + .unwrap() + .clone(), + ), + }; + *self.inner.last_resume_status.lock().unwrap() = resume_status.clone(); + if let Some(resume_status) = &resume_status { + *self.inner.last_checkpointable_status.lock().unwrap() = resume_status.clone(); + } let resume = match resume_status { None => Resume::Barrier, @@ -371,11 +399,12 @@ impl InputReader for DeltaTableInputReader { Some(resume), resume_info .into_iter() - .map(|(timestamp, metadata)| { - Watermark::new( + .filter_map(|(timestamp, metadata)| match metadata { + QueueEntry::ResumeInfo(resume_info) => Some(Watermark::new( timestamp, - metadata.map(|m| serde_json::to_value(m).unwrap()), - ) + resume_info.map(|m| serde_json::to_value(m).unwrap()), + )), + QueueEntry::Rollback => None, }) .collect(), ); @@ -550,10 +579,28 @@ struct DeltaTableInputEndpointInner { /// * Updated to `Some(new_version)` after advancing to the next table version in the transaction log /// in follow mode or after ingesting the initial snapshot. last_resume_status: Mutex>, - queue: Arc, StagedInputBuffer>>, + + /// The latest checkpointable status of this endpoint. 
+ last_checkpointable_status: Mutex, + + queue: Arc>, metrics: Arc, } +#[derive(Debug, Clone)] +enum QueueEntry { + /// Resume info for the connector after processing this queue entry. + ResumeInfo(Option), + + /// Sent after failing to read a delta log entry, before retrying or + /// declaring a fatal error. Makes sure that the connector can be checkpointed + /// between retries. + /// + /// Note: this is not the actual transaction rollback: the current transaction, + /// if any, will be committed. + Rollback, +} + impl DeltaTableInputEndpointInner { fn new( endpoint_name: &str, @@ -573,6 +620,8 @@ impl DeltaTableInputEndpointInner { let metrics = Arc::new(DeltaTableMetrics::new()); consumer.set_custom_metrics(Arc::clone(&metrics) as Arc); + let resume_status = resume_info.unwrap_or_else(DeltaResumeInfo::initial); + Self { endpoint_name: endpoint_name.to_string(), schema, @@ -582,9 +631,9 @@ impl DeltaTableInputEndpointInner { transaction_index: AtomicUsize::new(0), // Set version to None by default so that the connector is checkpointable in the initial state. - last_resume_status: Mutex::new(Some( - resume_info.unwrap_or_else(DeltaResumeInfo::initial), - )), + last_resume_status: Mutex::new(Some(resume_status.clone())), + + last_checkpointable_status: Mutex::new(resume_status), queue, metrics, } @@ -737,12 +786,16 @@ impl DeltaTableInputEndpointInner { /// Load the entire table snapshot as a single "select * where " query. /// Returns the total number of records processed. + /// + /// Fails with an error if the function fails to read the snapshot. This function + /// doesn't retry (the idea being that the snapshot can be large, and it's better to + /// fail fast and give the user a chance to restart the pipeline). 
async fn read_unordered_snapshot( &self, table: &DeltaTable, input_stream: &mut dyn ArrowStream, receiver: &mut Receiver, - ) -> usize { + ) -> AnyResult { let column_names = self.used_column_list(table); let mut snapshot_query = format!("select {column_names} from snapshot"); @@ -759,8 +812,14 @@ impl DeltaTableInputEndpointInner { let timestamp = Utc::now(); let record_count = self - .execute_snapshot_query(&snapshot_query, "initial snapshot", input_stream, receiver) - .await; + .execute_snapshot_query( + &snapshot_query, + "initial snapshot", + input_stream, + receiver, + self.config.max_retries(), + ) + .await?; self.metrics .snapshot_records_total .fetch_add(record_count as u64, Ordering::Relaxed); @@ -769,11 +828,11 @@ impl DeltaTableInputEndpointInner { self.queue.push_entry( InputQueueEntry::new_with_aux( timestamp, - Some(DeltaResumeInfo::follow_mode( + QueueEntry::ResumeInfo(Some(DeltaResumeInfo::follow_mode( // We verified that the table version is not None in the open_table method. table.version().unwrap(), !self.config.follow(), - )), + ))), ) // If we started a transaction while processing the snapshot, commit it now. .with_commit_transaction(true), @@ -787,38 +846,37 @@ impl DeltaTableInputEndpointInner { record_count, table.version().unwrap() ); - record_count + Ok(record_count) } /// Load the initial snapshot by issuing a sequence of queries for monotonically /// increasing timestamp ranges. /// Returns the total number of records processed. + /// + /// Fails with an error if the function fails to complete one of the range queries + /// after retrying `self.config.max_retries` times. async fn read_ordered_snapshot( &self, table: &DeltaTable, input_stream: &mut dyn ArrowStream, receiver: &mut Receiver, - ) -> usize { + ) -> AnyResult { // Use the time when we started reading the snapshot as the ingestion timestamp for the snapshot. 
let timestamp = Utc::now(); let total_records = self .read_ordered_snapshot_inner(table, input_stream, receiver) - .await - .unwrap_or_else(|e| { - self.consumer.error(true, e, None); - 0 - }); + .await?; // Empty buffer to indicate checkpointable state. self.queue.push_entry( InputQueueEntry::new_with_aux( timestamp, - Some(DeltaResumeInfo::follow_mode( + QueueEntry::ResumeInfo(Some(DeltaResumeInfo::follow_mode( // We verified that the table version is not None in the open_table method. table.version().unwrap(), !self.config.follow(), - )), + ))), ) // If we started a transaction while processing the snapshot, commit it now. .with_commit_transaction(true), @@ -831,7 +889,7 @@ impl DeltaTableInputEndpointInner { total_records, table.version().unwrap() ); - total_records + Ok(total_records) } async fn read_ordered_snapshot_inner( @@ -940,8 +998,14 @@ impl DeltaTableInputEndpointInner { } let range_record_count = self - .execute_snapshot_query(&range_query, "range", input_stream, receiver) - .await; + .execute_snapshot_query( + &range_query, + "range", + input_stream, + receiver, + self.config.max_retries(), + ) + .await?; self.metrics .snapshot_records_total .fetch_add(range_record_count as u64, Ordering::Relaxed); @@ -962,11 +1026,11 @@ impl DeltaTableInputEndpointInner { self.queue.push_entry( InputQueueEntry::new_with_aux( Utc::now(), - Some(DeltaResumeInfo::snapshot_mode( + QueueEntry::ResumeInfo(Some(DeltaResumeInfo::snapshot_mode( // We verified that the table version is not None in the open_table method. table.version().unwrap(), &start, - )), + ))), ) // If we started a transaction while processing the range query, commit it now. .with_commit_transaction(true), @@ -977,6 +1041,55 @@ impl DeltaTableInputEndpointInner { Ok(total_records) } + /// Runs `operation` until it succeeds or [`DeltaTableReaderConfig::max_retries`] is exhausted. 
+ /// + /// On failure before the limit: sets connector health to unhealthy, logs a warning with + /// `description` as the message prefix, sleeps using [`calculate_backoff_delay`], then retries. + /// On final failure: updates health, invokes [`InputConsumer::error`], and returns `Err`. + async fn retry( + &self, + description: &str, + error_tag: Option<&'static str>, + mut operation: F, + ) -> Result + where + F: FnMut() -> Fut, + Fut: Future>, + E: Display, + { + let max_retries = self.config.max_retries(); + let mut retry_count = 0u32; + loop { + match operation().await { + Ok(value) => { + self.consumer + .update_connector_health(ConnectorHealth::healthy()); + return Ok(value); + } + Err(e) => { + retry_count += 1; + if retry_count - 1 == max_retries { + let message = format!("{description} after {retry_count} attempts: {e}"); + self.consumer + .update_connector_health(ConnectorHealth::unhealthy(&message)); + self.consumer + .error(true, anyhow!(message.clone()), error_tag); + return Err(anyhow!(message)); + } + let backoff_delay = calculate_backoff_delay(retry_count - 1); + let message = format!( + "{description} after {retry_count} attempts: {e}; retrying in {:?}", + backoff_delay + ); + self.consumer + .update_connector_health(ConnectorHealth::unhealthy(&message)); + warn!("delta_table {}: {message}", &self.endpoint_name); + sleep(backoff_delay).await; + } + } + } + } + async fn worker_task_inner( self: Arc, mut input_stream: Box, @@ -1032,19 +1145,28 @@ impl DeltaTableInputEndpointInner { }) ); - let mut snapshot_record_count = 0usize; - - if snapshot_incomplete && self.config.snapshot() && self.config.timestamp_column.is_none() { + let snapshot_record_count = if snapshot_incomplete + && self.config.snapshot() + && self.config.timestamp_column.is_none() + { // Read snapshot chunk-by-chunk. 
- snapshot_record_count = self - .read_unordered_snapshot(&table, input_stream.as_mut(), &mut receiver) - .await; + self.read_unordered_snapshot(&table, input_stream.as_mut(), &mut receiver) + .await } else if snapshot_incomplete && self.config.snapshot() { // Read the entire snapshot in one query. - snapshot_record_count = self - .read_ordered_snapshot(&table, input_stream.as_mut(), &mut receiver) - .await; - } + self.read_ordered_snapshot(&table, input_stream.as_mut(), &mut receiver) + .await + } else { + Ok(0) + }; + + let snapshot_record_count = match snapshot_record_count { + Ok(snapshot_record_count) => snapshot_record_count, + Err(e) => { + self.consumer.error(true, e, None); + return; + } + }; // Start following the table if required by the configuration. if self.config.follow() { @@ -1074,8 +1196,6 @@ impl DeltaTableInputEndpointInner { // Note: If self.config.snapshot() && !snapshot_incomplete, we're resuming from a checkpoint // where the snapshot was already completed, so no special log needed - let mut retry_count = 0; - // If we haven't previously read a snapshot of the table, report initial frontier. // This makes sure that even if the current version of the table is the final version, // we will report the frontier. 
@@ -1083,7 +1203,7 @@ impl DeltaTableInputEndpointInner { self.queue.push_with_aux( (None, Vec::new()), Utc::now(), - Some(DeltaResumeInfo::follow_mode(version, false)), + QueueEntry::ResumeInfo(Some(DeltaResumeInfo::follow_mode(version, false))), ); } @@ -1091,36 +1211,59 @@ impl DeltaTableInputEndpointInner { wait_running(&mut receiver).await; let new_version = version + 1; - match table.log_store().read_commit_entry(new_version).await { - Ok(None) => sleep(POLL_INTERVAL).await, - Ok(Some(bytes)) + let table_for_retry = Arc::clone(&table); + let entry = match self + .retry( + &format!( + "error reading the next log entry (current table version: {version})" + ), + Some("delta-next-log"), + move || { + let table = Arc::clone(&table_for_retry); + async move { table.log_store().read_commit_entry(new_version).await } + }, + ) + .await + { + Ok(entry) => entry, + Err(_) => break, + }; + + match entry { + None => sleep(POLL_INTERVAL).await, + Some(bytes) if self.config.end_version.is_none() || self.config.end_version.unwrap() >= new_version => { - retry_count = 0; - let actions = match logstore::get_actions(new_version, &bytes) { Ok(actions) => actions, Err(e) => { self.consumer.error( true, - anyhow!("error parsing log entry for table version {new_version}: {e}"), - Some("delta-parse-log") + anyhow!( + "error parsing log entry for table version {new_version}: {e}" + ), + None, ); break; } }; version = new_version; - self.process_log_entry( - new_version, - &actions, - &table, - cdc_delete_filter.clone(), - input_stream.as_mut(), - &mut receiver, - ) - .await; + if let Err(e) = self + .process_log_entry( + new_version, + &actions, + &table, + cdc_delete_filter.clone(), + input_stream.as_mut(), + &mut receiver, + ) + .await + { + self.consumer.error(true, e, None); + break; + }; if let Some(end_version) = self.config.end_version && end_version <= new_version @@ -1136,7 +1279,7 @@ impl DeltaTableInputEndpointInner { break; } } - Ok(Some(_bytes)) => { + Some(_bytes) => { 
info!( "delta_table {}: reached table version {new_version}, which is greater than the 'end_version' {} specified in connector config: stopping the connector", &self.endpoint_name, @@ -1151,31 +1294,11 @@ impl DeltaTableInputEndpointInner { self.queue.push_with_aux( (None, Vec::new()), Utc::now(), - Some(DeltaResumeInfo::eoi()), + QueueEntry::ResumeInfo(Some(DeltaResumeInfo::eoi())), ); break; } - Err(e) => { - // Transient timeouts are common when reading the next log entry from S3. - retry_count += 1; - - if retry_count == 20 { - self.consumer.error( - true, - anyhow!("error reading the next log entry after {retry_count} attempts (current table version: {version}): {e}"), - Some("delta-next-log") - ); - break; - } else { - let backoff_delay = calculate_backoff_delay(retry_count - 1); - warn!( - "delta_table {}: error reading the next log entry after {retry_count} attempts (current table version: {version}): {e}; retrying in {:?}", - &self.endpoint_name, backoff_delay - ); - sleep(backoff_delay).await; - } - } } } } else { @@ -1209,6 +1332,7 @@ impl DeltaTableInputEndpointInner { let delta_table: DeltaTable = { let mut retry_count = 0; + // We don't use config.max_retries here. Do we want unlimited retries opening the table? const MAX_RETRIES: u32 = 10; // We've seen the table builder get stuck forever in S3 authentication for some configurations @@ -1338,6 +1462,8 @@ impl DeltaTableInputEndpointInner { if !self.config.snapshot() { *self.last_resume_status.lock().unwrap() = Some(DeltaResumeInfo::follow_mode(version, false)); + *self.last_checkpointable_status.lock().unwrap() = + DeltaResumeInfo::follow_mode(version, false); } // Register object store with datafusion, so it will recognize individual parquet @@ -1582,13 +1708,17 @@ impl DeltaTableInputEndpointInner { /// Execute a SQL query to load a complete or partial snapshot of the DeltaTable. /// Returns the total number of records processed. 
+ /// + /// Fails with an error if the function fails to read the snapshot after retrying + /// `num_retries` times. async fn execute_snapshot_query( &self, query: &str, descr: &str, input_stream: &mut dyn ArrowStream, receiver: &mut Receiver, - ) -> usize { + num_retries: u32, + ) -> Result { let descr = format!("{descr} query '{query}'"); debug!( "delta_table {}: retrieving data from the Delta table snapshot using {descr}", @@ -1602,9 +1732,7 @@ impl DeltaTableInputEndpointInner { let df = match self.datafusion.sql_with_options(query, options).await { Ok(df) => df, Err(e) => { - self.consumer - .error(true, anyhow!("error compiling query '{query}': {e}"), None); - return 0; + return Err(anyhow!("error compiling query '{query}': {e}")); } }; @@ -1616,6 +1744,8 @@ impl DeltaTableInputEndpointInner { input_stream, receiver, self.allocate_snapshot_transaction_label(), + num_retries, + None, ) .await } @@ -1633,7 +1763,19 @@ impl DeltaTableInputEndpointInner { /// /// * `transaction` - execute the dataframe as part of a transaction with the given label (is `Some`). /// + /// * `max_retries` - the maximum number of retries to attempt if the function fails to read the log entry. + /// /// Returns the total number of records processed. + /// + /// Returns an error if the function fails to read the log entry after performing the configured + /// number of retries. Note that errors parsing table records are not reported here; they are + /// reported by calling `consumer.error`. + /// + /// On error, the function commits the current transaction if any. It is possible that some of the + /// records have been processed and pushed to the circuit before the error. + /// + /// If `max_retries` is >0, the function can push duplicate inputs to the circuit as part of the + /// retry loop. 
#[allow(clippy::too_many_arguments)] async fn execute_df( &self, @@ -1644,21 +1786,89 @@ impl DeltaTableInputEndpointInner { input_stream: &mut dyn ArrowStream, receiver: &mut Receiver, transaction: Option>, - ) -> usize { + max_retries: u32, + current_table_version: Option, + ) -> Result { + let mut retry_count = 0; + loop { + match self + .execute_df_inner( + dataframe.clone(), + polarity, + cdc_delete_filter.clone(), + input_stream, + receiver, + &transaction, + ) + .await + { + Ok(total_records) => { + self.consumer + .update_connector_health(ConnectorHealth::healthy()); + return Ok(total_records); + } + Err(e) => { + retry_count += 1; + if retry_count - 1 == max_retries { + let message = format!( + "error retrieving {descr} after {retry_count} attempts{}: {e}", + if let Some(version) = current_table_version { + format!(" (current table version: {version})") + } else { + String::new() + } + ); + self.consumer + .update_connector_health(ConnectorHealth::unhealthy(&message)); + return Err(anyhow!(message)); + } + let backoff_delay = calculate_backoff_delay(retry_count - 1); + + let message = format!( + "error retrieving {descr} after {retry_count} attempts{}: {e}; retrying in {backoff_delay:?}", + if let Some(version) = current_table_version { + format!(" (current table version: {version})") + } else { + String::new() + } + ); + self.consumer + .update_connector_health(ConnectorHealth::unhealthy(&message)); + warn!("delta_table {}: {message}", &self.endpoint_name); + sleep(backoff_delay).await; + } + } + } + } + + // A single attempt of the `execute_df` retry loop. + #[allow(clippy::too_many_arguments)] + async fn execute_df_inner( + &self, + dataframe: DataFrame, + polarity: bool, + cdc_delete_filter: Option>, + input_stream: &mut dyn ArrowStream, + receiver: &mut Receiver, + transaction: &Option>, + ) -> Result { wait_running(receiver).await; + let transaction = transaction.clone(); // Limit the number of connectors simultaneously reading from Delta Lake. 
let _token = DELTA_READER_SEMAPHORE.acquire().await.unwrap(); let mut stream = match dataframe.execute_stream().await { Err(e) => { - self.consumer - .error(true, anyhow!("error retrieving {descr}: {e:?}"), None); - return 0; + return Err(format!("{e:?}")); } Ok(stream) => stream, }; + // We declare the connector healthy at this point. + self.consumer + .update_connector_health(ConnectorHealth::healthy()); + let mut num_batches = 0; let mut total_records = 0usize; @@ -1701,7 +1911,7 @@ impl DeltaTableInputEndpointInner { }, move |(buffer, errors, timestamp)| { queue.push_entry( - InputQueueEntry::new_with_aux(timestamp, None) + InputQueueEntry::new_with_aux(timestamp, QueueEntry::ResumeInfo(None)) .with_buffer(buffer) .with_start_transaction(transaction.clone()), errors, @@ -1717,13 +1927,18 @@ impl DeltaTableInputEndpointInner { let batch = match batch { Ok(batch) => batch, Err(e) => { - self.consumer.error( - false, - anyhow!("error retrieving batch {num_batches} of {descr}: {e:?}"), - Some("delta-batch"), + drop(job_queue); + // We don't have a way to rollback the transaction at this point. The best + // we can do is commit the transaction so it doesn't block the pipeline. + // This means that the connector will generate duplicate inputs on a retry. + self.queue.push_entry( + InputQueueEntry::new_with_aux(timestamp, QueueEntry::Rollback) + // If we started a transaction while processing the log entry, commit it now. + .with_commit_transaction(true), + Vec::new(), ); - continue; + return Err(format!("error retrieving batch {num_batches}: {e:?}")); } }; // info!("schema: {}", batch.schema()); @@ -1736,7 +1951,7 @@ impl DeltaTableInputEndpointInner { } job_queue.flush().await; - total_records + Ok(total_records) } async fn parse_record_batch( @@ -1786,6 +2001,16 @@ impl DeltaTableInputEndpointInner { /// Apply actions from a transaction log entry. /// /// Only `Add` and `Remove` actions are picked up. 
+ /// + /// Returns an error if the connector failed to read the log entry after performing the `self.config.max_retries` + /// number of retries. Note that errors parsing table records are not reported here; they are + /// reported in the `execute_df` method by calling `consumer.error`. + /// + /// On error, the function commits the current transaction if any. It is possible that some of the + /// records have been processed and pushed to the circuit before the error. + /// + /// If `self.config.max_retries` is >0, the function can push duplicate inputs to the circuit as part of the + /// retry loop. async fn process_log_entry( &self, new_version: i64, @@ -1794,7 +2019,7 @@ impl DeltaTableInputEndpointInner { cdc_delete_filter: Option>, input_stream: &mut dyn ArrowStream, receiver: &mut Receiver, - ) { + ) -> AnyResult<()> { if self.config.verbose > 0 { // Don't log actions we ignore to limit spurious logging. E.g., delta lake // optimization passes can generate thousand of noop actions. @@ -1825,7 +2050,7 @@ impl DeltaTableInputEndpointInner { if self.config.is_cdc() { self.process_cdc_transaction(actions, table, cdc_delete_filter, input_stream, receiver) - .await; + .await?; } else { let column_names = self.used_column_list(table); @@ -1849,7 +2074,7 @@ impl DeltaTableInputEndpointInner { receiver, start_transaction.clone(), ) - .await; + .await?; } } @@ -1863,7 +2088,7 @@ impl DeltaTableInputEndpointInner { receiver, start_transaction.clone(), ) - .await; + .await?; } } } @@ -1872,15 +2097,17 @@ impl DeltaTableInputEndpointInner { self.queue.push_entry( InputQueueEntry::new_with_aux( timestamp, - Some(DeltaResumeInfo::follow_mode( + QueueEntry::ResumeInfo(Some(DeltaResumeInfo::follow_mode( new_version, self.config.end_version == Some(new_version), - )), + ))), ) // If we started a transaction while processing the log entry, commit it now. 
.with_commit_transaction(true), Vec::new(), ); + + Ok(()) } /// Process a DeltaLake transaction in CDC mode: @@ -1897,7 +2124,7 @@ impl DeltaTableInputEndpointInner { cdc_delete_filter: Option>, input_stream: &mut dyn ArrowStream, receiver: &mut Receiver, - ) { + ) -> AnyResult<()> { let result = self .do_process_cdc_transaction(actions, table, cdc_delete_filter, input_stream, receiver) .await; @@ -1906,9 +2133,7 @@ impl DeltaTableInputEndpointInner { // If the table does not exist, there's no harm. let _ = self.datafusion.deregister_table("tmp_table"); - if let Err(e) = result { - self.consumer.error(false, e, Some("delta-cdc")); - } + result } async fn do_process_cdc_transaction( @@ -1939,12 +2164,12 @@ impl DeltaTableInputEndpointInner { ); // Create a datafusion table backed by these files. - let table = Arc::new( + let parquet_table = Arc::new( self.create_parquet_table(table, files, &description) .await?, ); - self.datafusion.register_table("tmp_table", table).map_err(|e| { + self.datafusion.register_table("tmp_table", parquet_table).map_err(|e| { anyhow!("internal error processing {description}; {REPORT_ERROR}; error registering Parquet table: {e}") })?; @@ -1973,8 +2198,10 @@ impl DeltaTableInputEndpointInner { input_stream, receiver, self.allocate_follow_transaction_label(), + self.config.max_retries(), + table.version(), ) - .await; + .await?; Ok(()) } @@ -2013,7 +2240,7 @@ impl DeltaTableInputEndpointInner { input_stream: &mut dyn ArrowStream, receiver: &mut Receiver, start_transaction: Option>, - ) { + ) -> AnyResult<()> { let result = match action { Action::Add(add) if add.data_change => { self.add_with_polarity( @@ -2041,16 +2268,14 @@ impl DeltaTableInputEndpointInner { ) .await } - _ => return, + _ => return Ok(()), }; // Deregister the table registered by `add_with_polarity`. // If the table does not exist, there's no harm. 
let _ = self.datafusion.deregister_table("tmp_table"); - if let Err(e) = result { - self.consumer.error(false, e, Some("delta-action")); - } + result } // TODO: here, as well as in `process_cdc_transaction`, we can get some potential speedup by only reading a subset @@ -2074,12 +2299,12 @@ impl DeltaTableInputEndpointInner { let full_path = format!("{}{}", table.log_store().object_store_url().as_str(), path); // Create a datafusion table backed by these files. - let table = Arc::new( + let parquet_table = Arc::new( self.create_parquet_table(table, vec![full_path.clone()], &description) .await?, ); - self.datafusion.register_table("tmp_table", table).map_err(|e| { + self.datafusion.register_table("tmp_table", parquet_table).map_err(|e| { anyhow!("internal error processing file {full_path}; {REPORT_ERROR}; error registering Parquet table: {e}") })?; @@ -2104,8 +2329,10 @@ impl DeltaTableInputEndpointInner { input_stream, receiver, start_transaction, + self.config.max_retries(), + table.version(), ) - .await; + .await?; Ok(()) } diff --git a/crates/adapters/src/integrated/delta_table/test.rs b/crates/adapters/src/integrated/delta_table/test.rs index b221b7def1a..ef23136429a 100644 --- a/crates/adapters/src/integrated/delta_table/test.rs +++ b/crates/adapters/src/integrated/delta_table/test.rs @@ -45,8 +45,8 @@ use std::fs::File; use std::io::Write; use std::mem::forget; use std::os::unix::ffi::OsStrExt; -use std::path::Path; -use std::sync::Arc; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; use tempfile::{NamedTempFile, TempDir}; use tokio::sync::mpsc; @@ -108,6 +108,7 @@ async fn wait_for_output_records( expected_output: &[T], datafusion: &SessionContext, timeout_ms: u64, + dedup: bool, ) where T: for<'a> DeserializeWithContext<'a, SqlSerdeConfig, Variant> + DBData, { @@ -144,8 +145,12 @@ async fn wait_for_output_records( result.len() ); + result.sort(); + if dedup { + result.dedup(); + } + if result.len() == 
expected_output.len() { - result.sort(); let mut expected_output = expected_output.to_vec(); expected_output.sort(); assert_eq!(result, expected_output); @@ -713,6 +718,8 @@ async fn test_follow( test_end_version: bool, buffer_size: u64, buffer_timeout_ms: u64, + inject_failure: Option>, + clear_failure: Option>, ) { async fn suspend_pipeline(pipeline: Controller) { println!("start suspend"); @@ -897,6 +904,7 @@ async fn test_follow( &expected_output, &datafusion, 20_000, + false, ) .await; @@ -948,6 +956,58 @@ async fn test_follow( .collect::>(); }; + // Run after the write so the test process can still update the table; the pipeline + // then fails to read the new snapshot until permissions are restored. + if let Some(inject_failure) = &inject_failure { + inject_failure(); + } + + if inject_failure.is_some() { + wait( + || { + pipeline + .input_endpoint_status("test_input1") + .ok() + .and_then(|s| s.health) + .is_some_and(|h| { + let unhealthy = matches!( + h.status, + feldera_types::adapter_stats::ConnectorHealthStatus::Unhealthy + ); + if unhealthy { + println!("unhealthy: {:?}", h); + } + unhealthy + }) + }, + 20_000, + ) + .expect("timeout waiting for input connector health to become unhealthy"); + } + + if let Some(clear_failure) = &clear_failure { + clear_failure(); + } + + if clear_failure.is_some() { + wait( + || { + pipeline + .input_endpoint_status("test_input1") + .ok() + .and_then(|s| s.health) + .is_some_and(|h| { + matches!( + h.status, + feldera_types::adapter_stats::ConnectorHealthStatus::Healthy + ) + }) + }, + 20_000, + ) + .expect("timeout waiting for input connector health to become healthy"); + } + if suspend { suspend_pipeline(pipeline).await; @@ -995,6 +1055,7 @@ async fn test_follow( &expected_output, &datafusion, if suspend { 200_000 } else { 0 }, + inject_failure.is_some(), ) .await; } @@ -1319,6 +1380,48 @@ fn delta_data(max_records: usize) -> impl Strategy> }) } +/// Remove owner read and execute on the delta table **root directory 
only**, and push that path +/// and its original mode onto `saved` for [`restore_delta_input_table_read_permission`]. +/// +/// Without `r` and `x` on the root, the process cannot traverse into `_delta_log` or data paths +/// even if inner files still have permissive modes. +#[cfg(unix)] +fn strip_delta_input_table_read_permission( + table_root: &Path, + saved: &mut Vec<(PathBuf, u32)>, +) -> std::io::Result<()> { + use std::fs; + use std::os::unix::fs::PermissionsExt; + + let meta = fs::metadata(table_root)?; + if !meta.is_dir() { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "delta input table path must be a directory", + )); + } + + let mode = meta.permissions().mode(); + saved.push((table_root.to_path_buf(), mode)); + + let new_mode = mode & !0o500; + let mut perms = meta.permissions(); + perms.set_mode(new_mode); + fs::set_permissions(table_root, perms)?; + Ok(()) +} + +#[cfg(unix)] +fn restore_delta_input_table_read_permission(saved: Vec<(PathBuf, u32)>) -> std::io::Result<()> { + use std::fs; + use std::os::unix::fs::PermissionsExt; + + for (path, mode) in saved.into_iter().rev() { + fs::set_permissions(&path, fs::Permissions::from_mode(mode))?; + } + Ok(()) +} + async fn delta_table_follow_file_test_common( snapshot: bool, transaction_mode: DeltaTableTransactionMode, @@ -1338,6 +1441,45 @@ async fn delta_table_follow_file_test_common( let output_table_dir: TempDir = TempDir::new().unwrap(); let output_table_uri = output_table_dir.path().display().to_string(); + // With `end_version`, the connector stops tailing the log before new versions appear, so + // stripping read permission would not drive the connector unhealthy (wait would time out). 
+ #[cfg(unix)] + let (inject_failure, clear_failure): (Option>, Option>) = + if end_version { + (None, None) + } else { + let saved_modes: Arc>> = Arc::new(Mutex::new(Vec::new())); + let input_root = input_table_dir.path().to_path_buf(); + + let inject_failure: Box = { + let saved_modes = Arc::clone(&saved_modes); + let input_root = input_root.clone(); + Box::new(move || { + let mut guard = saved_modes.lock().unwrap(); + guard.clear(); + strip_delta_input_table_read_permission(&input_root, &mut *guard) + .unwrap_or_else(|e| { + panic!("inject_failure (strip read permission on input table): {e}") + }); + }) + }; + + let clear_failure: Box = { + let saved_modes = Arc::clone(&saved_modes); + Box::new(move || { + let entries = std::mem::take(&mut *saved_modes.lock().unwrap()); + restore_delta_input_table_read_permission(entries).unwrap_or_else(|e| { + panic!("clear_failure (restore read permission on input table): {e}") + }); + }) + }; + + (Some(inject_failure), Some(clear_failure)) + }; + + #[cfg(not(unix))] + let (inject_failure, clear_failure) = (None, None); + test_follow( &relation_schema, &input_table_uri, @@ -1350,6 +1492,8 @@ async fn delta_table_follow_file_test_common( end_version, 1000, 100, + inject_failure, + clear_failure, ) .await; } @@ -1565,6 +1709,8 @@ async fn delta_table_follow_s3_test_common(snapshot: bool, suspend: bool) { false, 1000, 100, + None, + None, ) .await; } diff --git a/crates/adapters/src/test/mock_input_consumer.rs b/crates/adapters/src/test/mock_input_consumer.rs index 3bf228bcc20..ea0061314f9 100644 --- a/crates/adapters/src/test/mock_input_consumer.rs +++ b/crates/adapters/src/test/mock_input_consumer.rs @@ -6,6 +6,7 @@ use dbsp::operator::StagedBuffers; use feldera_adapterlib::ConnectorMetadata; use feldera_adapterlib::format::BufferSize; use feldera_adapterlib::transport::{Resume, Watermark}; +use feldera_types::adapter_stats::ConnectorHealth; use feldera_types::config::FtModel; use std::sync::{Arc, Mutex, MutexGuard}; @@ -129,6 
+130,8 @@ impl InputConsumer for MockInputConsumer { fn commit_transaction(&self) { self.state().transaction_in_progress = false; } + + fn update_connector_health(&self, _health: ConnectorHealth) {} } pub struct MockInputParserState { diff --git a/crates/adapters/src/transport/kafka/ft/test.rs b/crates/adapters/src/transport/kafka/ft/test.rs index 5ff6bf58e06..79ea0c38559 100644 --- a/crates/adapters/src/transport/kafka/ft/test.rs +++ b/crates/adapters/src/transport/kafka/ft/test.rs @@ -25,6 +25,7 @@ use feldera_adapterlib::format::{BufferSize, flatten_nested}; use feldera_adapterlib::transport::{Resume, Watermark}; use feldera_macros::IsNone; use feldera_sqllib::{ByteArray, SqlString, Variant}; +use feldera_types::adapter_stats::ConnectorHealth; use feldera_types::config::{ ConnectorConfig, FormatConfig, FtModel, InputEndpointConfig, OutputBufferConfig, TransportConfig, default_max_queued_records, @@ -763,6 +764,8 @@ impl InputConsumer for DummyInputConsumer { fn start_transaction(&self, _label: Option<&str>) {} fn commit_transaction(&self) {} + + fn update_connector_health(&self, _health: ConnectorHealth) {} } #[test] diff --git a/crates/feldera-types/src/transport/delta_table.rs b/crates/feldera-types/src/transport/delta_table.rs index 51abdc7aee7..d827e858c48 100644 --- a/crates/feldera-types/src/transport/delta_table.rs +++ b/crates/feldera-types/src/transport/delta_table.rs @@ -354,6 +354,20 @@ pub struct DeltaTableReaderConfig { #[serde(default)] pub verbose: u32, + /// Maximum number of retries for failed object store operations. + /// + /// Controls how many times the connector retries high-level storage operations, + /// such as reading a Delta log entry or a Parquet file. + /// + /// This is in addition to lower-level retries (e.g., individual S3 operation retries governed + /// by storage options like `retry_timeout`). 
If those retries are exhausted + /// or the failure is otherwise unrecoverable at the storage layer, the + /// connector retries the entire operation. + /// + /// Defaults to unlimited retries. Set to 0 to disable retries. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub max_retries: Option, + /// Storage options for configuring backend object store. /// /// For specific options available for different storage backends, see: @@ -364,6 +378,12 @@ pub struct DeltaTableReaderConfig { pub object_store_config: HashMap, } +impl DeltaTableReaderConfig { + pub fn max_retries(&self) -> u32 { + self.max_retries.unwrap_or(u32::MAX) + } +} + #[cfg(test)] #[test] fn test_delta_reader_config_serde() { diff --git a/docs.feldera.com/docs/changelog.md b/docs.feldera.com/docs/changelog.md index 047667c6266..3b034515cfe 100644 --- a/docs.feldera.com/docs/changelog.md +++ b/docs.feldera.com/docs/changelog.md @@ -12,7 +12,20 @@ import TabItem from '@theme/TabItem'; - ## Unreleased + ## v0.288.0 + + Delta Lake input connector error handling behavior change: + + In the past if the connector wasn't able to read a table version, it + signaled an error and moved to the next version. This could cause data loss. + With this change the connector will either retry forever or fail and stop + producing input after exhausting retry attempts. + + The second behavioral change is that the connector can now produce + duplicate inputs even without a pipeline restart as the connector retries + processing delta log entries. + + ## v0.281.0 Starting a pipeline while storage is still clearing (`storage_status=Clearing`) now returns `CannotStartWhileClearingStorage` instead of succeeding. 
Clearing storage while a start diff --git a/docs.feldera.com/docs/connectors/sources/delta.md b/docs.feldera.com/docs/connectors/sources/delta.md index 77f8dd5100b..31d179f2e46 100644 --- a/docs.feldera.com/docs/connectors/sources/delta.md +++ b/docs.feldera.com/docs/connectors/sources/delta.md @@ -36,6 +36,7 @@ exactly once fault tolerance. | `cdc_delete_filer` | string | |

A predicate that determines whether the record represents a deletion.

This setting is only valid in the `cdc` mode. It specifies a predicate applied to each row in the Delta table to determine whether the row represents a deletion event. Its value must be a valid Boolean SQL expression that can be used in a query of the form `SELECT * from WHERE `.

| | `cdc_order_by` | string | |

An expression that determines the ordering of updates in the Delta table.

This setting is only valid in the `cdc` mode. It specifies an expression applied to each row in the Delta table to determine the order in which updates in the table should be applied. Its value must be a valid SQL expression that can be used in a query of the form `SELECT * from <table> ORDER BY <expression>`.

| | `num_parsers` | string | | The number of parallel parsing tasks the connector uses to process data read from the table. Increasing this value can enhance performance by allowing more concurrent processing. Recommended range: 1–10. The default is 4.| +| `max_retries` | integer| unlimited retries|

Maximum number of retries for failed object store operations.

Controls how many times the connector retries high-level storage operations, such as reading a Delta log entry or a Parquet file.

This is in addition to lower-level retries (e.g., individual S3 operation retries governed by storage options like `retry_timeout`). If those retries are exhausted or the failure is otherwise unrecoverable at the storage layer, the connector retries the entire operation.

Defaults to unlimited retries. Set to 0 to disable retries.

See [retries and at-least-once delivery](#retries-and-at-least-once-delivery).

| | `skip_unused_columns` (DEPRECATED) | bool | false |

This property is deprecated. Use the [table-level `skip_unused_columns` property](/sql/grammar#ignoring-unused-columns) instead.

Don't read unused columns from the Delta table. When set to `true`, this option instructs the connector to avoid reading columns from the Delta table that are not used in any view definitions. To be skipped, the columns must be either nullable or have default values. This can improve ingestion performance, especially for wide tables.

Note: The simplest way to exclude unused columns is to omit them from the Feldera SQL table declaration. The connector never reads columns that aren't declared in the SQL schema. Additionally, the SQL compiler emits warnings for declared but unused columns—use these as a guide to optimize your schema.

| | `max_concurrent_readers` | integer| 6 |

Maximum number of concurrent object store reads performed by all Delta Lake connectors.

This setting is used to limit the number of concurrent reads of the object store in a pipeline with a large number of Delta Lake connectors. When multiple connectors are simultaneously reading from the object store, this can lead to transport timeouts.

When enabled, this setting limits the number of concurrent reads across all connectors. This is a global setting that affects all Delta Lake connectors, and not just the connector where it is specified. It should therefore be used at most once in a pipeline. If multiple connectors specify this setting, they must all use the same value.

The default value is 6.

| @@ -167,6 +168,8 @@ Additional configuration options to configure HTTP client for remote object stor | `proxy_excludes` | List of hosts that bypass proxy. | | `randomize_addresses` | Randomize order addresses that the DNS resolution yields. This will spread the connections across more servers. | | `timeout` | Request timeout. The timeout is applied from when the request starts connecting until the response body has finished. Format: ``, e.g., `30s`, `1.5m`.| +| `retry_timeout` | The maximum length of time from the initial request after which no further retries will be attempted. This not only bounds the length of time before a server error will be surfaced to the application, but also bounds the length of time a request’s credentials must remain valid. As requests are retried without renewing credentials or regenerating request payloads, this number should be kept below 5 minutes to avoid errors due to expired credentials and/or request payloads| +| `connect_timeout` | Set a timeout for only the connect phase of a client. This is the time allowed for the client to establish a connection and if the connection is not established within this time, the client returns a timeout error.| | `user_agent` | User-Agent header to be used by this client. | ## Data type mapping @@ -303,6 +306,39 @@ CREATE TABLE transaction( ]'); ``` +## Retries and at-least-once delivery + +When interacting with an object store such as Amazon S3, the Delta Lake connector must handle +transient failures, including timeouts and expired authentication tokens. + +These errors are first handled at the level of individual object store operations, which are +automatically retried when possible. This behavior is controlled by the +[HTTP client configuration](#http-client-configuration) settings: `connect_timeout`, +`timeout`, and `retry_timeout`. 
+ +If these lower-level retries are exhausted—or if the error cannot be recovered at the storage +layer—the connector retries the entire operation (for example, re-reading a Delta log entry). +This behavior is controlled by the `max_retries` setting: + +* By default, the connector performs unbounded retries. +* Set `max_retries = N` to limit the number of attempts. +* Set `max_retries = 0` to disable retries entirely. + +If the connector cannot recover after `N` attempts, it fails with a fatal error and stops +ingesting inputs. + +Because retries may occur after partial progress (e.g., after partially processing a Delta log entry), +the same data may be ingested more than once. This is consistent with the connector’s **at-least-once delivery** +guarantee. + +To ensure idempotent ingestion, we recommend defining [primary keys](/connectors/unique_keys). + +Retry activity is reflected in the connector’s [health status](https://docs.feldera.com/api/get-input-status/): +it is marked **UNHEALTHY** while retrying failed operations. + +If the pipeline is stopped and restarted during a retry, the connector resumes from the last successfully +ingested table version. This guarantees that no data loss occurs due to object store read errors. + ## Additional examples ### Example: Setting `timestamp_column` diff --git a/openapi.json b/openapi.json index daf2d49cb75..ab9ee540746 100644 --- a/openapi.json +++ b/openapi.json @@ -8070,6 +8070,13 @@ "nullable": true, "minimum": 0 }, + "max_retries": { + "type": "integer", + "format": "int32", + "description": "Maximum number of retries for failed object store operations.\n\nControls how many times the connector retries high-level storage operations,\nsuch as reading a Delta log entry or a Parquet file.\n\nThis is in addition to lower-level retries (e.g., individual S3 operation retries governed\nby storage options like `retry_timeout`). 
If those retries are exhausted\nor the failure is otherwise unrecoverable at the storage layer, the\nconnector retries the entire operation.\n\nDefaults to unlimited retries. Set to 0 to disable retries.", + "nullable": true, + "minimum": 0 + }, "mode": { "$ref": "#/components/schemas/DeltaTableIngestMode" },