import unittest
from feldera import PipelineBuilder
from feldera.testutils import unique_pipeline_name
from tests import TEST_CLIENT
import time
import os
from confluent_kafka.admin import AdminClient, NewTopic
import requests
import re
import json
import socket
import uuid
_KAFKA_BOOTSTRAP = None
_SCHEMA_REGISTRY = None
_KAFKA_ADMIN = None
# Set these before running the test:
# Example (terminal/shell):
#   export KAFKA_BOOTSTRAP_SERVERS=localhost:9092
#   export SCHEMA_REGISTRY_URL=http://localhost:8081


def env(name: str, default: str, check_http: bool = False) -> str:
    """Get an environment variable used to configure the Kafka broker or Schema Registry endpoint.
    If the variable is not set, the default value is used; the defaults are intended for internal development only.
    External users are expected to configure these variables explicitly."""
    value = os.getenv(name, default)
    if value == default:
        try:
            if check_http:
                # Check that the Schema Registry is reachable
                requests.get(value, timeout=2).raise_for_status()
            else:
                # Check that the Kafka broker is reachable
                hostport = value
                if "://" in hostport:
                    # Strip the protocol prefix if present (e.g., "kafka://host:port")
                    hostport = hostport.split("://", 1)[1]
                host, port = hostport.split(":")
                with socket.create_connection((host, int(port)), timeout=2):
                    pass  # just testing connectivity
        except Exception as e:
            raise RuntimeError(
                f"{name} is set to default '{default}', but cannot connect to it! ({e})"
            )
    return value
"""Kafka bootstrap, schema registry, and Kafka admin client are lazy-initialized to avoid triggering live
HTTP/socket connections when importing this file. Connections are only created when the test calls the
respective getter functions at runtime."""


def get_kafka_bootstrap() -> str:
    global _KAFKA_BOOTSTRAP
    if _KAFKA_BOOTSTRAP is None:
        _KAFKA_BOOTSTRAP = env(
            "KAFKA_BOOTSTRAP_SERVERS", "ci-kafka-bootstrap.korat-vibes.ts.net:9094"
        )
    return _KAFKA_BOOTSTRAP


def get_schema_registry() -> str:
    global _SCHEMA_REGISTRY
    if _SCHEMA_REGISTRY is None:
        _SCHEMA_REGISTRY = env(
            "SCHEMA_REGISTRY_URL",
            "http://ci-schema-registry.korat-vibes.ts.net",
            check_http=True,
        )
    return _SCHEMA_REGISTRY


def get_kafka_admin() -> AdminClient:
    global _KAFKA_ADMIN
    if _KAFKA_ADMIN is None:
        _KAFKA_ADMIN = AdminClient({"bootstrap.servers": get_kafka_bootstrap()})
    return _KAFKA_ADMIN


def extract_kafka_schema_artifacts(sql: str) -> tuple[list[str], list[str]]:
    """Extract the Kafka topic names and schema subjects referenced by the generated SQL."""
    topics = re.findall(r'"topic"\s*:\s*"([^"]+)"', sql)
    subjects = re.findall(r"create view\s+(\w+)", sql, re.I) + re.findall(
        r"create index\s+(\w+)", sql, re.I
    )
    return list(set(topics)), list(set(subjects))
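

# Illustrative example (not executed): for SQL that contains '"topic": "my_topic_avro1a2b3c4d"',
# 'create view v_0 ...', and 'create index idx_0 ...', this returns
# (["my_topic_avro1a2b3c4d"], ["v_0", "idx_0"]); ordering within each list is not guaranteed
# because duplicates are removed via set().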


def delete_kafka_topics(admin: AdminClient, topics: list[str]):
    futures = admin.delete_topics(topics)
    for topic, future in futures.items():
        try:
            future.result()
            print(f"Deleted topic: {topic}")
        except Exception as e:
            print(f"Failed to delete {topic}: {e}")


def delete_schema_subjects(registry_url: str, subjects: list[str]):
    for subject in subjects:
        r = requests.delete(f"{registry_url}/subjects/{subject}")
        print(
            f"Deleted schema subject: {subject}"
            if r.status_code == 200
            else f"Failed to delete {subject}: {r.status_code} {r.text}"
        )
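

# Note: the DELETE above is the Schema Registry's soft delete; if a hard delete were ever needed,
# a follow-up request with `?permanent=true` would be required. A soft delete is assumed to be
# sufficient for cleaning up after these tests.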


def cleanup_kafka_schema_artifacts(sql: str, admin: AdminClient, registry_url: str):
    """Clean up Kafka topics and schema subjects after each test run.
    Each run produces new records, so rerunning without cleanup would keep appending data to the same topic(s)."""
    topics, subjects = extract_kafka_schema_artifacts(sql)
    delete_kafka_topics(admin, topics)
    delete_schema_subjects(registry_url, subjects)


def create_kafka_topic(
    topic_name: str, num_partitions: int = 1, replication_factor: int = 1
):
    """Pre-create a topic when multiple partitions are required, since the Kafka output connector does not
    support specifying the number of partitions during topic creation."""
    new_topic = NewTopic(
        topic_name, num_partitions=num_partitions, replication_factor=replication_factor
    )
    futures = get_kafka_admin().create_topics([new_topic])
    for t, f in futures.items():
        try:
            f.result()
            print(f"Topic {t} created with {num_partitions} partitions")
        except Exception as e:
            if "already exists" in str(e):
                print(f"Topic {t} already exists")
            else:
                raise
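

# Illustrative usage (hypothetical topic name):
#   create_kafka_topic("my_topic_avro2deadbeef", num_partitions=3)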


class Variant:
    """Represents a pipeline variant whose tables and views share the same SQL but differ in connector
    configuration. Each variant generates unique topic, table, and view names based on the provided configuration."""

    def __init__(self, cfg):
        self.id = cfg["id"]
        self.limit = cfg["limit"]
        self.partitions = cfg.get("partitions")
        self.sync = cfg.get("sync")
        self.start_from = cfg.get("start_from")
        self.create_topic = cfg.get("create_topic", False)
        self.num_partitions = cfg.get("num_partitions", 1)
        suffix = uuid.uuid4().hex[:8]
        self.topic1 = f"my_topic_avro{suffix}"
        self.topic2 = f"my_topic_avro2{suffix}"
        self.source = f"t_{self.id}"
        self.view = f"v_{self.id}"
        self.loopback = f"loopback_{self.id}"
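

# Illustrative example (mirrors TEST_CONFIGS below):
#   Variant({"id": 0, "limit": 10, "partitions": [0], "sync": False})
# yields topic names like "my_topic_avro<8-hex-suffix>" and "my_topic_avro2<8-hex-suffix>",
# plus SQL objects t_0, v_0, and loopback_0.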


def sql_source_table(v: Variant) -> str:
    return f"""
    create table {v.source} (
        id int,
        str varchar,
        dec decimal,
        reall real,
        dbl double,
        booll boolean,
        tmestmp timestamp,
        datee date,
        tme time
    ) with (
        'materialized' = 'true',
        'connectors' = '[{{
            "transport": {{
                "name": "datagen",
                "config": {{ "plan": [{{"limit": {v.limit}}}], "seed": 1 }}
            }}
        }}]'
    );
    """


def sql_view(v: Variant) -> str:
    return f"""
    create view {v.view}
    with (
        'connectors' = '[{{
            "transport": {{
                "name": "kafka_output",
                "config": {{
                    "bootstrap.servers": "{get_kafka_bootstrap()}",
                    "topic": "{v.topic1}"
                }}
            }},
            "format": {{
                "name": "avro",
                "config": {{
                    "update_format": "raw",
                    "registry_urls": ["{get_schema_registry()}"]
                }}
            }}
        }},
        {{
            "index": "idx_{v.id}",
            "transport": {{
                "name": "kafka_output",
                "config": {{
                    "bootstrap.servers": "{get_kafka_bootstrap()}",
                    "topic": "{v.topic2}"
                }}
            }},
            "format": {{
                "name": "avro",
                "config": {{
                    "update_format": "raw",
                    "registry_urls": ["{get_schema_registry()}"]
                }}
            }}
        }}]'
    )
    as select * from {v.source};
    create index idx_{v.id} on {v.view}(id);
    """


def sql_loopback_table(v: Variant) -> str:
    # Optional settings below fall back to the connector defaults when not specified
    config = {
        "bootstrap.servers": get_kafka_bootstrap(),
        "topic": v.topic2,
    }
    if v.start_from:
        config["start_from"] = v.start_from
    if v.partitions:
        config["partitions"] = v.partitions
    if v.sync:
        config["synchronize_partitions"] = v.sync
    # Serialize the connector config as JSON so it can be embedded in the SQL below
    config_json = json.dumps(config)
    return f"""
    create table {v.loopback} (
        id int,
        str varchar,
        dec decimal,
        reall real,
        dbl double,
        booll boolean,
        tmestmp timestamp,
        datee date,
        tme time
    ) with (
        'materialized' = 'true',
        'connectors' = '[{{
            "transport": {{
                "name": "kafka_input",
                "config": {config_json}
            }},
            "format": {{
                "name": "avro",
                "config": {{
                    "update_format": "raw",
                    "registry_urls": ["{get_schema_registry()}"]
                }}
            }}
        }}]'
    );
    """


def build_sql(v: Variant) -> str:
    """Generate the pipeline SQL for a variant by combining its source table, output view, and loopback table."""
    return "\n".join([sql_source_table(v), sql_view(v), sql_loopback_table(v)])


def wait_for_rows(pipeline, expected_rows, timeout_s=1800, poll_interval_s=5):
    """Records are not processed instantaneously, so wait until all rows are processed before validating.
    Polls `total_completed_records` every `poll_interval_s` seconds until it reaches `expected_rows`."""
    start = time.perf_counter()
    while True:
        stats = TEST_CLIENT.get_pipeline_stats(pipeline.name)
        completed = stats["global_metrics"]["total_completed_records"]
        print(f"Processed {completed}/{expected_rows} rows so far...")
        if completed >= expected_rows:
            return completed
        # Prevent infinite polling
        if time.perf_counter() - start > timeout_s:
            raise AssertionError(
                f"Timeout: only {completed}/{expected_rows} rows processed"
            )
        time.sleep(poll_interval_s)
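

# Illustrative shape of the stats payload read above (only the field used here is shown;
# the actual response contains additional metrics):
#   {"global_metrics": {"total_completed_records": 20, ...}, ...}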


def validate_loopback(pipeline, variant: Variant):
    """Validation: once processing has finished, the loopback table should contain all generated values.
    Validate by comparing the hash of the source table with the hash of the loopback table."""
    src_tbl_hash = pipeline.query_hash(
        f"SELECT * FROM {variant.source} ORDER BY id, str"
    )
    loopback_tbl_hash = pipeline.query_hash(
        f"SELECT * FROM {variant.loopback} ORDER BY id, str"
    )
    assert src_tbl_hash == loopback_tbl_hash, (
        f"Loopback table hash mismatch for variant {variant.id}!\n"
        f"Source table: {variant.source}\n"
        f"Loopback table: {variant.loopback}\n"
        f"Expected hash: {src_tbl_hash}\n"
        f"Got hash: {loopback_tbl_hash}"
    )
    print(f"Loopback table validated successfully for variant {variant.id}")


def create_and_run_pipeline_variant(cfg):
    """Create and run one pipeline for the given variant configuration."""
    v = Variant(cfg)
    # Pre-create topics if specified
    if v.create_topic:
        create_kafka_topic(v.topic1, v.num_partitions)
        create_kafka_topic(v.topic2, v.num_partitions)
    sql = build_sql(v)
    pipeline_name = unique_pipeline_name(f"test_kafka_avro_{v.id}")
    pipeline = PipelineBuilder(TEST_CLIENT, pipeline_name, sql).create_or_replace()
    try:
        pipeline.start()
        # NOTE: total_completed_records counts every row processed through each output, i.e.:
        # 1. rows written by the view <v> to Kafka, and
        # 2. rows ingested into the loopback table from Kafka.
        # Thus, expected_rows = generated_rows * number_of_outputs (2 in this case)
        expected_rows = v.limit * 2
        wait_for_rows(pipeline, expected_rows)
        validate_loopback(pipeline, v)
    finally:
        pipeline.stop(force=True)
        cleanup_kafka_schema_artifacts(sql, get_kafka_admin(), get_schema_registry())


class TestKafkaAvro(unittest.TestCase):
    """Each pipeline variant uses its own generated SQL snippet and processes only its own data."""

    TEST_CONFIGS = [
        {"id": 0, "limit": 10, "partitions": [0], "sync": False},
        {
            "id": 1,
            "limit": 1000000,
            "partitions": [0, 1, 2],
            "sync": False,
            "create_topic": True,
            "num_partitions": 3,  # pre-create topic with 3 partitions
        },
    ]

    def test_kafka_avro_variants(self):
        # If a run ID is specified, only the variant with that ID is run
        run_id = os.getenv("RUN_ID")
        configs_to_run = (
            [cfg for cfg in self.TEST_CONFIGS if cfg["id"] == int(run_id)]
            if run_id is not None
            else self.TEST_CONFIGS
        )
        for cfg in configs_to_run:
            print(f"\n Running pipeline variant id = {cfg['id']}")
            create_and_run_pipeline_variant(cfg)
# To run all pipelines in this test file:
# python -m pytest ./tests/workloads/test_kafka_avro.py
# To run a specific pipeline variant by its ID:
# RUN_ID=0 python -m pytest ./tests/workloads/test_kafka_avro.py