feast/sdk/python/feast/driver_test_data.py at feast_usage · tsotnet/feast

198 lines (172 loc) · 8.83 KB
# This module generates dummy data to be used for tests and examples.
from enum import Enum
import numpy as np
import pandas as pd
from pytz import FixedOffset, timezone, utc
from feast.infra.provider import DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
class EventTimestampType(Enum):
    """Timezone treatments that ``_convert_event_timestamp`` can apply to a timestamp."""

    # The timestamp is handed back exactly as it was passed in.
    TZ_NAIVE = 0
    # The timestamp's tzinfo is replaced with UTC.
    TZ_AWARE_UTC = 1
    # tzinfo is replaced with UTC, then converted to pytz ``FixedOffset(60)``.
    TZ_AWARE_FIXED_OFFSET = 2
    # tzinfo is replaced with UTC, then converted to the "US/Pacific" zone.
    TZ_AWARE_US_PACIFIC = 3
def _convert_event_timestamp(event_timestamp: pd.Timestamp, t: EventTimestampType):
    """
    Apply the timezone treatment selected by ``t`` to ``event_timestamp``.

    Args:
        event_timestamp: The timestamp to convert.
        t: Which treatment to apply.

    Returns:
        * ``TZ_NAIVE``: ``event_timestamp`` itself, untouched.
        * ``TZ_AWARE_UTC``: the timestamp with its tzinfo replaced by UTC.
        * ``TZ_AWARE_FIXED_OFFSET`` / ``TZ_AWARE_US_PACIFIC``: the UTC-stamped
          timestamp converted to ``FixedOffset(60)`` / ``"US/Pacific"``.
        * anything else: ``None``.
    """
    if t == EventTimestampType.TZ_NAIVE:
        return event_timestamp
    if t == EventTimestampType.TZ_AWARE_UTC:
        return event_timestamp.replace(tzinfo=utc)

    # The remaining treatments differ only in the zone they convert to.
    if t == EventTimestampType.TZ_AWARE_FIXED_OFFSET:
        target_zone = FixedOffset(60)
    elif t == EventTimestampType.TZ_AWARE_US_PACIFIC:
        target_zone = timezone("US/Pacific")
    else:
        return None

    stamped_as_utc = event_timestamp.replace(tzinfo=utc)
    return stamped_as_utc.astimezone(target_zone)
def create_orders_df(
    customers,
    drivers,
    start_date,
    end_date,
    order_count,
    infer_event_timestamp_col=False,
) -> pd.DataFrame:
    """
    Build a dummy orders DataFrame for tests and examples.

    Example df generated by this function:

    | order_id | driver_id | customer_id | order_is_success |    event_timestamp  |
    +----------+-----------+-------------+------------------+---------------------+

    Args:
        customers: Values to draw ``customer_id`` from (random, with replacement).
        drivers: Values to draw ``driver_id`` from (random, with replacement).
        start_date: Timestamp of the first order.
        end_date: Timestamp of the last order.
        order_count: Number of rows to generate; timestamps are spread evenly
            between ``start_date`` and ``end_date``.
        infer_event_timestamp_col: When true, the timestamp column is named
            ``"e_ts"`` and every value is converted to US/Pacific. Otherwise the
            column is named ``DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL`` and rows
            cycle through all four ``EventTimestampType`` treatments.

    Returns:
        The orders DataFrame, sorted by timestamp, order id, driver id and
        customer id. Order ids start at 100.
    """
    df = pd.DataFrame()
    df["order_id"] = list(range(100, 100 + order_count))
    df["driver_id"] = np.random.choice(drivers, order_count)
    df["customer_id"] = np.random.choice(customers, order_count)
    df["order_is_success"] = np.random.randint(0, 2, size=order_count).astype(np.int32)

    order_times = pd.date_range(start=start_date, end=end_date, periods=order_count)

    if infer_event_timestamp_col:
        timestamp_col = "e_ts"
        df[timestamp_col] = [
            _convert_event_timestamp(
                pd.Timestamp(dt, unit="ms", tz="UTC").round("ms"),
                EventTimestampType.TZ_AWARE_US_PACIFIC,
            )
            for dt in order_times
        ]
    else:
        timestamp_col = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        df[timestamp_col] = [
            _convert_event_timestamp(
                pd.Timestamp(dt, unit="ms", tz="UTC").round("ms"),
                # Rotate through every timezone treatment, one per row.
                EventTimestampType(idx % 4),
            )
            for idx, dt in enumerate(order_times)
        ]

    df.sort_values(
        by=[timestamp_col, "order_id", "driver_id", "customer_id"], inplace=True,
    )
    return df
def create_driver_hourly_stats_df(drivers, start_date, end_date) -> pd.DataFrame:
    """
    Build a dummy hourly driver-statistics DataFrame for tests and examples.

    Example df generated by this function:

    | datetime         | driver_id | conv_rate | acc_rate | avg_daily_trips | created          |
    |------------------+-----------+-----------+----------+-----------------+------------------|

    Args:
        drivers: Non-empty collection of driver ids; every driver gets one row
            per timestamp.
        start_date: First hourly timestamp (inclusive).
        end_date: End of the hourly range (exclusive).

    Returns:
        A DataFrame with one row per (driver, hour) plus one fixed quickstart
        timestamp per driver, followed by two extra copies of a single existing
        row. ``conv_rate``/``acc_rate`` are random float32 values and
        ``avg_daily_trips`` is a random int32 in ``[0, 1000)``.

    Raises:
        ValueError: If ``drivers`` is empty.
    """
    one_hour = pd.Timedelta(hours=1)
    try:
        hours = pd.date_range(
            start=start_date, end=end_date, freq=one_hour, inclusive="left"
        )
    except TypeError:
        # pandas < 1.4 spells the same option ``closed="left"``.
        hours = pd.date_range(
            start=start_date, end=end_date, freq=one_hour, closed="left"
        )

    df_hourly = pd.DataFrame(
        {
            "datetime": [
                pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") for dt in hours
            ]
            # include a fixed timestamp for get_historical_features in the quickstart
            + [
                pd.Timestamp(
                    year=2021, month=4, day=12, hour=7, minute=0, second=0, tz="UTC"
                )
            ]
        }
    )

    # Last driver first: this keeps the row layout of prepending one driver at a time.
    per_driver = [
        df_hourly.assign(driver_id=driver) for driver in list(drivers)[::-1]
    ]
    if not per_driver:
        raise ValueError("drivers must contain at least one driver id")
    df_all_drivers = pd.concat(per_driver, ignore_index=True)

    rows = len(df_all_drivers)
    df_all_drivers["conv_rate"] = np.random.random(size=rows).astype(np.float32)
    df_all_drivers["acc_rate"] = np.random.random(size=rows).astype(np.float32)
    df_all_drivers["avg_daily_trips"] = np.random.randint(0, 1000, size=rows).astype(
        np.int32
    )
    df_all_drivers["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms"))

    # Create duplicate rows that should be filtered by created timestamp
    # TODO: These duplicate rows are indirectly being filtered out by the point in time join already. We need to
    #  inject a bad row at a timestamp where we know it will get joined to the entity dataframe, and then test that
    #  we are actually filtering it with the created timestamp
    # A one-row frame (not a Series) keeps the column dtypes intact when concatenated.
    late_row = df_all_drivers.iloc[[rows // 2]]
    return pd.concat([df_all_drivers, late_row, late_row])
def create_customer_daily_profile_df(customers, start_date, end_date) -> pd.DataFrame:
    """
    Build a dummy daily customer-profile DataFrame for tests and examples.

    Example df generated by this function:

    | datetime         | customer_id | current_balance | avg_passenger_count | lifetime_trip_count | created          |
    |------------------+-------------+-----------------+---------------------+---------------------+------------------|

    Args:
        customers: Non-empty collection of customer ids; every customer gets one
            row per day.
        start_date: First daily timestamp (inclusive).
        end_date: End of the daily range (exclusive).

    Returns:
        A DataFrame with one row per (customer, day). ``current_balance`` and
        ``avg_passenger_count`` are random float32 values and
        ``lifetime_trip_count`` is a random int32 in ``[0, 1000)``.

    Raises:
        ValueError: If ``customers`` is empty.
    """
    try:
        days = pd.date_range(
            start=start_date, end=end_date, freq="1D", inclusive="left"
        )
    except TypeError:
        # pandas < 1.4 spells the same option ``closed="left"``.
        days = pd.date_range(start=start_date, end=end_date, freq="1D", closed="left")

    df_daily = pd.DataFrame(
        {
            "datetime": [
                pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") for dt in days
            ]
        }
    )

    # Last customer first: this keeps the row layout of prepending one customer at a time.
    per_customer = [
        df_daily.assign(customer_id=customer) for customer in list(customers)[::-1]
    ]
    if not per_customer:
        raise ValueError("customers must contain at least one customer id")
    df_all_customers = pd.concat(per_customer, ignore_index=True)

    rows = len(df_all_customers)
    df_all_customers["current_balance"] = np.random.random(size=rows).astype(np.float32)
    df_all_customers["avg_passenger_count"] = np.random.random(size=rows).astype(
        np.float32
    )
    df_all_customers["lifetime_trip_count"] = np.random.randint(
        0, 1000, size=rows
    ).astype(np.int32)

    # TODO: Remove created timestamp in order to test whether its really optional
    df_all_customers["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms"))
    return df_all_customers
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

driver_test_data.py

Latest commit

History

driver_test_data.py

File metadata and controls