Skip to content

Commit 8f49bd5

Browse files
authored
Add new multi-level aggregation framework and bucketed count distinct aggregation. (#1159)
The bucketed aggregation works by aggregating data at a lower-level timestamp bucket, e.g. a 5-minute bucket, then leveraging the lower-level bucket's aggregated result to produce higher-level aggregation results such as 1 hour, 1 day, etc. The supported levels are 5 minutes, 1 hour, 1 week, 1 month, and 1 year.
1 parent c239d22 commit 8f49bd5

14 files changed

Lines changed: 896 additions & 20 deletions

feathr-impl/src/main/scala/com/linkedin/feathr/offline/anchored/WindowTimeUnit.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import java.time.Duration
1212
*/
1313
private[offline] object WindowTimeUnit extends Enumeration {
1414
type WindowTimeUnit = Value
15-
val D, H, M, S = Value
15+
val D, H, M, S, W, Y = Value
1616

1717
def parseWindowTime(timeWindowStr: String): Duration = {
1818
try {
@@ -22,6 +22,7 @@ private[offline] object WindowTimeUnit extends Enumeration {
2222
case H => Duration.ofHours(timeWindowStr.dropRight(1).trim.toLong)
2323
case M => Duration.ofMinutes(timeWindowStr.dropRight(1).trim.toLong)
2424
case S => Duration.ofSeconds(timeWindowStr.dropRight(1).trim.toLong)
25+
case _ => Duration.ofSeconds(0)
2526
}
2627
} catch {
2728
case ex: Exception =>

feathr-impl/src/main/scala/com/linkedin/feathr/offline/anchored/anchorExtractor/TimeWindowConfigurableAnchorExtractor.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ private[offline] class TimeWindowConfigurableAnchorExtractor(@JsonProperty("feat
6161
*/
6262
override def aggregateAsColumns(groupedDataFrame: DataFrame): Seq[(String, Column)] = {
6363
val columnPairs = aggFeatures.collect {
64-
case (featureName, featureDef) =>
64+
case (featureName, featureDef) if !featureDef.timeWindowFeatureDefinition.aggregationType.toString.startsWith("BUCKETED_") =>
6565
// for basic sliding window aggregation
6666
// no complex aggregation will be defined
6767
if (featureDef.swaFeature.lateralView.isDefined) {

feathr-impl/src/main/scala/com/linkedin/feathr/offline/config/TimeWindowFeatureDefinition.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ case class TimeWindowFeatureDefinition(
3535
`def`: String,
3636
aggregationType: AggregationType.Value,
3737
window: Duration,
38+
window_str: String,
3839
groupBy: Option[String],
3940
limit: Option[Int],
4041
filter: Option[String],
@@ -83,6 +84,11 @@ class TimeWindowFeatureDefinitionDeserializer extends JsonDeserializer[TimeWindo
8384
case _ =>
8485
throw new FeathrConfigException(ErrorLabel.FEATHR_USER_ERROR, s"'window' field is required in aggregation feature but is not provided $node.")
8586
},
87+
node.get("window") match {
88+
case field: TextNode => field.textValue()
89+
case _ =>
90+
throw new FeathrConfigException(ErrorLabel.FEATHR_USER_ERROR, s"'window' field is required in aggregation feature but is not provided $node.")
91+
},
8692
node.get("groupBy") match {
8793
case field: TextNode => Option(field.textValue())
8894
case _ => None

feathr-impl/src/main/scala/com/linkedin/feathr/offline/job/FeatureTransformation.scala

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -198,15 +198,26 @@ private[offline] object FeatureTransformation {
198198
df: DataFrame,
199199
requestedFeatureRefString: Seq[String],
200200
inputDateInterval: Option[DateTimeInterval],
201-
mvelContext: Option[FeathrExpressionExecutionContext]): TransformedResult = {
201+
mvelContext: Option[FeathrExpressionExecutionContext],
202+
keyColumnExprAndAlias: Seq[(String, String)] = Seq.empty[(String, String)]): TransformedResult = {
202203
val featureNamePrefix = getFeatureNamePrefix(featureAnchorWithSource.featureAnchor.extractor)
203204
val featureNamePrefixPairs = requestedFeatureRefString.map((_, featureNamePrefix))
204205

205206
// return the feature dataframe, the feature column format and the actual(inferred or user provided) feature types
206207
val featureTypeConfigs = featureAnchorWithSource.featureAnchor.featureTypeConfigs
207208
val transformedFeatureData: TransformedResult = featureAnchorWithSource.featureAnchor.extractor match {
208209
case transformer: TimeWindowConfigurableAnchorExtractor =>
209-
WindowAggregationEvaluator.transform(transformer, df, featureNamePrefixPairs, featureAnchorWithSource, inputDateInterval)
210+
val nonBucketedFeatures = transformer.features.map(_._2.aggregationType).filter(agg => agg == AggregationType.BUCKETED_COUNT_DISTINCT)
211+
if (!(nonBucketedFeatures.size != transformer.features || transformer.features.isEmpty)) {
212+
throw new FeathrFeatureTransformationException(
213+
ErrorLabel.FEATHR_USER_ERROR,
214+
s"All features ${transformer.features.keys.mkString(",")} should be either be all bucket or non-bucketed aggregation functions.")
215+
}
216+
if (nonBucketedFeatures.isEmpty) {
217+
WindowAggregationEvaluator.transform(transformer, df, featureNamePrefixPairs, featureAnchorWithSource, inputDateInterval)
218+
} else {
219+
BucketedWindowAggregationEvaluator.transform(transformer, df, featureNamePrefixPairs, featureAnchorWithSource, keyColumnExprAndAlias)
220+
}
210221
case transformer: SimpleAnchorExtractorSpark =>
211222
// transform from avro tensor to FDS format, avro tensor can be shared by online/offline
212223
// so that transformation logic can be written only once
@@ -350,7 +361,7 @@ private[offline] object FeatureTransformation {
350361
(prevTransformedResult, featureAnchorWithSource) => {
351362
val requestedFeatures = featureAnchorWithSource.selectedFeatures
352363
val transformedResultWithoutKey =
353-
transformSingleAnchorDF(featureAnchorWithSource, prevTransformedResult.df, requestedFeatures, inputDateInterval, mvelContext)
364+
transformSingleAnchorDF(featureAnchorWithSource, prevTransformedResult.df, requestedFeatures, inputDateInterval, mvelContext, outputJoinKeyColumnNames.zip(outputJoinKeyColumnNames))
354365
val namePrefixPairs = prevTransformedResult.featureNameAndPrefixPairs ++ transformedResultWithoutKey.featureNameAndPrefixPairs
355366
val columnNameToFeatureNameAndType = prevTransformedResult.inferredFeatureTypes ++ transformedResultWithoutKey.inferredFeatureTypes
356367
val featureColumnFormats = prevTransformedResult.featureColumnFormats ++ transformedResultWithoutKey.featureColumnFormats

feathr-impl/src/main/scala/com/linkedin/feathr/offline/join/DataFrameFeatureJoiner.scala

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package com.linkedin.feathr.offline.join
22

33
import com.linkedin.feathr.common._
44
import com.linkedin.feathr.offline
5+
import com.linkedin.feathr.offline.anchored.keyExtractor.SQLSourceKeyExtractor
56
import com.linkedin.feathr.offline.client.DataFrameColName
67
import com.linkedin.feathr.offline.client.DataFrameColName.getFeatureAlias
78
import com.linkedin.feathr.offline.config.FeatureJoinConfig
@@ -72,7 +73,14 @@ private[offline] class DataFrameFeatureJoiner(logicalPlan: MultiStageJoinPlan, d
7273
(dfWithFeatureNames, featureAnchorWithSourcePair) => {
7374
val featureAnchorWithSource = featureAnchorWithSourcePair._1
7475
val requestedFeatures = featureAnchorWithSourcePair._2.toSeq
75-
val resultWithoutKey = transformSingleAnchorDF(featureAnchorWithSource, dfWithFeatureNames.df, requestedFeatures, None, mvelContext)
76+
val keyColumnNames = featureAnchorWithSourcePair._1.featureAnchor.sourceKeyExtractor.getKeyColumnNames()
77+
val keyColumnExprAndAlias = if (featureAnchorWithSourcePair._1.featureAnchor.sourceKeyExtractor.isInstanceOf[SQLSourceKeyExtractor]) {
78+
val keyExprs = featureAnchorWithSourcePair._1.featureAnchor.sourceKeyExtractor.asInstanceOf[SQLSourceKeyExtractor].keyExprs
79+
keyExprs.zip(keyColumnNames)
80+
} else {
81+
keyColumnNames.zip(keyColumnNames)
82+
}
83+
val resultWithoutKey = transformSingleAnchorDF(featureAnchorWithSource, dfWithFeatureNames.df, requestedFeatures, None, mvelContext, keyColumnExprAndAlias)
7684
val namePrefixPairs = dfWithFeatureNames.featureNameAndPrefixPairs ++ resultWithoutKey.featureNameAndPrefixPairs
7785
val inferredFeatureTypeConfigs = dfWithFeatureNames.inferredFeatureTypes ++ resultWithoutKey.inferredFeatureTypes
7886
val featureColumnFormats = resultWithoutKey.featureColumnFormats ++ dfWithFeatureNames.featureColumnFormats

feathr-impl/src/main/scala/com/linkedin/feathr/offline/logical/MultiStageJoinPlanner.scala

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@ package com.linkedin.feathr.offline.logical
33
import com.linkedin.feathr.common
44
import com.linkedin.feathr.common.exception.{ErrorLabel, FeathrConfigException, FeathrException}
55
import com.linkedin.feathr.common.{FeatureDependencyGraph, JoiningFeatureParams}
6-
import com.linkedin.feathr.offline.{ErasedEntityTaggedFeature, FeatureName, JoinStage, KeyTagIdTuple}
76
import com.linkedin.feathr.offline.anchored.feature.FeatureAnchorWithSource
87
import com.linkedin.feathr.offline.derived.DerivedFeature
8+
import com.linkedin.feathr.offline.{ErasedEntityTaggedFeature, FeatureName, JoinStage, KeyTagIdTuple}
99
import org.apache.logging.log4j.LogManager
1010

11-
import scala.collection.mutable
1211
import scala.collection.JavaConverters._
1312
import scala.collection.convert.wrapAll._
13+
import scala.collection.mutable
1414

1515
/**
1616
* Multi-stage join planner is an implementation of Logical Planner in Feathr which analyzes the requested features,
@@ -84,7 +84,10 @@ private[offline] class MultiStageJoinPlanner extends LogicalPlanner[MultiStageJo
8484
val allPassthroughFeatures = featureGroups.allPassthroughFeatures
8585
val allDerivedFeatures = featureGroups.allDerivedFeatures
8686

87-
val windowAggFeaturesOrdered = requiredFeatures.filter(taggedFeature => allWindowAggFeatures.contains(taggedFeature.getFeatureName))
87+
val windowAggFeaturesOrdered = requiredFeatures.filter(taggedFeature =>
88+
allWindowAggFeatures.contains(taggedFeature.getFeatureName) &&
89+
!("PASSTHROUGH".equals(allWindowAggFeatures(taggedFeature.getFeatureName).source.path))
90+
)
8891

8992
// All required basic anchored features, basic anchored features are non-SWA features and non-passthrough features
9093
val requiredBasicAnchoredFeatures = requiredFeatures

feathr-impl/src/main/scala/com/linkedin/feathr/offline/swa/SlidingWindowFeatureUtils.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ import com.linkedin.feathr.offline.transformation.FeatureColumnFormat.FeatureCol
1414
import com.linkedin.feathr.offline.util.FeaturizedDatasetUtils
1515
import com.linkedin.feathr.offline.util.datetime.{DateTimeInterval, OfflineDateTimeUtils}
1616
import com.linkedin.feathr.swj.{FactData, GroupBySpec, LateralViewParams, SlidingWindowFeature, WindowSpec}
17-
import com.linkedin.feathr.swj.aggregate.{AggregationType, AvgAggregate, AvgPoolingAggregate, CountAggregate, CountDistinctAggregate, LatestAggregate, MaxAggregate, MaxPoolingAggregate, MinAggregate, MinPoolingAggregate, SumAggregate}
17+
import com.linkedin.feathr.swj.aggregate.{AggregationType, AvgAggregate, AvgPoolingAggregate, CountAggregate, CountDistinctAggregate, DummyAggregate, LatestAggregate, MaxAggregate, MaxPoolingAggregate, MinAggregate, MinPoolingAggregate, SumAggregate}
1818
import org.apache.logging.log4j.LogManager
1919
import org.apache.spark.sql.DataFrame
2020
import org.apache.spark.sql.expressions.UserDefinedFunction
@@ -186,6 +186,7 @@ private[offline] object SlidingWindowFeatureUtils {
186186
case AggregationType.MAX_POOLING => new MaxPoolingAggregate(featureDef)
187187
case AggregationType.MIN_POOLING => new MinPoolingAggregate(featureDef)
188188
case AggregationType.AVG_POOLING => new AvgPoolingAggregate(featureDef)
189+
case AggregationType.BUCKETED_COUNT_DISTINCT => new DummyAggregate(featureDef)
189190
}
190191
swj.SlidingWindowFeature(featureName, aggregationSpec, windowSpec, filter, groupBySpec, lateralViewParams)
191192
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
package com.linkedin.feathr.offline.transformation

import com.linkedin.feathr.common.FeatureTypeConfig
import com.linkedin.feathr.offline.anchored.anchorExtractor.TimeWindowConfigurableAnchorExtractor
import com.linkedin.feathr.offline.anchored.feature.FeatureAnchorWithSource
import com.linkedin.feathr.offline.job.TransformedResult
import com.linkedin.feathr.offline.swa.SlidingWindowFeatureUtils
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Evaluator that transforms bucketed window-aggregation features using
 * [[MultiLevelAggregationTransform]].
 */
private[offline] object BucketedWindowAggregationEvaluator {

  /**
   * Transform the input dataframe, appending one feature column per feature defined on
   * the extractor, using [[MultiLevelAggregationTransform]].
   *
   * @param transformer TimeWindowConfigurableAnchorExtractor holding the (featureName -> featureDef) map
   * @param df input dataframe
   * @param requestedFeatureNameAndPrefix feature name and prefix pairs
   * @param featureAnchorWithSource feature anchor with source that has the transformer;
   *                                supplies the timestamp column and its format
   * @param keyColumnExprAndAlias key column expressions paired with their output column aliases
   * @return transformed result (dataframe with feature columns, feature column formats);
   *         the feature column format can only be FeatureColumnFormat.RAW for now
   */
  def transform(
      transformer: TimeWindowConfigurableAnchorExtractor,
      df: DataFrame,
      requestedFeatureNameAndPrefix: Seq[(String, String)],
      featureAnchorWithSource: FeatureAnchorWithSource,
      keyColumnExprAndAlias: Seq[(String, String)]): TransformedResult = {
    val ss = SparkSession.builder().getOrCreate()
    val evaluator = new MultiLevelAggregationTransform(ss)
    // The timestamp column settings depend only on the source, not on the individual
    // feature, so resolve them once instead of once per feature inside the fold.
    val timeWindowParams = SlidingWindowFeatureUtils.getTimeWindowParam(featureAnchorWithSource.source)
    // Apply each feature's aggregation in turn, threading the dataframe through the fold.
    val resultDf = transformer.features.foldLeft(df) {
      case (inputDf, (featureName, featureDef)) =>
        evaluator.applyAggregate(
          inputDf,
          featureDef.`def`,
          featureName,
          featureDef.window_str,
          keyColumnExprAndAlias,
          timeWindowParams.timestampColumn,
          timeWindowParams.timestampColumnFormat,
          featureDef.aggregationType.toString)
    }
    TransformedResult(
      requestedFeatureNameAndPrefix,
      resultDf,
      requestedFeatureNameAndPrefix.map(c => (c._1, FeatureColumnFormat.RAW)).toMap,
      // Feature types are not inferred here; leave them empty for the caller to resolve.
      Map.empty[String, FeatureTypeConfig])
  }
}

0 commit comments

Comments
 (0)