Skip to content

Commit 1ff2660

Browse files
rakeshkashyap123 and Rakesh Kashyap Hanasoge Padmanabha authored
Fix bug when skipping anchored features with missing data (#1164)
Co-authored-by: Rakesh Kashyap Hanasoge Padmanabha <rkashyap@rkashyap-mn3.linkedin.biz>
1 parent 7d91781 commit 1ff2660

2 files changed

Lines changed: 244 additions & 3 deletions

File tree

feathr-impl/src/main/scala/com/linkedin/feathr/offline/join/workflow/AnchoredFeatureJoinStep.scala

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -111,7 +111,7 @@ private[offline] class AnchoredFeatureJoinStep(
111111
val withMissingFeaturesSubstituted = if (shouldAddDefault) {
112112
val missingFeatures = features.map(x => x.getFeatureName).filter(x => {
113113
val containsFeature: Seq[Boolean] = anchorDFMap.map(y => y._1.selectedFeatures.contains(x)).toSeq
114-
containsFeature.contains(false)
114+
!containsFeature.contains(true)
115115
})
116116
val missingAnchoredFeatures = ctx.featureGroups.allAnchoredFeatures.filter(featureName => missingFeatures.contains(featureName._1))
117117
substituteDefaultsForDataMissingFeatures(ctx.sparkSession, observationDF, ctx.logicalPlan,

feathr-impl/src/test/scala/com/linkedin/feathr/offline/AnchoredFeaturesIntegTest.scala

Lines changed: 243 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -409,7 +409,7 @@ class AnchoredFeaturesIntegTest extends FeathrIntegTest {
409409
| key: a_id
410410
| featureList: ["featureWithNull", "derived_featureWithNull", "featureWithNull2", "featureWithNull3", "featureWithNull4",
411411
| "featureWithNull5", "derived_featureWithNull2", "featureWithNull6", "featureWithNull7", "derived_featureWithNull7"
412-
| "aEmbedding", "memberEmbeddingAutoTZ"]
412+
| "aEmbedding", "memberEmbeddingAutoTZ", "aEmbedding", "featureWithNullSql"]
413413
| }
414414
""".stripMargin,
415415
featureDefAsString =
@@ -434,6 +434,17 @@ class AnchoredFeaturesIntegTest extends FeathrIntegTest {
434434
|}
435435
|
436436
| anchors: {
437+
| swaAnchor: {
438+
| source: "geneation/daily"
439+
| key: "x"
440+
| features: {
441+
| aEmbedding: {
442+
| def: "embedding"
443+
| aggregation: LATEST
444+
| window: 3d
445+
| }
446+
| }
447+
| }
437448
| anchor1: {
438449
| source: "anchorAndDerivations/nullVaueSource.avro.json"
439450
| key: "toUpperCaseExt(mId)"
@@ -475,6 +486,14 @@ class AnchoredFeaturesIntegTest extends FeathrIntegTest {
475486
| featureWithNull2: "isPresent(value) ? toNumeric(value) : 0"
476487
| }
477488
| }
489+
|
490+
| anchor3: {
491+
| source: "anchorAndDerivations/nullValueSource.avro.json"
492+
| key.sqlExpr: mId
493+
| features: {
494+
| featureWithNullSql.def.sqlExpr: value
495+
| }
496+
| }
478497
| swaAnchor: {
479498
| source: "swaSource"
480499
| key: "x"
@@ -530,6 +549,7 @@ class AnchoredFeaturesIntegTest extends FeathrIntegTest {
530549
Row(mutable.WrappedArray.make(Array()), mutable.WrappedArray.empty))
531550
assertEquals(featureList(0).getAs[Row]("derived_featureWithNull2"),
532551
Row(mutable.WrappedArray.make(Array("")), mutable.WrappedArray.make(Array(2.0f))))
552+
assertEquals(featureList(0).getAs[Row]("featureWithNullSql"), 1.0f)
533553
setFeathrJobParam(ADD_DEFAULT_COL_FOR_MISSING_DATA, "false")
534554
}
535555

@@ -543,7 +563,7 @@ class AnchoredFeaturesIntegTest extends FeathrIntegTest {
543563
"""
544564
| features: {
545565
| key: a_id
546-
| featureList: ["featureWithNull"]
566+
| featureList: ["platoScore", "maxLoginScore". "profileHasPicture", "ipDataCountryCode"]
547567
| }
548568
""".stripMargin,
549569
featureDefAsString =
@@ -565,6 +585,227 @@ class AnchoredFeaturesIntegTest extends FeathrIntegTest {
565585
| }
566586
| }
567587
| }
588+
|
589+
| platoFlatFeatureVector: {
590+
| source:"/tmp/fraud/jobs-fraud-model/atoscoresandfeaturesAvroPCV2"
591+
| key.sqlExpr: ["memberId", "substring(date, 0, 10)"]
592+
| features: {
593+
| platoScore: {
594+
| def.sqlExpr: "score",
595+
| default: 0.0,
596+
| type: "NUMERIC"
597+
| }
598+
| }
599+
| }
600+
| fraudJobsRatioPerIPFeatures: {
601+
| source: "/jobs/fraud/zirannia/tf_home_dir/jobs-fraud-model/fraudJobsRatioPerIPFeature"
602+
| key.sqlExpr: ["jobPostingIP", "date"]
603+
| features: {
604+
| fraudJobsIPFeaturesFedexNumJobs: {
605+
| def.sqlExpr: "numJobs",
606+
| default: -1.0,
607+
| type: "NUMERIC"
608+
| },
609+
| fraudJobsIPFeaturesFedexNumFraudJobs: {
610+
| def.sqlExpr: "numFraudJobs",
611+
| default: -1.0,
612+
| type: "NUMERIC"
613+
| },
614+
| fraudJobsIPFeaturesFedexFraudJobsRatio: {
615+
| def.sqlExpr: "fraudJobsRatio"
616+
| default: -1.0,
617+
| type: "NUMERIC"
618+
| }
619+
| }
620+
| }
621+
|
622+
| paramsFeatures: {
623+
| source: "/jobs/fraud/zirannia/tf_home_dir/jobs-fraud-model/paramsFeatures"
624+
| key.sqlExpr: ["jobId", "memberId", "substring(date, 0, 10)"]
625+
| features: {
626+
| maxLoginScore: {
627+
| def.sqlExpr: "maxLoginScore",
628+
| default: -1.0,
629+
| type: "NUMERIC"
630+
| },
631+
| registrationScore: {
632+
| def.sqlExpr: "registrationScore",
633+
| default: -1.0,
634+
| type: "NUMERIC"
635+
| },
636+
| profileHasPicture: {
637+
| def.sqlExpr: "profileHasPicture"
638+
| default: true,
639+
| type: "BOOLEAN"
640+
| },
641+
| accountageindays: {
642+
| def.sqlExpr: "accountAgeInDays",
643+
| default: 0.0,
644+
| type: "NUMERIC"
645+
| },
646+
| ipabusescore: {
647+
| def.sqlExpr: "ipAbuseScore",
648+
| default: 0.0,
649+
| type: "NUMERIC"
650+
| },
651+
| countryabusescore: {
652+
| def.sqlExpr: "countryAbuseScore",
653+
| default: 0.0,
654+
| type: "NUMERIC"
655+
| },
656+
| passwordResetFailureCountByIP: {
657+
| def.sqlExpr: "passwordResetFailureCountByIP",
658+
| default: 0.0,
659+
| type: "NUMERIC"
660+
| },
661+
| passwordResetSuccessCountByIP: {
662+
| def.sqlExpr: "passwordResetSuccessCountByIP",
663+
| default: 0.0,
664+
| type: "NUMERIC"
665+
| },
666+
| isEmailDomainReputationAbusive: {
667+
| def.sqlExpr: "isEmailDomainReputationAbusive",
668+
| default: false,
669+
| type: "BOOLEAN"
670+
| }
671+
| }
672+
| }
673+
|
674+
| restOfParamsFeatures: {
675+
| source: "/jobs/fraud/zirannia/tf_home_dir/jobs-fraud-model/restOfParamFeatures"
676+
| key.sqlExpr: ["job_id", "memberId", "substring(date, 0, 10)"]
677+
| features: {
678+
| asnIsAbusive: {
679+
| def.sqlExpr: "asnIsAbusive",
680+
| default: false,
681+
| type: "BOOLEAN"
682+
| },
683+
| jobAndMemberCountryMatch: {
684+
| def.sqlExpr: "jobAndMemberCountryMatch",
685+
| default: true,
686+
| type: "BOOLEAN"
687+
| },
688+
| companyCreationTime: {
689+
| def.sqlExpr: "companyCreationTime",
690+
| default: -1.0,
691+
| type: "NUMERIC"
692+
| },
693+
| asnIsOwnedByHostingService: {
694+
| def.sqlExpr: "asnIsOwnedByHostingService",
695+
| default: false,
696+
| type: "BOOLEAN"
697+
| },
698+
| companyFollowerCount: {
699+
| def.sqlExpr: "companyFollowerCount"
700+
| default: 0.0,
701+
| type: "NUMERIC"
702+
| },
703+
| countryMismatchGoodMemberCountWithSpecifiedAge: {
704+
| def.sqlExpr: "countryMismatchGoodMemberCountWithSpecifiedAge",
705+
| default: 0.0,
706+
| type: "NUMERIC"
707+
| },
708+
| countryMismatchRestrictedMemberCountWithSpecifiedAge: {
709+
| def.sqlExpr: "countryMismatchRestrictedMemberCountWithSpecifiedAge",
710+
| default: 0.0,
711+
| type: "NUMERIC"
712+
| },
713+
| dfpScore: {
714+
| def.sqlExpr: "dfpScore",
715+
| default: 0.0,
716+
| type: "NUMERIC"
717+
| },
718+
| ipIsOwnedByHostingService: {
719+
| def.sqlExpr: "ipIsOwnedByHostingService",
720+
| default: false,
721+
| type: "BOOLEAN"
722+
| },
723+
| isEmailDomainReputationCorpOwned: {
724+
| def.sqlExpr: "isEmailDomainReputationCorpOwned",
725+
| default: false,
726+
| type: "BOOLEAN"
727+
| },
728+
| numDaysActive: {
729+
| def.sqlExpr: "numDaysActive",
730+
| default: 0.0,
731+
| type: "NUMERIC"
732+
| },
733+
| orgIsAbusive: {
734+
| def.sqlExpr: "orgIsAbusive",
735+
| default: false,
736+
| type: "BOOLEAN"
737+
| },
738+
| orgIsOwnedByHostingService: {
739+
| def.sqlExpr: "orgIsOwnedByHostingService",
740+
| default: false,
741+
| type: "BOOLEAN"
742+
| },
743+
| postingMemberConnectionCount: {
744+
| def.sqlExpr: "postingMemberConnectionCount",
745+
| default: 0.0,
746+
| type: "NUMERIC"
747+
| },
748+
| useragentabusescore: {
749+
| def.sqlExpr: "useragentabusescore",
750+
| default: 0.0,
751+
| type: "NUMERIC"
752+
| },
753+
| jobPostingEmailDomain: {
754+
| def.sqlExpr: "jobPostingEmailDomain",
755+
| default: "",
756+
| type: "CATEGORICAL"
757+
| },
758+
| ipDataCountryCode: {
759+
| def.sqlExpr: "ipDataCountryCode",
760+
| default: "",
761+
| type: "CATEGORICAL"
762+
| },
763+
| jobPostCountryCode: {
764+
| def.sqlExpr: "jobPostCountryCode",
765+
| default: "",
766+
| type: "CATEGORICAL"
767+
| },
768+
| authorCountryCode: {
769+
| def.sqlExpr: "authorCountryCode",
770+
| default: "",
771+
| type: "CATEGORICAL"
772+
| },
773+
| wasWarmRegistration: {
774+
| def.sqlExpr: "wasWarmRegistration",
775+
| default: true,
776+
| type: "BOOLEAN"
777+
| },
778+
| emailDomainReputationRestrictionRatio: {
779+
| def.sqlExpr: "emailDomainReputationRestrictionRatio",
780+
| default: 0.0,
781+
| type: "NUMERIC"
782+
| }
783+
| }
784+
| }
785+
|
786+
| joinIpFeatures: {
787+
| source: "dalids:///u_secaggs.joinipfeatures_datepartitioned"
788+
| key.sqlExpr: ["memberid", "datepartition"]
789+
| features: {
790+
| joinIpFeaturesFedexFractionRestrictedReg: {
791+
| def.sqlExpr: "fractionrestrictedreg"
792+
| default: -1.0,
793+
| type: "NUMERIC"
794+
| }
795+
| }
796+
| }
797+
|
798+
| SafireFeatures: {
799+
| source: "/tmp/fraud/jobs-fraud-model/safirescoresandfeaturesAvroPCV2"
800+
| key.sqlExpr: ["memberId", "substring(lastActivityDate, 0, 10)"]
801+
| features: {
802+
| safireScore: {
803+
| def.sqlExpr: "versionInfo.finalScore",
804+
| default: 0.0,
805+
| type: "NUMERIC"
806+
| }
807+
| }
808+
| }
568809
|}
569810
""".stripMargin,
570811
observationDataPath = "anchorAndDerivations/testMVELLoopExpFeature-observations.csv")

0 commit comments

Comments
 (0)