diff --git a/pom.xml b/pom.xml index 6644b0f..10ded0c 100755 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 org.bigml - bigml-binding + bigml-binding-apricot 1.8.9 jar @@ -64,14 +64,14 @@ https://github.com/bigmlcom/bigml-java - + Clojars http://clojars.org/repo/ - + diff --git a/src/main/java/org/bigml/binding/LocalEnsemble.java b/src/main/java/org/bigml/binding/LocalEnsemble.java index c5ec38b..1b374cb 100755 --- a/src/main/java/org/bigml/binding/LocalEnsemble.java +++ b/src/main/java/org/bigml/binding/LocalEnsemble.java @@ -8,26 +8,26 @@ * reduce the latency for each prediction and let you use your models * offline. * - * Example usage (assuming that you have previously set up the - * BIGML_USERNAME and BIGML_API_KEY environment variables and that you + * Example usage (assuming that you have previously set up the + * BIGML_USERNAME and BIGML_API_KEY environment variables and that you * own the ensemble/id below): * * * import org.bigml.binding.LocalEnsemble; - * + * * // API client * BigMLClient api = new BigMLClient(); - * + * * JSONObject ensemble = api. - * getEnsemble("ensemble/5b39e6b9c7736e583400214c"); + * getEnsemble("ensemble/5b39e6b9c7736e583400214c"); * LocalEnsemble localEnsemble = new LocalEnsemble(ensemble) * * JSONObject predictors = JSONValue.parse(" - * {\"petal length\": 3, \"petal width\": 0.5, - * \"sepal length\": 1, \"sepal width\": 0.5}"); + * {\"petal length\": 3, \"petal width\": 0.5, + * \"sepal length\": 1, \"sepal width\": 0.5}"); * * localEnsemble.predict(predictors) - * + * */ package org.bigml.binding; @@ -43,1000 +43,1030 @@ /** * A local predictive Ensemble. - * + * * Uses a number of BigML remote models to build an ensemble local version that * can be used to generate prediction. 
- * + * */ public class LocalEnsemble extends ModelFields implements SupervisedModelInterface { - private static final long serialVersionUID = 1L; - - static String ENSEMBLE_RE = "^ensemble/[a-f,0-9]{24}$"; - - private static final int BOOSTING = 1; - - private static final String[] OPERATING_POINT_KINDS = { "probability", - "confidence", "votes" }; - - /** - * Logging - */ - static Logger logger = LoggerFactory - .getLogger(LocalEnsemble.class.getName()); - - private String ensembleId; - private String objectiveField = null; - private JSONObject boosting = null; - private JSONArray models; - private JSONObject model = null; - private List modelsSplit = new ArrayList(); - private String[] modelsIds; - private JSONArray distributions; - private JSONArray distribution; - private JSONObject importance; - private MultiModel multiModel; - private Boolean regression = false; - private JSONArray boostingOffsets; - private List classNames = new ArrayList(); - private Map fieldNames = new HashMap(); - - public LocalEnsemble(JSONObject ensemble) throws Exception { - this(ensemble, null); - } - - public LocalEnsemble(JSONObject ensemble, Integer maxModels) - throws Exception { - - super((JSONObject) Utils.getJSONObject(ensemble, "ensemble.fields", - new JSONObject())); - - // checks whether the information needed for local predictions - // is in the first argument - if (!checkModelFields(ensemble)) { - // if the fields used by the ensemble are not available, - // use only ID to retrieve it again - ensembleId = (String) ensemble.get("resource"); - boolean validId = ensembleId.matches(ENSEMBLE_RE); - if (!validId) { - throw new Exception( - ensembleId + " is not a valid resource ID."); - } - } - - if (!(ensemble.containsKey("resource") - && ensemble.get("resource") != null)) { - BigMLClient client = new BigMLClient(null, null, - BigMLClient.STORAGE); - ensemble = client.getEnsemble(ensembleId); - - if ((String) ensemble.get("resource") == null) { - throw new Exception( - 
ensembleId + " is not a valid resource ID."); - } - } - - if (ensemble.containsKey("object") - && ensemble.get("object") instanceof JSONObject) { - ensemble = (JSONObject) ensemble.get("object"); - } - - ensembleId = (String) ensemble.get("resource"); - - if (ensemble.containsKey("ensemble") - && ensemble.get("ensemble") instanceof JSONObject) { - - JSONObject status = (JSONObject) Utils.getJSONObject(ensemble, - "status"); - - if (status != null && status.containsKey("code") - && AbstractResource.FINISHED == ((Number) status - .get("code")).intValue()) { - - int ensebleType = ((Long) ensemble.get("type")).intValue(); - if (ensebleType == BOOSTING) { - boosting = (JSONObject) Utils.getJSONObject(ensemble, - "boosting"); - } - - JSONArray modelsJson = (JSONArray) ensemble.get("models"); - distributions = (JSONArray) ensemble.get("distributions"); - importance = (JSONObject) ensemble.get("importance"); - - int mn = modelsJson.size(); - modelsIds = new String[mn]; - for (int i = 0; i < mn; i++) { - modelsIds[i] = (String) modelsJson.get(i); - } - - JSONObject fields = (JSONObject) Utils.getJSONObject(ensemble, - "ensemble.fields", new JSONObject()); - - objectiveField = (String) Utils.getJSONObject(ensemble, - "objective_field"); - - // initialize ModelFields - super.initialize((JSONObject) fields, objectiveField, null, - null, true, true, true); - - } else { - throw new Exception("The lensemble isn't finished yet"); - } - } else { - throw new Exception( - String.format("Cannot create the Ensemble instance. 
" - + "Could not find the 'ensemble' key in " - + "the resource:\n\n%s", ensemble)); - } - - init(ensemble, maxModels); - } - - /** - * Constructor with a list of model references and the number of max models - * to use - * - * @param modelsIds - * the model/id of each model to be used in the ensemble - * @param maxModels - * the maximum number of models we will use in the ensemble null - * if we do not want a maxModels value - */ - public LocalEnsemble(List modelsIds, Integer maxModels) throws Exception { - this.modelsIds = (String[]) modelsIds - .toArray(new String[modelsIds.size()]); - - init(null, maxModels); - } - - protected void init(JSONObject ensemble, Integer maxModels) - throws Exception { - BigMLClient bigmlClient = new BigMLClient(); - models = new JSONArray(); - for (String id : modelsIds) { - models.add(bigmlClient.getModel(id)); - } - model = (JSONObject) models.get(0); - int numberOfModels = models.size(); - - maxModels = maxModels != null ? maxModels : numberOfModels; - int[] items = Utils.getRange(0, numberOfModels, maxModels); - for (int item : items) { - if (item + maxModels <= numberOfModels) { - JSONArray arrayOfModels = new JSONArray(); - arrayOfModels.addAll(models.subList(item, item + maxModels)); - modelsSplit.add(arrayOfModels); - } - } - - if (distributions != null) { - distributions = new JSONArray(); - for (Object model : models) { - JSONObject treDist = (JSONObject) Utils.getJSONObject( - (JSONObject) model, "model.tree.distribution"); - - if (treDist != null) { - JSONObject categories = new JSONObject(); - categories.put("categories", treDist); - JSONObject info = new JSONObject(); - info.put("training", categories); - distributions.add(info); - } else { - distributions = new JSONArray(); - break; - } - } - - if (distributions.size() == 0) { - for (Object model : models) { - distributions.add((JSONObject) Utils.getJSONObject( - (JSONObject) model, "object.model.distribution")); - } - } - } - - if (boosting == null) { - 
addModelsAttrs(model, maxModels); - } - - if (fields == null) { - calculateFields(); - - objectiveField = (String) Utils.getJSONObject(model, - "object.objective_field"); - } - - if (fields != null) { - JSONObject summary = (JSONObject) Utils.getJSONObject(fields, - objectiveField + ".summary"); - - if (summary != null) { - if (summary.get("bins") != null) { - distribution = (JSONArray) summary.get("bins"); - } else if (summary.get("counts") != null) { - distribution = (JSONArray) summary.get("counts"); - } else if (summary.get("categories") != null) { - distribution = (JSONArray) summary.get("categories"); - } - } - } - - String optype = (String) Utils.getJSONObject(fields, - objectiveField + ".optype"); - regression = "numeric".equals(optype); - if (boosting != null) { - if (regression) { - Double boostingOffset = ((Number) ensemble.get("initial_offset")).doubleValue(); - boostingOffsets = new JSONArray(); - boostingOffsets.add(boostingOffset); - } else { - boostingOffsets = (JSONArray) ensemble.get("initial_offsets"); - } - } - - if (!regression) { - JSONObject summary = (JSONObject) Utils.getJSONObject( - (JSONObject) fields.get(objectiveField), "summary"); - if (summary != null) { - JSONArray categories = (JSONArray) Utils.getJSONObject(summary, - "categories", new JSONArray()); - - for (Object cat : categories) { - classNames.add((String) ((JSONArray) cat).get(0)); - } - Collections.sort(classNames); - } - } - - if (modelsSplit.size() == 1) { - multiModel = new MultiModel(models, fields, classNames); - } - } - - /** - * Returns the resourceId - */ - public String getResourceId() { - return ensembleId; - } - - /** - * Returns the class names - */ - public List getClassNames() { - return classNames; - } - - /** - * Calculates the full list of fields used by this ensemble. It's obtained - * from the union of fields in all models of the ensemble. 
- */ - protected void calculateFields() { - fields = new JSONObject(); - fieldNames.clear(); - - for (int i = 0; i < this.modelsIds.length; i++) { - JSONObject model = (JSONObject) this.models.get(i); - JSONObject fields = (JSONObject) Utils.getJSONObject(model, - "object.model.fields"); - for (Object k : fields.keySet()) { - if (null != fields.get(k)) { - String fieldName = (String) ((JSONObject) fields.get(k)) - .get("name"); - this.fields.put(k, fields.get(k)); - this.fieldNames.put((String) k, fieldName); - } - } - } - } - - /** - * Adds the boosting and fields info when the ensemble is built from a list - * of models. They can be either Model objects or the model dictionary info - * structure. - */ - private void addModelsAttrs(JSONObject model, Integer maxModels) { - - boolean boostedEnsemble = (Boolean) Utils.getJSONObject(model, - "object.boosted_ensemble", false); - if (boostedEnsemble) { - this.boosting = (JSONObject) Utils.getJSONObject(model, - "object.boosting", null); - } - - if (this.boosting != null) { - throw new IllegalArgumentException( - "Failed to build the local ensemble. Boosted" - + "ensembles cannot be built from a list of " - + "boosting models."); - } - - if (fields == null) { - allModelsFields(maxModels); - objectiveFieldId = (String) Utils.getJSONObject(model, - "object.objective_field", null); - } - - } - - /** - * Retrieves the fields used as predictors in all the ensemble models - */ - private void allModelsFields(Integer maxModels) { - try { - for (Object split : modelsSplit.get(0)) { - LocalPredictiveModel localModel = new LocalPredictiveModel( - (JSONObject) split); - fields.putAll(localModel.getFields()); - } - } catch (Exception e) {} - } - - /** - * Computes the predicted distributions and combines them to give the final - * predicted distribution. Depending on the method parameter probability, - * votes or the confidence are used to weight the models. 
- * - */ - private List combineDistributions(JSONObject inputData, - MissingStrategy missingStrategy, PredictionMethod method) - throws Exception { - - if (method == null) { - method = PredictionMethod.PROBABILITY; - } - - MultiVoteList votes = null; - - if (modelsSplit != null && modelsSplit.size() > 1) { - // If there's more than one chunk of models, they must be - // sequentially used to generate the votes for the prediction - votes = new MultiVoteList(null); - - for (Object split : modelsSplit) { - JSONArray models = (JSONArray) split; - MultiModel multiModel = new MultiModel(models, fields, - classNames); - MultiVoteList modelVotes = multiModel.generateVotesDistribution( - inputData, missingStrategy, method); - votes.extend(modelVotes); - } - } else { - // When only one group of models is found you use the - // corresponding multimodel to predict - votes = multiModel.generateVotesDistribution(inputData, - missingStrategy, method); - } - - return votes.combineToDistribution(false); - } - - /** - * Computes field importance based on the field importance information of - * the individual models in the ensemble. 
- */ - public List getFieldImportanceData() { - - Map fieldImportance = new HashMap(); - - if (importance != null) { - fieldImportance = importance; - } else { - boolean useDistribution = false; - List importances = new ArrayList(); - - if (distributions != null && distributions.size() > 0) { - useDistribution = true; - for (Object item : distributions) { - JSONObject itemObj = (JSONObject) item; - useDistribution &= itemObj.containsKey("importance"); - if (!useDistribution) - break; - else { - importances.add((JSONArray) itemObj.get("importance")); - } - } - } - - if (useDistribution) { - for (JSONArray importance : importances) { - JSONArray importanceInfo = (JSONArray) importance; - - for (Object fieldInfo : importanceInfo) { - JSONArray fieldInfoArr = (JSONArray) fieldInfo; - String fieldId = (String) fieldInfoArr.get(0); - if (!fieldImportance.containsKey(fieldId)) { - fieldImportance.put(fieldId, 0.0); - String fieldName = (String) ((JSONObject) fields - .get(fieldId)).get("name"); - - JSONObject fieldNameObj = new JSONObject(); - fieldNameObj.put("name", fieldName); - } - - fieldImportance.put(fieldId, - fieldImportance.get(fieldId) - + ((Number) fieldInfoArr.get(1)) - .doubleValue()); - } - } - } else { - for (Object model : models) { - JSONObject modelObj = (JSONObject) model; - JSONArray fieldImportanceInfo = (JSONArray) Utils - .getJSONObject(modelObj, "object.model.importance"); - ; - for (Object fieldInfo : fieldImportanceInfo) { - JSONArray fieldInfoArr = (JSONArray) fieldInfo; - String fieldId = (String) fieldInfoArr.get(0); - - if (!fieldImportance.containsKey(fieldId)) { - fieldImportance.put(fieldId, 0.0); - - String fieldName = (String) ((JSONObject) fields - .get(fieldId)).get("name"); - - JSONObject fieldNameObj = new JSONObject(); - fieldNameObj.put("name", fieldName); - } - - fieldImportance.put(fieldId, - fieldImportance.get(fieldId) - + ((Number) fieldInfoArr.get(1)) - .doubleValue()); - } - } - } - - for (String fieldName : 
fieldImportance.keySet()) { - fieldImportance.put(fieldName, - fieldImportance.get(fieldName) / models.size()); - } - - } - - List fieldImportanceOrdered = new ArrayList(); - for (String fieldName : fieldImportance.keySet()) { - JSONArray fieldInfo = new JSONArray(); - fieldInfo.add(fieldName); - fieldInfo.add(fieldImportance.get(fieldName)); - fieldImportanceOrdered.add(fieldInfo); - } - - Collections.sort(fieldImportanceOrdered, new Comparator() { - @Override - public int compare(JSONArray jsonArray, JSONArray jsonArray2) { - return (((Number) jsonArray.get(1)) - .doubleValue() > ((Number) jsonArray2.get(1)) - .doubleValue() ? -1 : 1); - } - }); - - return fieldImportanceOrdered; - } - - /** - * Returns the required data distribution by adding the distributions in the - * models - */ - public JSONArray getDataDistribution(String distributionType) { - if (distributionType == null) { - distributionType = "training"; - } - - JSONObject categories = new JSONObject(); - if (distributions != null && distributions.size() > 0) { - for (Object item : distributions) { - JSONObject modelDist = (JSONObject) item; - JSONObject summary = (JSONObject) modelDist - .get(distributionType); - - JSONArray dist = new JSONArray(); - if (summary != null) { - if (summary.get("bins") != null) { - dist = (JSONArray) summary.get("bins"); - } else if (summary.get("counts") != null) { - dist = (JSONArray) summary.get("counts"); - } else if (summary.get("categories") != null) { - dist = (JSONArray) summary.get("categories"); - } - } - - for (Object distr : dist) { - JSONArray distInfo = (JSONArray) distr; - String category = (String) distInfo.get(0); - Long instances = (Long) distInfo.get(1); - - if (categories.containsKey(category)) { - Long current = (Long) categories.get(category); - categories.put(category, current + instances); - } else { - categories.put(category, instances); - } - } - - } - } - - JSONArray distribution = new JSONArray(); - for (Object cat : categories.keySet()) { - 
String category = (String) cat; - JSONArray item = new JSONArray(); - item.add(category); - item.add(categories.get(category)); - distribution.add(item); - } - - sortDistribution(distribution); - return distribution; - } - - /** - * Prints ensemble summary. Only field importance at present. - * - */ - public String summarize() throws IOException { - StringBuilder summarize = new StringBuilder(); - - JSONArray distribution = getDataDistribution("training"); - if (!distribution.isEmpty()) { - summarize.append("Data distribution:\n"); - summarize.append(Utils.printDistribution(distribution).toString()); - summarize.append("\n\n"); - } - - if (this.boosting == null) { - JSONArray predictions = getDataDistribution("predictions"); - if (!predictions.isEmpty()) { - summarize.append("Predicted distribution:\n"); - summarize.append( - Utils.printDistribution(distribution).toString()); - summarize.append("\n\n"); - } - } - - summarize.append("Field importance:\n"); - - distribution = new JSONArray(); - for (Object fieldItem : importance.keySet()) { - String fieldId = (String) fieldItem; - double value = (Double) importance.get(fieldId); - - JSONArray item = new JSONArray(); - item.add(fieldId); - item.add(value); - distribution.add(item); - } - - sortDistribution(distribution); - - int count = 1; - for (Object fieldItem : distribution) { - JSONArray field = (JSONArray) fieldItem; - - String fieldId = (String) field.get(0); - double value = (Double) field.get(1); - summarize.append(String.format(" %s. 
%s: %.2f%%\n", count++, - Utils.getJSONObject(fields, fieldId + ".name"), - (Utils.roundOff(value, 4) * 100))); - } - - return summarize.toString(); - } - - /** - * Sorting utility - * - */ - private void sortDistribution(JSONArray distribution) { - Collections.sort(distribution, new Comparator() { - @Override - public int compare(JSONArray o1, JSONArray o2) { - Object o1Val = o1.get(1); - Object o2Val = o2.get(1); - - if (o1Val instanceof Number) { - o1Val = ((Number) o1Val).doubleValue(); - o2Val = ((Number) o2Val).doubleValue(); - } - - return ((Comparable) o2Val).compareTo(o1Val); - } - }); - } - - /** - * Makes a prediction based on the prediction made by every model. - * - * @param inputData - * Input data to be predicted. - * @param method - * **deprecated**. Please check the operating_kind` attribute. - * Numeric key code for the following combination methods in - * classifications/regressions: 0 - majority vote (plurality)/ - * average: PLURALITY_CODE 1 - confidence weighted majority vote - * / error weighted: CONFIDENCE_CODE 2 - probability weighted - * majority vote / average: PROBABILITY_CODE 3 - threshold - * filtered vote / doesn't apply: THRESHOLD_CODE - * @param options Options to be used in threshold filtered votes. - * @param missingStrategy numeric key for the individual model's prediction - * method. See the model predict method. - * @param operatingPoint - * In classification models, this is the point of the ROC curve - * where the model will be used at. The operating point can be - * defined in terms of: - the positive_class, the class that is - * important to predict accurately - its kind: probability, - * confidence or voting - its threshold: the minimum established - * for the positive_class to be predicted. The operating_point is - * then defined as a map with three attributes, e.g.: - * {"positive_class": "Iris-setosa", "kind": "probability", - * "threshold": 0.5} - * @param operatingKind - * probability", "confidence" or "votes". 
Sets the property that - * decides the prediction. Used only if no operating_point is - * used - * @param median - * Uses the median of each individual model's predicted node as - * individual prediction for the specified combination method. - * @param full - * Boolean that controls whether to include the prediction's - * attributes. By default, only the prediction is produced. If - * set to True, the rest of available information is added in a - * dictionary format. The dictionary keys can be: - prediction: - * the prediction value - probability: prediction's probability - - * distribution: distribution of probabilities for each of the - * objective field classes - unused_fields: list of fields in the - * input data that - * - */ - public HashMap predict(JSONObject inputData, - PredictionMethod method, Map options, - MissingStrategy missingStrategy, - JSONObject operatingPoint, String operatingKind, - Boolean median, Boolean full) throws Exception { - - if (missingStrategy == null) { - missingStrategy = MissingStrategy.LAST_PREDICTION; - } - - if (median == null) { - median = false; - } - - if (full == null) { - full = false; - } - - // Checks and cleans inputData leaving the fields used in the model - inputData = filterInputData(inputData, full); - - List unusedFields = (List) inputData - .get("unusedFields"); - inputData = (JSONObject) inputData.get("newInputData"); - - // Strips affixes for numeric values and casts to the final field type - Utils.cast(inputData, fields); - - if (median && method == null) { - // predictions with median are only available with old combiners - method = PredictionMethod.PLURALITY; - } - - if (method == null && operatingPoint == null && operatingKind == null - && !median) { - // operating_point has precedence over operating_kind. 
If no - // combiner is set, default operating kind is "probability" - operatingKind = "probability"; - } - - // Operating Point - if (operatingPoint != null) { - if (regression) { - throw new IllegalArgumentException( - "The operatingPoint argument can only be" - + " used in classifications."); - } - - return predictOperating(inputData, missingStrategy, operatingPoint); - } - - if (operatingKind != null) { - if (regression) { - // for regressions, operating_kind defaults to the old combiners - method = "confidence".equals(operatingKind) - ? PredictionMethod.CONFIDENCE - : PredictionMethod.PLURALITY; - - return predict(inputData, method, options, missingStrategy, - null, null, null, full); - } else { - // predict operating point - return predictOperatingKind(inputData, missingStrategy, - operatingKind); - } - } - - MultiVote votes = null; - if (modelsSplit != null && modelsSplit.size() > 1) { - // If there's more than one chunk of models, they must be - // sequentially used to generate the votes for the prediction - votes = new MultiVote(); - for (Object split : modelsSplit) { - JSONArray models = (JSONArray) split; - MultiModel multiModel = new MultiModel(models, fields, null); - - MultiVote modelVotes = multiModel.generateVotes(inputData, - missingStrategy, unusedFields); - votes.extend(modelVotes); - } - } else { - // When only one group of models is found you use the - // corresponding multimodel to predict - MultiVote votesSplit = this.multiModel.generateVotes(inputData, - missingStrategy, unusedFields); - - votes = new MultiVote(votesSplit.predictions, boostingOffsets); - } - - if (this.boosting != null && !this.regression) { - options = new HashMap(); - JSONArray categories = (JSONArray) Utils.getJSONObject( - (JSONObject) fields.get(objectiveField), - "summary.categories", new JSONArray()); - options.put("categories", categories); - } - - HashMap results = votes.combine(method, options); - - HashMap prediction = new HashMap(); - for (Object key : 
results.keySet()) { - prediction.put((String) key, results.get(key)); - } - - if (full) { - prediction.put("unused_fields", unusedFields); - } - - return prediction; - } - - /** - * Computes the prediction based on a user-given operating point. - */ - private HashMap predictOperating(JSONObject inputData, - MissingStrategy missingStrategy, JSONObject operatingPoint) - throws Exception { - - if (missingStrategy == null) { - missingStrategy = MissingStrategy.LAST_PREDICTION; - } - - Object[] operating = Utils.parseOperatingPoint(operatingPoint, - OPERATING_POINT_KINDS, classNames); - - String kind = (String) operating[0]; - Double threshold = (Double) operating[1]; - String positiveClass = (String) operating[2]; - - if (!Arrays.asList(OPERATING_POINT_KINDS).contains(kind)) { - throw new IllegalArgumentException(String.format( - "Allowed operating kinds are %", OPERATING_POINT_KINDS)); - } - - JSONArray predictions = null; - if (kind.equals("probability")) { - predictions = predictProbability(inputData, missingStrategy); - } - if (kind.equals("confidence")) { - predictions = predictConfidence(inputData, missingStrategy); - } - if (kind.equals("votes")) { - predictions = predictVotes(inputData, missingStrategy); - } - - for (Object pred : predictions) { - HashMap prediction - = (HashMap) pred; - String category = (String) prediction.get("category"); - - prediction.put("prediction", prediction.get("category")); - prediction.remove("category"); - - if (category.equals(positiveClass) - && (Double) prediction.get(kind) > threshold) { - return prediction; - } - } - - HashMap prediction - = (HashMap) predictions.get(0); - String category = (String) prediction.get("prediction"); - if (category.equals(positiveClass)) { - prediction = (HashMap) predictions.get(1); - } - - return prediction; - } - - /** - * Computes the prediction based on a user-given operating kind, i.e, - * confidence, probability or votes. 
- */ - private HashMap predictOperatingKind(JSONObject inputData, - MissingStrategy missingStrategy, String operatingKind) - throws Exception { - - if (missingStrategy == null) { - missingStrategy = MissingStrategy.LAST_PREDICTION; - } - - String kind = operatingKind.toLowerCase(); - - if (boosting != null && !"probability".equals(kind)) { - throw new IllegalArgumentException( - "Only probability is allowed as operating kind " - + "for boosted ensembles."); - } - - if (!Arrays.asList(OPERATING_POINT_KINDS).contains(kind)) { - throw new IllegalArgumentException(String.format( - "Allowed operating kinds are %", OPERATING_POINT_KINDS)); - } - - JSONArray predictions = null; - if (kind.equals("probability")) { - predictions = predictProbability(inputData, missingStrategy); - } - if (kind.equals("confidence")) { - predictions = predictConfidence(inputData, missingStrategy); - } - if (kind.equals("votes")) { - predictions = predictVotes(inputData, missingStrategy); - } - - HashMap prediction - = (HashMap) predictions.get(0); - prediction.put("prediction", prediction.get("category")); - prediction.remove("category"); - - return prediction; - } - - - /** - * For classification models, Predicts a probability for each possible - * output class, based on input values. The input fields must be a - * dictionary keyed by field name or field ID. - * - * For regressions, the output is a single element list containing the - * prediction. 
- * - * @param inputData - * Input data to be predicted - * @param missingStrategy - * LAST_PREDICTION|PROPORTIONAL missing strategy for missing - * fields - */ - public JSONArray predictProbability(JSONObject inputData, - MissingStrategy missingStrategy) throws Exception { - - JSONArray predictions = new JSONArray(); - HashMap prediction = null; - if (regression) { - prediction = predict(inputData, PredictionMethod.PROBABILITY, null, - missingStrategy, null, null, null, true); - predictions.add(prediction); - } else { - if (boosting != null) { - prediction = predict(inputData, PredictionMethod.PLURALITY, - null, missingStrategy, null, null, null, true); - JSONArray probabilities = (JSONArray) prediction - .get("probabilities"); - predictions.add(probabilities); - } else { - List output = combineDistributions(inputData, - missingStrategy, null); - - for (int i = 0; i < classNames.size(); i++) { - prediction = new JSONObject(); - prediction.put("category", (String) classNames.get(i)); - prediction.put("probability", output.get(i)); - predictions.add(prediction); - } - } - } - - Utils.sortPredictions(predictions, "probability", "category"); - return predictions; - - } - - /** - * For classification models, Predicts a confidence for each possible output - * class, based on input values. The input fields must be a dictionary keyed - * by field name or field ID. - * - * For regressions, the output is a single element list containing the - * prediction. 
- * - * @param inputData - * Input data to be predicted - * @param missingStrategy - * LAST_PREDICTION|PROPORTIONAL missing strategy for missing - * fields - */ - private JSONArray predictConfidence(JSONObject inputData, - MissingStrategy missingStrategy) throws Exception { - - if (boosting != null) { - // we use boosting probabilities as confidences also - return predictProbability(inputData, missingStrategy); - } - - JSONArray predictions = new JSONArray(); - HashMap prediction = null; - if (regression) { - prediction = predict(inputData, PredictionMethod.CONFIDENCE, null, - missingStrategy, null, null, null, true); - predictions.add(prediction); - } else { - List output = combineDistributions(inputData, - missingStrategy, PredictionMethod.CONFIDENCE); - - for (int i = 0; i < classNames.size(); i++) { - prediction = new JSONObject(); - prediction.put("category", (String) classNames.get(i)); - prediction.put("confidence", output.get(i)); - predictions.add(prediction); - } - } - - Utils.sortPredictions(predictions, "confidence", "category"); - return predictions; - } - - /** - * For classification models, Predicts the votes for each possible output - * class, based on input values. The input fields must be a dictionary keyed - * by field name or field ID. - * - * For regressions, the output is a single element list containing the - * prediction. 
- * - * @param inputData - * Input data to be predicted - * @param missingStrategy - * LAST_PREDICTION|PROPORTIONAL missing strategy for missing - * fields - */ - private JSONArray predictVotes(JSONObject inputData, - MissingStrategy missingStrategy) throws Exception { - - JSONArray predictions = new JSONArray(); - HashMap prediction = null; - if (regression) { - prediction = predict(inputData, PredictionMethod.PLURALITY, null, - missingStrategy, null, null, null, true); - predictions.add(prediction); - } else { - if (boosting != null) { - throw new IllegalArgumentException( - "Votes cannot be computed for boosted ensembles."); - } else { - List output = combineDistributions(inputData, - missingStrategy, PredictionMethod.PLURALITY); - - for (int i = 0; i < classNames.size(); i++) { - prediction = new JSONObject(); - prediction.put("category", (String) classNames.get(i)); - prediction.put("votes", output.get(i)); - predictions.add(prediction); - } - } - } - - Utils.sortPredictions(predictions, "votes", "category"); - return predictions; - } + private static final long serialVersionUID = 1L; + + static String ENSEMBLE_RE = "^ensemble/[a-f,0-9]{24}$"; + + private static final int BOOSTING = 1; + + private static final String[] OPERATING_POINT_KINDS = { "probability", + "confidence", "votes" }; + + /** + * Logging + */ + static Logger logger = LoggerFactory + .getLogger(LocalEnsemble.class.getName()); + + private String ensembleId; + private BigMLClient bigmlClient; + private String objectiveField = null; + private JSONObject boosting = null; + private JSONArray models; + private JSONObject model = null; + private List modelsSplit = new ArrayList(); + private String[] modelsIds; + private JSONArray distributions; + private JSONArray distribution; + private JSONObject importance; + private MultiModel multiModel; + private Boolean regression = false; + private JSONArray boostingOffsets; + private List classNames = new ArrayList(); + private Map fieldNames = new 
HashMap(); + + public LocalEnsemble(JSONObject ensemble) throws Exception { + this(null, ensemble); + } + + public LocalEnsemble(BigMLClient bigmlClient, JSONObject ensemble) throws Exception { + this(bigmlClient, ensemble, null); + } + + public LocalEnsemble(JSONObject ensemble, Integer maxModels) throws Exception { + this(null, ensemble, maxModels); + } + + public LocalEnsemble(BigMLClient bigmlClient, JSONObject ensemble, Integer maxModels) + throws Exception { + + super((JSONObject) Utils.getJSONObject(ensemble, "ensemble.fields", + new JSONObject())); + + this.bigmlClient = + (bigmlClient != null) + ? bigmlClient + : new BigMLClient(null, null, BigMLClient.STORAGE); + + // checks whether the information needed for local predictions + // is in the first argument + if (!checkModelFields(ensemble)) { + // if the fields used by the ensemble are not available, + // use only ID to retrieve it again + ensembleId = (String) ensemble.get("resource"); + boolean validId = ensembleId.matches(ENSEMBLE_RE); + if (!validId) { + throw new Exception( + ensembleId + " is not a valid resource ID."); + } + } + + if (!(ensemble.containsKey("resource") + && ensemble.get("resource") != null)) { + ensemble = this.bigmlClient.getEnsemble(ensembleId); + + if ((String) ensemble.get("resource") == null) { + throw new Exception( + ensembleId + " is not a valid resource ID."); + } + } + + if (ensemble.containsKey("object") + && ensemble.get("object") instanceof JSONObject) { + ensemble = (JSONObject) ensemble.get("object"); + } + + ensembleId = (String) ensemble.get("resource"); + + if (ensemble.containsKey("ensemble") + && ensemble.get("ensemble") instanceof JSONObject) { + + JSONObject status = (JSONObject) Utils.getJSONObject(ensemble, + "status"); + + if (status != null && status.containsKey("code") + && AbstractResource.FINISHED == ((Number) status + .get("code")).intValue()) { + + int ensebleType = ((Long) ensemble.get("type")).intValue(); + if (ensebleType == BOOSTING) { + boosting 
= (JSONObject) Utils.getJSONObject(ensemble, + "boosting"); + } + + JSONArray modelsJson = (JSONArray) ensemble.get("models"); + distributions = (JSONArray) ensemble.get("distributions"); + importance = (JSONObject) ensemble.get("importance"); + + int mn = modelsJson.size(); + modelsIds = new String[mn]; + for (int i = 0; i < mn; i++) { + modelsIds[i] = (String) modelsJson.get(i); + } + + JSONObject fields = (JSONObject) Utils.getJSONObject(ensemble, + "ensemble.fields", new JSONObject()); + + objectiveField = (String) Utils.getJSONObject(ensemble, + "objective_field"); + + // initialize ModelFields + super.initialize((JSONObject) fields, objectiveField, null, + null, true, true, true); + + } else { + throw new Exception("The lensemble isn't finished yet"); + } + } else { + throw new Exception( + String.format("Cannot create the Ensemble instance. " + + "Could not find the 'ensemble' key in " + + "the resource:\n\n%s", ensemble)); + } + + init(ensemble, maxModels); + } + + /** + * Constructor with a list of model references and the number of max models + * to use + * + * @param modelsIds + * the model/id of each model to be used in the ensemble + * @param maxModels + * the maximum number of models we will use in the ensemble null + * if we do not want a maxModels value + */ + public LocalEnsemble(List modelsIds, Integer maxModels) throws Exception { + this(null, modelsIds, maxModels); + } + + public LocalEnsemble(BigMLClient bigmlClient, List modelsIds, Integer maxModels) throws Exception { + this.bigmlClient = + (bigmlClient != null) + ? 
bigmlClient + : new BigMLClient(null, null, BigMLClient.STORAGE); + + this.modelsIds = (String[]) modelsIds + .toArray(new String[modelsIds.size()]); + + init(null, maxModels); + } + + protected void init(JSONObject ensemble, Integer maxModels) + throws Exception { + models = new JSONArray(); + for (String id : modelsIds) { + models.add(this.bigmlClient.getModel(id)); + } + model = (JSONObject) models.get(0); + int numberOfModels = models.size(); + + maxModels = maxModels != null ? maxModels : numberOfModels; + int[] items = Utils.getRange(0, numberOfModels, maxModels); + for (int item : items) { + if (item + maxModels <= numberOfModels) { + JSONArray arrayOfModels = new JSONArray(); + arrayOfModels.addAll(models.subList(item, item + maxModels)); + modelsSplit.add(arrayOfModels); + } + } + + if (distributions != null) { + distributions = new JSONArray(); + for (Object model : models) { + JSONObject treDist = (JSONObject) Utils.getJSONObject( + (JSONObject) model, "model.tree.distribution"); + + if (treDist != null) { + JSONObject categories = new JSONObject(); + categories.put("categories", treDist); + JSONObject info = new JSONObject(); + info.put("training", categories); + distributions.add(info); + } else { + distributions = new JSONArray(); + break; + } + } + + if (distributions.size() == 0) { + for (Object model : models) { + distributions.add((JSONObject) Utils.getJSONObject( + (JSONObject) model, "object.model.distribution")); + } + } + } + + if (boosting == null) { + addModelsAttrs(model, maxModels); + } + + if (fields == null) { + calculateFields(); + + objectiveField = (String) Utils.getJSONObject(model, + "object.objective_field"); + } + + if (fields != null) { + JSONObject summary = (JSONObject) Utils.getJSONObject(fields, + objectiveField + ".summary"); + + if (summary != null) { + if (summary.get("bins") != null) { + distribution = (JSONArray) summary.get("bins"); + } else if (summary.get("counts") != null) { + distribution = (JSONArray) 
summary.get("counts"); + } else if (summary.get("categories") != null) { + distribution = (JSONArray) summary.get("categories"); + } + } + } + + String optype = (String) Utils.getJSONObject(fields, + objectiveField + ".optype"); + regression = "numeric".equals(optype); + if (boosting != null) { + if (regression) { + Double boostingOffset = ((Number) ensemble.get("initial_offset")).doubleValue(); + boostingOffsets = new JSONArray(); + boostingOffsets.add(boostingOffset); + } else { + boostingOffsets = (JSONArray) ensemble.get("initial_offsets"); + } + } + + if (!regression) { + JSONObject summary = (JSONObject) Utils.getJSONObject( + (JSONObject) fields.get(objectiveField), "summary"); + if (summary != null) { + JSONArray categories = (JSONArray) Utils.getJSONObject(summary, + "categories", new JSONArray()); + + for (Object cat : categories) { + classNames.add((String) ((JSONArray) cat).get(0)); + } + Collections.sort(classNames); + } + } + + if (modelsSplit.size() == 1) { + multiModel = new MultiModel(models, fields, classNames); + } + } + + /** + * Returns the resourceId + */ + public String getResourceId() { + return ensembleId; + } + + /** + * Returns the class names + */ + public List getClassNames() { + return classNames; + } + + /** + * Calculates the full list of fields used by this ensemble. It's obtained + * from the union of fields in all models of the ensemble. 
+ */ + protected void calculateFields() { + fields = new JSONObject(); + fieldNames.clear(); + + for (int i = 0; i < this.modelsIds.length; i++) { + JSONObject model = (JSONObject) this.models.get(i); + JSONObject fields = (JSONObject) Utils.getJSONObject(model, + "object.model.fields"); + for (Object k : fields.keySet()) { + if (null != fields.get(k)) { + String fieldName = (String) ((JSONObject) fields.get(k)) + .get("name"); + this.fields.put(k, fields.get(k)); + this.fieldNames.put((String) k, fieldName); + } + } + } + } + + /** + * Adds the boosting and fields info when the ensemble is built from a list + * of models. They can be either Model objects or the model dictionary info + * structure. + */ + private void addModelsAttrs(JSONObject model, Integer maxModels) { + + boolean boostedEnsemble = (Boolean) Utils.getJSONObject(model, + "object.boosted_ensemble", false); + if (boostedEnsemble) { + this.boosting = (JSONObject) Utils.getJSONObject(model, + "object.boosting", null); + } + + if (this.boosting != null) { + throw new IllegalArgumentException( + "Failed to build the local ensemble. Boosted" + + "ensembles cannot be built from a list of " + + "boosting models."); + } + + if (fields == null) { + allModelsFields(maxModels); + objectiveFieldId = (String) Utils.getJSONObject(model, + "object.objective_field", null); + } + + } + + /** + * Retrieves the fields used as predictors in all the ensemble models + */ + private void allModelsFields(Integer maxModels) { + try { + for (Object split : modelsSplit.get(0)) { + LocalPredictiveModel localModel = new LocalPredictiveModel( + (JSONObject) split); + fields.putAll(localModel.getFields()); + } + } catch (Exception e) {logger.error("Not good", e);} + } + + /** + * Computes the predicted distributions and combines them to give the final + * predicted distribution. Depending on the method parameter probability, + * votes or the confidence are used to weight the models. 
+ * + */ + private List combineDistributions(JSONObject inputData, + MissingStrategy missingStrategy, PredictionMethod method) + throws Exception { + + if (method == null) { + method = PredictionMethod.PROBABILITY; + } + + MultiVoteList votes = null; + + if (modelsSplit != null && modelsSplit.size() > 1) { + // If there's more than one chunk of models, they must be + // sequentially used to generate the votes for the prediction + votes = new MultiVoteList(null); + + for (Object split : modelsSplit) { + JSONArray models = (JSONArray) split; + MultiModel multiModel = new MultiModel(models, fields, + classNames); + MultiVoteList modelVotes = multiModel.generateVotesDistribution( + inputData, missingStrategy, method); + votes.extend(modelVotes); + } + } else { + // When only one group of models is found you use the + // corresponding multimodel to predict + votes = multiModel.generateVotesDistribution(inputData, + missingStrategy, method); + } + + return votes.combineToDistribution(false); + } + + /** + * Computes field importance based on the field importance information of + * the individual models in the ensemble. 
+ */ + public List getFieldImportanceData() { + + Map fieldImportance = new HashMap(); + + if (importance != null) { + fieldImportance = importance; + } else { + boolean useDistribution = false; + List importances = new ArrayList(); + + if (distributions != null && distributions.size() > 0) { + useDistribution = true; + for (Object item : distributions) { + JSONObject itemObj = (JSONObject) item; + useDistribution &= itemObj.containsKey("importance"); + if (!useDistribution) + break; + else { + importances.add((JSONArray) itemObj.get("importance")); + } + } + } + + if (useDistribution) { + for (JSONArray importance : importances) { + JSONArray importanceInfo = (JSONArray) importance; + + for (Object fieldInfo : importanceInfo) { + JSONArray fieldInfoArr = (JSONArray) fieldInfo; + String fieldId = (String) fieldInfoArr.get(0); + if (!fieldImportance.containsKey(fieldId)) { + fieldImportance.put(fieldId, 0.0); + String fieldName = (String) ((JSONObject) fields + .get(fieldId)).get("name"); + + JSONObject fieldNameObj = new JSONObject(); + fieldNameObj.put("name", fieldName); + } + + fieldImportance.put(fieldId, + fieldImportance.get(fieldId) + + ((Number) fieldInfoArr.get(1)) + .doubleValue()); + } + } + } else { + for (Object model : models) { + JSONObject modelObj = (JSONObject) model; + JSONArray fieldImportanceInfo = (JSONArray) Utils + .getJSONObject(modelObj, "object.model.importance"); + ; + for (Object fieldInfo : fieldImportanceInfo) { + JSONArray fieldInfoArr = (JSONArray) fieldInfo; + String fieldId = (String) fieldInfoArr.get(0); + + if (!fieldImportance.containsKey(fieldId)) { + fieldImportance.put(fieldId, 0.0); + + String fieldName = (String) ((JSONObject) fields + .get(fieldId)).get("name"); + + JSONObject fieldNameObj = new JSONObject(); + fieldNameObj.put("name", fieldName); + } + + fieldImportance.put(fieldId, + fieldImportance.get(fieldId) + + ((Number) fieldInfoArr.get(1)) + .doubleValue()); + } + } + } + + for (String fieldName : 
fieldImportance.keySet()) { + fieldImportance.put(fieldName, + fieldImportance.get(fieldName) / models.size()); + } + + } + + List fieldImportanceOrdered = new ArrayList(); + for (String fieldName : fieldImportance.keySet()) { + JSONArray fieldInfo = new JSONArray(); + fieldInfo.add(fieldName); + fieldInfo.add(fieldImportance.get(fieldName)); + fieldImportanceOrdered.add(fieldInfo); + } + + Collections.sort(fieldImportanceOrdered, new Comparator() { + @Override + public int compare(JSONArray jsonArray, JSONArray jsonArray2) { + return (((Number) jsonArray.get(1)) + .doubleValue() > ((Number) jsonArray2.get(1)) + .doubleValue() ? -1 : 1); + } + }); + + return fieldImportanceOrdered; + } + + /** + * Returns the required data distribution by adding the distributions in the + * models + */ + public JSONArray getDataDistribution(String distributionType) { + if (distributionType == null) { + distributionType = "training"; + } + + JSONObject categories = new JSONObject(); + if (distributions != null && distributions.size() > 0) { + for (Object item : distributions) { + JSONObject modelDist = (JSONObject) item; + JSONObject summary = (JSONObject) modelDist + .get(distributionType); + + JSONArray dist = new JSONArray(); + if (summary != null) { + if (summary.get("bins") != null) { + dist = (JSONArray) summary.get("bins"); + } else if (summary.get("counts") != null) { + dist = (JSONArray) summary.get("counts"); + } else if (summary.get("categories") != null) { + dist = (JSONArray) summary.get("categories"); + } + } + + for (Object distr : dist) { + JSONArray distInfo = (JSONArray) distr; + String category = (String) distInfo.get(0); + Long instances = (Long) distInfo.get(1); + + if (categories.containsKey(category)) { + Long current = (Long) categories.get(category); + categories.put(category, current + instances); + } else { + categories.put(category, instances); + } + } + + } + } + + JSONArray distribution = new JSONArray(); + for (Object cat : categories.keySet()) { + 
String category = (String) cat; + JSONArray item = new JSONArray(); + item.add(category); + item.add(categories.get(category)); + distribution.add(item); + } + + sortDistribution(distribution); + return distribution; + } + + /** + * Prints ensemble summary. Only field importance at present. + * + */ + public String summarize() throws IOException { + StringBuilder summarize = new StringBuilder(); + + JSONArray distribution = getDataDistribution("training"); + if (!distribution.isEmpty()) { + summarize.append("Data distribution:\n"); + summarize.append(Utils.printDistribution(distribution).toString()); + summarize.append("\n\n"); + } + + if (this.boosting == null) { + JSONArray predictions = getDataDistribution("predictions"); + if (!predictions.isEmpty()) { + summarize.append("Predicted distribution:\n"); + summarize.append( + Utils.printDistribution(distribution).toString()); + summarize.append("\n\n"); + } + } + + summarize.append("Field importance:\n"); + + distribution = new JSONArray(); + for (Object fieldItem : importance.keySet()) { + String fieldId = (String) fieldItem; + double value = (Double) importance.get(fieldId); + + JSONArray item = new JSONArray(); + item.add(fieldId); + item.add(value); + distribution.add(item); + } + + sortDistribution(distribution); + + int count = 1; + for (Object fieldItem : distribution) { + JSONArray field = (JSONArray) fieldItem; + + String fieldId = (String) field.get(0); + double value = (Double) field.get(1); + summarize.append(String.format(" %s. 
%s: %.2f%%\n", count++, + Utils.getJSONObject(fields, fieldId + ".name"), + (Utils.roundOff(value, 4) * 100))); + } + + return summarize.toString(); + } + + /** + * Sorting utility + * + */ + private void sortDistribution(JSONArray distribution) { + Collections.sort(distribution, new Comparator() { + @Override + public int compare(JSONArray o1, JSONArray o2) { + Object o1Val = o1.get(1); + Object o2Val = o2.get(1); + + if (o1Val instanceof Number) { + o1Val = ((Number) o1Val).doubleValue(); + o2Val = ((Number) o2Val).doubleValue(); + } + + return ((Comparable) o2Val).compareTo(o1Val); + } + }); + } + + /** + * Makes a prediction based on the prediction made by every model. + * + * @param inputData + * Input data to be predicted. + * @param method + * **deprecated**. Please check the operating_kind` attribute. + * Numeric key code for the following combination methods in + * classifications/regressions: 0 - majority vote (plurality)/ + * average: PLURALITY_CODE 1 - confidence weighted majority vote + * / error weighted: CONFIDENCE_CODE 2 - probability weighted + * majority vote / average: PROBABILITY_CODE 3 - threshold + * filtered vote / doesn't apply: THRESHOLD_CODE + * @param options Options to be used in threshold filtered votes. + * @param missingStrategy numeric key for the individual model's prediction + * method. See the model predict method. + * @param operatingPoint + * In classification models, this is the point of the ROC curve + * where the model will be used at. The operating point can be + * defined in terms of: - the positive_class, the class that is + * important to predict accurately - its kind: probability, + * confidence or voting - its threshold: the minimum established + * for the positive_class to be predicted. The operating_point is + * then defined as a map with three attributes, e.g.: + * {"positive_class": "Iris-setosa", "kind": "probability", + * "threshold": 0.5} + * @param operatingKind + * probability", "confidence" or "votes". 
Sets the property that + * decides the prediction. Used only if no operating_point is + * used + * @param median + * Uses the median of each individual model's predicted node as + * individual prediction for the specified combination method. + * @param full + * Boolean that controls whether to include the prediction's + * attributes. By default, only the prediction is produced. If + * set to True, the rest of available information is added in a + * dictionary format. The dictionary keys can be: - prediction: + * the prediction value - probability: prediction's probability - + * distribution: distribution of probabilities for each of the + * objective field classes - unused_fields: list of fields in the + * input data that + * + */ + public HashMap predict(JSONObject inputData, + PredictionMethod method, Map options, + MissingStrategy missingStrategy, + JSONObject operatingPoint, String operatingKind, + Boolean median, Boolean full) throws Exception { + + if (missingStrategy == null) { + missingStrategy = MissingStrategy.LAST_PREDICTION; + } + + if (median == null) { + median = false; + } + + if (full == null) { + full = false; + } + + // Checks and cleans inputData leaving the fields used in the model + inputData = filterInputData(inputData, full); + + List unusedFields = (List) inputData + .get("unusedFields"); + inputData = (JSONObject) inputData.get("newInputData"); + + // Strips affixes for numeric values and casts to the final field type + Utils.cast(inputData, fields); + + if (median && method == null) { + // predictions with median are only available with old combiners + method = PredictionMethod.PLURALITY; + } + + if (method == null && operatingPoint == null && operatingKind == null + && !median) { + // operating_point has precedence over operating_kind. 
If no + // combiner is set, default operating kind is "probability" + operatingKind = "probability"; + } + + // Operating Point + if (operatingPoint != null) { + if (regression) { + throw new IllegalArgumentException( + "The operatingPoint argument can only be" + + " used in classifications."); + } + + return predictOperating(inputData, missingStrategy, operatingPoint); + } + + if (operatingKind != null) { + if (regression) { + // for regressions, operating_kind defaults to the old combiners + method = "confidence".equals(operatingKind) + ? PredictionMethod.CONFIDENCE + : PredictionMethod.PLURALITY; + + return predict(inputData, method, options, missingStrategy, + null, null, null, full); + } else { + // predict operating point + return predictOperatingKind(inputData, missingStrategy, + operatingKind); + } + } + + MultiVote votes = null; + if (modelsSplit != null && modelsSplit.size() > 1) { + // If there's more than one chunk of models, they must be + // sequentially used to generate the votes for the prediction + votes = new MultiVote(); + for (Object split : modelsSplit) { + JSONArray models = (JSONArray) split; + MultiModel multiModel = new MultiModel(models, fields, null); + + MultiVote modelVotes = multiModel.generateVotes(inputData, + missingStrategy, unusedFields); + votes.extend(modelVotes); + } + } else { + // When only one group of models is found you use the + // corresponding multimodel to predict + MultiVote votesSplit = this.multiModel.generateVotes(inputData, + missingStrategy, unusedFields); + + votes = new MultiVote(votesSplit.predictions, boostingOffsets); + } + + if (this.boosting != null && !this.regression) { + options = new HashMap(); + JSONArray categories = (JSONArray) Utils.getJSONObject( + (JSONObject) fields.get(objectiveField), + "summary.categories", new JSONArray()); + options.put("categories", categories); + } + + HashMap results = votes.combine(method, options); + + HashMap prediction = new HashMap(); + for (Object key : 
results.keySet()) { + prediction.put((String) key, results.get(key)); + } + + if (full) { + prediction.put("unused_fields", unusedFields); + } + + return prediction; + } + + /** + * Computes the prediction based on a user-given operating point. + */ + private HashMap predictOperating(JSONObject inputData, + MissingStrategy missingStrategy, JSONObject operatingPoint) + throws Exception { + + if (missingStrategy == null) { + missingStrategy = MissingStrategy.LAST_PREDICTION; + } + + Object[] operating = Utils.parseOperatingPoint(operatingPoint, + OPERATING_POINT_KINDS, classNames); + + String kind = (String) operating[0]; + Double threshold = (Double) operating[1]; + String positiveClass = (String) operating[2]; + + if (!Arrays.asList(OPERATING_POINT_KINDS).contains(kind)) { + throw new IllegalArgumentException(String.format( + "Allowed operating kinds are %", OPERATING_POINT_KINDS)); + } + + JSONArray predictions = null; + if (kind.equals("probability")) { + predictions = predictProbability(inputData, missingStrategy); + } + if (kind.equals("confidence")) { + predictions = predictConfidence(inputData, missingStrategy); + } + if (kind.equals("votes")) { + predictions = predictVotes(inputData, missingStrategy); + } + + for (Object pred : predictions) { + HashMap prediction + = (HashMap) pred; + String category = (String) prediction.get("category"); + if (category != null) { + prediction.put("prediction", category); + prediction.remove("category"); + } + else { + category = (String) prediction.get("prediction"); + } + + if (category.equals(positiveClass) + && (Double) prediction.get(kind) > threshold) { + return prediction; + } + } + + HashMap prediction + = (HashMap) predictions.get(0); + String category = (String) prediction.get("prediction"); + if (category.equals(positiveClass)) { + prediction = (HashMap) predictions.get(1); + } + + return prediction; + } + + /** + * Computes the prediction based on a user-given operating kind, i.e, + * confidence, probability or 
votes. + */ + private HashMap predictOperatingKind(JSONObject inputData, + MissingStrategy missingStrategy, String operatingKind) + throws Exception { + + if (missingStrategy == null) { + missingStrategy = MissingStrategy.LAST_PREDICTION; + } + + String kind = operatingKind.toLowerCase(); + + if (boosting != null && !"probability".equals(kind)) { + throw new IllegalArgumentException( + "Only probability is allowed as operating kind " + + "for boosted ensembles."); + } + + if (!Arrays.asList(OPERATING_POINT_KINDS).contains(kind)) { + throw new IllegalArgumentException(String.format( + "Allowed operating kinds are %", OPERATING_POINT_KINDS)); + } + + JSONArray predictions = null; + if (kind.equals("probability")) { + predictions = predictProbability(inputData, missingStrategy); + } + if (kind.equals("confidence")) { + predictions = predictConfidence(inputData, missingStrategy); + } + if (kind.equals("votes")) { + predictions = predictVotes(inputData, missingStrategy); + } + + HashMap prediction + = (HashMap) predictions.get(0); + prediction.put("prediction", prediction.get("category")); + prediction.remove("category"); + + return prediction; + } + + + /** + * For classification models, Predicts a probability for each possible + * output class, based on input values. The input fields must be a + * dictionary keyed by field name or field ID. + * + * For regressions, the output is a single element list containing the + * prediction. 
+ * + * @param inputData + * Input data to be predicted + * @param missingStrategy + * LAST_PREDICTION|PROPORTIONAL missing strategy for missing + * fields + */ + public JSONArray predictProbability(JSONObject inputData, + MissingStrategy missingStrategy) throws Exception { + + JSONArray predictions = new JSONArray(); + HashMap prediction = null; + if (regression) { + prediction = predict(inputData, PredictionMethod.PROBABILITY, null, + missingStrategy, null, null, null, true); + predictions.add(prediction); + } else { + if (boosting != null) { + prediction = predict(inputData, PredictionMethod.PLURALITY, + null, missingStrategy, null, null, null, true); + // logger.info("*** PREDICTION is " + prediction); + JSONArray probabilities = (JSONArray) prediction + .get("probabilities"); + if (probabilities != null) { + predictions.add(probabilities); + } + else { + predictions.add(prediction); + } + } else { + List output = combineDistributions(inputData, + missingStrategy, null); + + for (int i = 0; i < classNames.size(); i++) { + prediction = new JSONObject(); + prediction.put("category", (String) classNames.get(i)); + prediction.put("probability", output.get(i)); + predictions.add(prediction); + } + } + } + + Utils.sortPredictions(predictions, "probability", "category"); + return predictions; + + } + + /** + * For classification models, Predicts a confidence for each possible output + * class, based on input values. The input fields must be a dictionary keyed + * by field name or field ID. + * + * For regressions, the output is a single element list containing the + * prediction. 
+ * + * @param inputData + * Input data to be predicted + * @param missingStrategy + * LAST_PREDICTION|PROPORTIONAL missing strategy for missing + * fields + */ + private JSONArray predictConfidence(JSONObject inputData, + MissingStrategy missingStrategy) throws Exception { + + if (boosting != null) { + // we use boosting probabilities as confidences also + return predictProbability(inputData, missingStrategy); + } + + JSONArray predictions = new JSONArray(); + HashMap prediction = null; + if (regression) { + prediction = predict(inputData, PredictionMethod.CONFIDENCE, null, + missingStrategy, null, null, null, true); + predictions.add(prediction); + } else { + List output = combineDistributions(inputData, + missingStrategy, PredictionMethod.CONFIDENCE); + + for (int i = 0; i < classNames.size(); i++) { + prediction = new JSONObject(); + prediction.put("category", (String) classNames.get(i)); + prediction.put("confidence", output.get(i)); + predictions.add(prediction); + } + } + + Utils.sortPredictions(predictions, "confidence", "category"); + return predictions; + } + + /** + * For classification models, Predicts the votes for each possible output + * class, based on input values. The input fields must be a dictionary keyed + * by field name or field ID. + * + * For regressions, the output is a single element list containing the + * prediction. 
+ * + * @param inputData + * Input data to be predicted + * @param missingStrategy + * LAST_PREDICTION|PROPORTIONAL missing strategy for missing + * fields + */ + private JSONArray predictVotes(JSONObject inputData, + MissingStrategy missingStrategy) throws Exception { + + JSONArray predictions = new JSONArray(); + HashMap prediction = null; + if (regression) { + prediction = predict(inputData, PredictionMethod.PLURALITY, null, + missingStrategy, null, null, null, true); + predictions.add(prediction); + } else { + if (boosting != null) { + throw new IllegalArgumentException( + "Votes cannot be computed for boosted ensembles."); + } else { + List output = combineDistributions(inputData, + missingStrategy, PredictionMethod.PLURALITY); + + for (int i = 0; i < classNames.size(); i++) { + prediction = new JSONObject(); + prediction.put("category", (String) classNames.get(i)); + prediction.put("votes", output.get(i)); + predictions.add(prediction); + } + } + } + + Utils.sortPredictions(predictions, "votes", "category"); + return predictions; + } } diff --git a/src/main/java/org/bigml/binding/LocalFusion.java b/src/main/java/org/bigml/binding/LocalFusion.java index c08d614..691e94b 100644 --- a/src/main/java/org/bigml/binding/LocalFusion.java +++ b/src/main/java/org/bigml/binding/LocalFusion.java @@ -39,21 +39,21 @@ * JSONObject predictors = JSONValue.parse("{\"petal length\": 3, \"petal width\": 1}"); * * localFusion.predict(predictors) - * + * */ public class LocalFusion extends ModelFields implements SupervisedModelInterface { private static final long serialVersionUID = 1L; - + static String FUSION_RE = "^fusion/[a-f,0-9]{24}$"; - + private static final String[] OPERATING_POINT_KINDS = {"probability"}; - - private static final String[] LOCAL_SUPERVISED = { + + private static final String[] LOCAL_SUPERVISED = { "model", "ensemble", "logisticregression", "deepnet", "fusion" }; - - + + /** * Logging */ @@ -68,20 +68,20 @@ public class LocalFusion extends ModelFields 
implements SupervisedModelInterface private Boolean regression = false; private List classNames = new ArrayList(); private Boolean missingNumerics = true; - - public LocalFusion(JSONObject fusion) + + public LocalFusion(JSONObject fusion) throws Exception { this(fusion, null); } - - - public LocalFusion(JSONObject fusion, Integer maxModels) + + + public LocalFusion(JSONObject fusion, Integer maxModels) throws Exception { - + super((JSONObject) Utils.getJSONObject( fusion, "fusion.fields", new JSONObject())); - - // checks whether the information needed for local predictions + + // checks whether the information needed for local predictions // is in the first argument if (!checkModelFields(fusion)) { // if the fields used by the logistic regression are not @@ -93,26 +93,26 @@ public LocalFusion(JSONObject fusion, Integer maxModels) fusionId + " is not a valid resource ID."); } } - + if (!(fusion.containsKey("resource") && fusion.get("resource") != null)) { - BigMLClient client = new BigMLClient(null, null, + BigMLClient client = new BigMLClient(null, null, BigMLClient.STORAGE); fusion = client.getFusion(fusionId); - + if ((String) fusion.get("resource") == null) { throw new Exception( fusionId + " is not a valid resource ID."); } } - + if (fusion.containsKey("object") && fusion.get("object") instanceof JSONObject) { fusion = (JSONObject) fusion.get("object"); } - + fusionId = (String) fusion.get("resource"); - + if (fusion.containsKey("fusion") && fusion.get("fusion") instanceof JSONObject) { @@ -122,10 +122,10 @@ public LocalFusion(JSONObject fusion, Integer maxModels) if (status != null && status.containsKey("code") && AbstractResource.FINISHED == ((Number) status .get("code")).intValue()) { - + JSONObject fusionInfo = (JSONObject) Utils .getJSONObject(fusion, "fusion"); - + modelsIds = new JSONArray(); for (Object modelId: (JSONArray) fusion.get("models")) { String model = null; @@ -133,35 +133,35 @@ public LocalFusion(JSONObject fusion, Integer maxModels) model 
= (String) modelId; } else { model = (String) ((JSONObject) modelId).get("id"); - + try { weights.add(((Number) ((JSONObject) modelId).get("weight")).doubleValue()); } catch (Exception e) { weights = new ArrayList(); } } - + modelsIds.add(model); - + String type = model.split("/")[0]; if (!Arrays.asList(LOCAL_SUPERVISED).contains(type)) { throw new IllegalArgumentException( String.format("The resource %s has not an allowed supervised model type.", OPERATING_POINT_KINDS)); } } - + missingNumerics = (Boolean) Utils.getJSONObject(fusion, "missing_numerics", true); - + JSONObject fields = (JSONObject) Utils.getJSONObject( fusionInfo, "fields", new JSONObject()); - + // initialize ModelFields super.initialize((JSONObject) fields, null, null, null, true, true, true); - + objectiveField = (String) Utils.getJSONObject( fusion, "objective_field"); - + // Apply maxModels int numberOfModels = modelsIds.size(); if( maxModels != null) { @@ -176,22 +176,22 @@ public LocalFusion(JSONObject fusion, Integer maxModels) } else { modelsSplit.add(modelsIds); } - + String optype = (String) Utils.getJSONObject( fields, objectiveField + ".optype"); - + regression = "numeric".equals(optype); if (!regression) { JSONArray categories = (JSONArray) Utils.getJSONObject( - (JSONObject) fields.get(objectiveField), + (JSONObject) fields.get(objectiveField), "summary.categories", new JSONArray()); - + for (Object cat: categories) { classNames.add((String) ((JSONArray) cat).get(0)); } Collections.sort(classNames); } - + } else { throw new Exception( "The Fusion isn't finished yet"); @@ -204,58 +204,58 @@ public LocalFusion(JSONObject fusion, Integer maxModels) + "the resource:\n\n%s", fusion)); } } - + /** * Returns the resourceId */ public String getResourceId() { return fusionId; } - + /** * Returns the class names */ public List getClassNames() { return classNames; } - + /** * For classification models, Predicts a probability for * each possible output class, based on input values. 
The input * fields must be a dictionary keyed by field name or field ID. - * + * * For regressions, the output is a single element list * containing the prediction. - * + * * @param inputData Input data to be predicted * @param missingStrategy LAST_PREDICTION|PROPORTIONAL missing strategy * for missing fields */ public JSONArray predictProbability( - JSONObject inputData, MissingStrategy missingStrategy) + JSONObject inputData, MissingStrategy missingStrategy) throws Exception { - + if (missingStrategy == null) { missingStrategy = MissingStrategy.LAST_PREDICTION; } - + MultiVoteList votes = new MultiVoteList(null); - + if (!this.missingNumerics) { Utils.checkNoMissingNumerics(inputData, this.fields, null); } - + BigMLClient bigmlClient = new BigMLClient(); - + for (Object modelSplit: modelsSplit) { MultiVoteList votesSplit = new MultiVoteList(null); - + List models = new ArrayList(); - + for (Object modelId: (JSONArray) modelSplit) { String type = ((String) modelId).split("/")[0]; JSONObject model = null; - + if ("model".equals(type)) { model = bigmlClient.getModel((String) modelId); models.add(new LocalPredictiveModel(model)); @@ -277,7 +277,7 @@ public JSONArray predictProbability( models.add(new LocalFusion(model)); } } - + JSONArray predictions; for (SupervisedModelInterface model: models) { try { @@ -288,17 +288,17 @@ public JSONArray predictProbability( // are found continue; } - + List predictionList = new ArrayList(); for (Object pred : predictions) { JSONObject p = (JSONObject) pred; predictionList.add((Double) p.get("probability")); } - + if (!this.weights.isEmpty()) { predictionList = weight(predictionList, model.getResourceId()); } - + // we need to check that all classes in the fusion // are also in the composing model if (!this.regression && !this.classNames.equals(model.getClassNames())) { @@ -306,13 +306,13 @@ public JSONArray predictProbability( predictionList = rearrangePrediction(model.getClassNames(), this.classNames, predictionList); } catch 
(Exception e) {} } - + votesSplit.append(predictionList); } - + votes.extend(votesSplit); } - + JSONArray output = new JSONArray(); if (this.regression) { double totalWeight = 1; @@ -322,7 +322,7 @@ public JSONArray predictProbability( totalWeight += w; } } - + double sum = 0.0; for (Object votesPreds: votes.predictions) { List preds = (List) votesPreds; @@ -330,9 +330,9 @@ public JSONArray predictProbability( sum += p; } } - + float divisor = ((Double) (votes.predictions.size() * totalWeight)).floatValue(); - + JSONObject prediction = new JSONObject(); prediction.put("prediction", sum / divisor); output.add(prediction); @@ -345,11 +345,11 @@ public JSONArray predictProbability( output.add(prediction); } } - + return output; } - - + + /** * Weighs the prediction according to the weight associated to the * current model in the fusion. @@ -360,7 +360,7 @@ private List weight(List predictions, String modelId) { } return predictions; } - + /** * Rearranges the probabilities in a compact array when the * list of classes in the destination resource does not match the @@ -368,7 +368,7 @@ private List weight(List predictions, String modelId) { */ private List rearrangePrediction( List originClasses, List destinationClasses, List predictions) { - + List newPrediction = new ArrayList(); for (String className: destinationClasses) { int originIndex = originClasses.indexOf(className); @@ -380,19 +380,19 @@ private List rearrangePrediction( } return newPrediction; } - - + + /** * Computes the prediction based on a user-given operating point. 
*/ private HashMap predictOperating( - JSONObject inputData, MissingStrategy missingStrategy, + JSONObject inputData, MissingStrategy missingStrategy, JSONObject operatingPoint) throws Exception { if (missingStrategy == null) { missingStrategy = MissingStrategy.LAST_PREDICTION; } - + // only probability is allowed as operating kind Object[] operating = Utils.parseOperatingPoint( operatingPoint, OPERATING_POINT_KINDS, classNames); @@ -400,41 +400,46 @@ private HashMap predictOperating( String kind = (String) operating[0]; Double threshold = (Double) operating[1]; String positiveClass = (String) operating[2]; - + if (!Arrays.asList(OPERATING_POINT_KINDS).contains(kind)) { throw new IllegalArgumentException( String.format("Allowed operating kinds are %", OPERATING_POINT_KINDS)); } - + JSONArray predictions = predictProbability( inputData, missingStrategy); - + for (Object pred: predictions) { HashMap prediction = (HashMap) pred; String category = (String) prediction.get("category"); - + if (category == null) category = (String) prediction.get("prediction"); + if (category.equals(positiveClass) && (Double) prediction.get(kind) > threshold) { return prediction; } } - - HashMap prediction + + HashMap prediction = (HashMap) predictions.get(0); String category = (String) prediction.get("category"); + if (category == null) category = (String) prediction.get("prediction"); + if (category.equals(positiveClass)) { prediction = (JSONObject) predictions.get(1); } - - prediction.put("prediction", prediction.get("category")); - prediction.remove("category"); + + if (prediction.get("category") != null) { + prediction.put("prediction", prediction.get("category")); + prediction.remove("category"); + } return prediction; } - - + + /** * Makes a prediction based on a number of field values. - * + * * @param inputData Input data to be predicted * @param missingStrategy numeric key for the individual model's * prediction method. 
See the model predict @@ -451,7 +456,7 @@ private HashMap predictOperating( * The operating_point is then defined as a map with * two attributes, e.g.: * {"positive_class": "Iris-setosa", - * "probability_threshold": 0.5} + * "probability_threshold": 0.5} * @param full * Boolean that controls whether to include the prediction's * attributes. By default, only the prediction is produced. If set @@ -461,13 +466,13 @@ private HashMap predictOperating( * - probability: prediction's probability * - unused_fields: list of fields in the input data that * are not being used in the model - * + * */ public HashMap predict( - JSONObject inputData, MissingStrategy missingStrategy, - JSONObject operatingPoint, Boolean full) + JSONObject inputData, MissingStrategy missingStrategy, + JSONObject operatingPoint, Boolean full) throws Exception { - + if (missingStrategy == null) { missingStrategy = MissingStrategy.LAST_PREDICTION; } @@ -475,21 +480,21 @@ public HashMap predict( if (full == null) { full = false; } - + // Checks and cleans inputData leaving the fields used in the model inputData = filterInputData(inputData, full); - - List unusedFields = (List) + + List unusedFields = (List) inputData.get("unusedFields"); inputData = (JSONObject) inputData.get("newInputData"); - + if (!this.missingNumerics) { Utils.checkNoMissingNumerics(inputData, this.fields, null); } - + // Strips affixes for numeric values and casts to the final field type Utils.cast(inputData, fields); - + // When operating_point is used, we need the probabilities // of all possible classes to decide, so se use // the `predict_probability` method @@ -499,28 +504,28 @@ public HashMap predict( "The operating_point argument can only be" + " used in classifications."); } - + HashMap prediction = predictOperating( inputData, missingStrategy, operatingPoint); return prediction; } - + JSONArray predictions = predictProbability( inputData, missingStrategy); - + if (!regression) { Utils.sortPredictions(predictions, 
"probability", "prediction"); } - - HashMap prediction + + HashMap prediction = (HashMap) predictions.get(0); - + // adding unused fields, if any if (full) { prediction.put("unused_fields", unusedFields); } - return prediction; + return prediction; } } diff --git a/src/main/java/org/bigml/binding/LocalLinearRegression.java b/src/main/java/org/bigml/binding/LocalLinearRegression.java index c830dff..00b4c60 100644 --- a/src/main/java/org/bigml/binding/LocalLinearRegression.java +++ b/src/main/java/org/bigml/binding/LocalLinearRegression.java @@ -3,7 +3,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.bigml.binding.utils.Utils; import org.json.simple.JSONArray; import org.json.simple.JSONObject; @@ -13,17 +12,17 @@ import java.util.List; import java.util.Map; +import org.bigml.binding.utils.Utils; import org.bigml.binding.resources.AbstractResource; - -import org.bigml.binding.LocalLinearRegression; import org.bigml.binding.laminar.MathOps; + import org.apache.commons.math3.distribution.TDistribution; /** * A local Predictive Linear Regression. * - * This module defines a Linear Regression to make predictions locally + * This module defines a Linear Regression to make predictions locally * or embedded into your application without needing to send requests to * BigML.io. * @@ -37,350 +36,358 @@ * * * import org.bigml.binding.LocalLinearRegression; - * + * * // API client * BigMLClient api = new BigMLClient(); * * JSONObject linearRegression = api. 
- * getLinearRegression("linearregression/5026965515526876630001b2"); + * getLinearRegression("linearregression/5026965515526876630001b2"); * LocalLinearRegression linear = - * LocalLinearRegression(linearRegression); + * LocalLinearRegression(linearRegression); * * JSONObject predictors = JSONValue.parse(" - * {\"petal length\": 3, \"petal width\": 1, - * \"sepal length\": 1, \"sepal width\": 0.5}"); + * {\"petal length\": 3, \"petal width\": 1, + * \"sepal length\": 1, \"sepal width\": 0.5}"); * * linear.predict(predictors, true) - * + * */ public class LocalLinearRegression extends ModelFields { - private static final long serialVersionUID = 1L; - - static String LINEARREGRESSION_RE = "^linearregression/[a-f,0-9]{24}$"; - - static HashMap EXPANSION_ATTRIBUTES = new HashMap(); - static { - EXPANSION_ATTRIBUTES.put("categorical", "categories"); - EXPANSION_ATTRIBUTES.put("text", "tag_clouds"); - EXPANSION_ATTRIBUTES.put("items", "items"); - } - - protected static final String[] OPTIONAL_FIELDS = { - "categorical", "text", "items", "datetime" }; - - private final String DUMMY = "dummy"; - private final String CONTRAST = "contrast"; - private final String OTHER = "other"; - - private final Double ALPHA_FACTOR = 0.975; // alpha = 0.05 - - - /** - * Logging - */ - static Logger logger = LoggerFactory - .getLogger(LocalLogisticRegression.class.getName()); - - private String linearRegressionId; - - private JSONArray inputFields = null; - private JSONObject datasetFieldTypes = null; - private String objectiveField = null; - private JSONArray objectiveFields = null; - private String weightField; - private JSONArray coefficients = null; - private Boolean bias; - private JSONObject fieldCodings; - private JSONObject stats = null; - private JSONArray invXtx = null; - private Double tcrit = null; - private Double meanSquaredError = null; - private Long numberOfParameters = null; - private Long numberOfSamples = null; - - - public LocalLinearRegression(JSONObject linear) 
throws Exception { - super((JSONObject) Utils.getJSONObject( - linear, "linear_regression.fields", new JSONObject())); - - // checks whether the information needed for local predictions - // is in the first argument - if (!checkModelFields(linear)) { - // if the fields used by the linear regression are not - // available, use only ID to retrieve it again - linearRegressionId = (String) linear.get("resource"); - boolean validId = linearRegressionId.matches( - LINEARREGRESSION_RE); - if (!validId) { - throw new Exception( - linearRegressionId + " is not a valid resource ID."); - } - } - - if (!(linear.containsKey("resource") - && linear.get("resource") != null)) { - BigMLClient client = new BigMLClient(null, null, - BigMLClient.STORAGE); - linear = client.getLogisticRegression(linearRegressionId); - - if ((String) linear.get("resource") == null) { - throw new Exception( - linearRegressionId + " is not a valid resource ID."); - } - } - - if (linear.containsKey("object") && - linear.get("object") instanceof JSONObject) { - linear = (JSONObject) linear.get("object"); - } - - linearRegressionId = (String) linear.get("resource"); - - // Check json structure + private static final long serialVersionUID = 1L; + + static String LINEARREGRESSION_RE = "^linearregression/[a-f,0-9]{24}$"; + + static HashMap EXPANSION_ATTRIBUTES = new HashMap(); + static { + EXPANSION_ATTRIBUTES.put("categorical", "categories"); + EXPANSION_ATTRIBUTES.put("text", "tag_clouds"); + EXPANSION_ATTRIBUTES.put("items", "items"); + } + + protected static final String[] OPTIONAL_FIELDS = { + "categorical", "text", "items", "datetime" }; + + private final String DUMMY = "dummy"; + private final String CONTRAST = "contrast"; + private final String OTHER = "other"; + + private final Double ALPHA_FACTOR = 0.975; // alpha = 0.05 + + + /** + * Logging + */ + static Logger logger = LoggerFactory + .getLogger(LocalLogisticRegression.class.getName()); + + private String linearRegressionId; + private BigMLClient 
bigmlClient; + private JSONArray inputFields = null; + private JSONObject datasetFieldTypes = null; + private String objectiveField = null; + private JSONArray objectiveFields = null; + private String weightField; + private JSONArray coefficients = null; + private Boolean bias; + private JSONObject fieldCodings; + private JSONObject stats = null; + private JSONArray invXtx = null; + private Double tcrit = null; + private Double meanSquaredError = null; + private Long numberOfParameters = null; + private Long numberOfSamples = null; + + + public LocalLinearRegression(JSONObject linear) throws Exception { + this(null, linear); + } + + public LocalLinearRegression(BigMLClient bigmlClient, + JSONObject linear) throws Exception { + super((JSONObject) Utils.getJSONObject( + linear, "linear_regression.fields", new JSONObject())); + + this.bigmlClient = + (bigmlClient != null) + ? bigmlClient + : new BigMLClient(null, null, BigMLClient.STORAGE); + + // checks whether the information needed for local predictions + // is in the first argument + if (!checkModelFields(linear)) { + // if the fields used by the linear regression are not + // available, use only ID to retrieve it again + linearRegressionId = (String) linear.get("resource"); + boolean validId = linearRegressionId.matches( + LINEARREGRESSION_RE); + if (!validId) { + throw new Exception( + linearRegressionId + " is not a valid resource ID."); + } + } + + if (!(linear.containsKey("resource") + && linear.get("resource") != null)) { + linear = this.bigmlClient.getLogisticRegression(linearRegressionId); + + if ((String) linear.get("resource") == null) { + throw new Exception( + linearRegressionId + " is not a valid resource ID."); + } + } + + if (linear.containsKey("object") && + linear.get("object") instanceof JSONObject) { + linear = (JSONObject) linear.get("object"); + } + + linearRegressionId = (String) linear.get("resource"); + + // Check json structure inputFields = (JSONArray) Utils.getJSONObject(linear, 
"input_fields"); datasetFieldTypes = (JSONObject) Utils.getJSONObject(linear, - "dataset_field_types"); + "dataset_field_types"); weightField = (String) Utils.getJSONObject(linear, "weight_field"); objectiveField = (String) Utils.getJSONObject(linear, - "objective_field"); + "objective_field"); objectiveFields = (JSONArray) Utils.getJSONObject(linear, - "objective_fields"); + "objective_fields"); if (datasetFieldTypes == null || inputFields == null - || (objectiveField == null && objectiveFields == null)) { - throw new Exception( - "Failed to find the linear regression expected " - + "JSON structure. Check your arguments."); - } - - if (linear.containsKey("linear_regression") - && linear.get("linear_regression") instanceof JSONObject) { - - JSONObject status = (JSONObject) Utils.getJSONObject(linear, - "status"); - - if (status != null && status.containsKey("code") - && AbstractResource.FINISHED == ((Number) status - .get("code")).intValue()) { - - JSONObject linearInfo = (JSONObject) Utils - .getJSONObject(linear, "linear_regression"); - - JSONObject fields = (JSONObject) Utils.getJSONObject( - linearInfo, "fields", new JSONObject()); - - if (inputFields == null) { - inputFields = new JSONArray(); - String[] inputFieldsArray = new String[fields.values().size()]; - for (Object fieldId : fields.keySet()) { - int columnNumber = ((Number) Utils.getJSONObject( - fields, fieldId + ".column_number")).intValue(); - inputFieldsArray[columnNumber] = (String) fieldId; - } - inputFields.addAll(Arrays.asList(inputFieldsArray)); - } - - coefficients = (JSONArray) Utils.getJSONObject( - linearInfo, "coefficients", new JSONArray()); - - bias = (Boolean) Utils.getJSONObject(linearInfo, "bias", true); - - // initialize ModelFields - super.initialize((JSONObject) fields, null, null, null, - true, true, true); - - Object fieldCodingsObj = (Object) Utils.getJSONObject( - linearInfo, "field_codings"); - if (fieldCodingsObj!=null && fieldCodingsObj instanceof JSONArray) { - 
formatFieldCodings((JSONArray) fieldCodingsObj); - } else { - fieldCodings = (JSONObject) Utils.getJSONObject( - linearInfo, "field_codings", new JSONObject()); - } - - for (Object field : fieldCodings.keySet()) { - String fieldId = (String) field; - if (!fields.containsKey(fieldId) && - this.invertedFields.containsKey(fieldId)) { - - JSONObject fieldObj = (JSONObject) fieldCodings.get(fieldId); - fieldObj.put(this.invertedFields.get(fieldId), - fieldCodings.get(fieldId)); - fieldCodings.remove(fieldId); - } - } - - this.numberOfParameters = (Long) Utils.getJSONObject( - linearInfo, "number_of_parameters"); - - stats = (JSONObject) Utils.getJSONObject( - linearInfo, "stats", new JSONObject()); - - if (stats != null && stats.containsKey("xtx_inverse")) { - this.invXtx = (JSONArray) Utils.getJSONObject(stats, "xtx_inverse"); - this.meanSquaredError = (Double) Utils.getJSONObject(stats, "mean_squared_error"); - this.numberOfSamples = (Long) Utils.getJSONObject(stats, "number_of_samples"); - - // to be used in predictions - TDistribution tdist = new TDistribution( - this.numberOfSamples - this.numberOfParameters); - - this.tcrit = tdist.inverseCumulativeProbability(ALPHA_FACTOR); - } - - } else { - throw new Exception( - "The linear regression isn't finished yet"); - } - - } else { - throw new Exception(String - .format("Cannot create the LinearRegression instance. 
" - + "Could not find the 'linear_regression' key in " - + "the resource:\n\n%s", linear)); - } - - } - - /** - * Returns the resourceId - */ - public String getResourceId() { - return linearRegressionId; - } - - - /** - * Changes the field codings format to the dict notation - * - */ - private void formatFieldCodings(JSONArray fieldCodingsArray) { - fieldCodings = new JSONObject(); - for (int i=0; i getTermsArray(List terms, - Map uniqueTerms, JSONObject field, String fieldId) { - - ArrayList termsArray = new ArrayList(); - - Double[] termsArrayAux = new Double[terms.size()]; - Arrays.fill(termsArrayAux, 0.0); - termsArray.addAll(Arrays.asList(termsArrayAux)); - - try { - Double frequency = (Double) uniqueTerms.get(fieldId); - int index = terms.indexOf(fieldId); - termsArray.set(index, frequency); - } catch (Exception e) { - if (uniqueTerms.get(fieldId) instanceof HashMap) { - HashMap map = (HashMap) uniqueTerms.get(fieldId); - for (Object key: map.keySet()) { - Double frequency = ((Number) map.get((String) key)).doubleValue(); - int index = terms.indexOf((String) key); - termsArray.set(index, frequency); - } - } else { - JSONObject jsonObject = (JSONObject) uniqueTerms.get(fieldId); - for (Object key: jsonObject.keySet()) { - Double frequency = ((Number) jsonObject.get((String) key)).doubleValue(); - int index = terms.indexOf((String) key); - termsArray.set(index, frequency); - } - } - } - - return termsArray; - } - - - /** - * Returns the prediction and the confidence intervals - * - * input_data: Input data to be predicted - */ - private ArrayList categoricalEncoding( - ArrayList newInputs, String fieldId, boolean compact) { - - JSONObject fieldCoding = (JSONObject) fieldCodings.get(fieldId); - JSONArray projections = (JSONArray) Utils.getJSONObject(fieldCoding, CONTRAST); - if (projections == null) { - projections = (JSONArray) Utils.getJSONObject(fieldCoding, OTHER); - } - - if (projections != null) { - JSONArray inputs = new JSONArray(); - 
inputs.add(newInputs); - - ArrayList> dots = MathOps.dot(projections, inputs); - for (List dot: dots) { - newInputs.add(dot.get(0)); - } - } - - if (compact && fieldCoding.get(DUMMY) != null) { - String dummyClass = (String) fieldCoding.get(DUMMY); - int index = ((List) this.categories.get(fieldId)).indexOf(dummyClass); - - ArrayList catNewInputs = new ArrayList(newInputs.subList(0, index)); - if (newInputs.size() > (index+1)) { - catNewInputs.addAll(newInputs.subList(index + 1, newInputs.size())); - } - newInputs = catNewInputs; - } - - return newInputs; - } - - - /** - * Computes the confidence interval for the prediction - */ - private HashMap confidenceBounds(ArrayList inputArray) { - HashMap confidenceBoounds = new HashMap(); - - JSONArray inputs = new JSONArray(); + || (objectiveField == null && objectiveFields == null)) { + throw new Exception( + "Failed to find the linear regression expected " + + "JSON structure. Check your arguments."); + } + + if (linear.containsKey("linear_regression") + && linear.get("linear_regression") instanceof JSONObject) { + + JSONObject status = (JSONObject) Utils.getJSONObject(linear, + "status"); + + if (status != null && status.containsKey("code") + && AbstractResource.FINISHED == ((Number) status + .get("code")).intValue()) { + + JSONObject linearInfo = (JSONObject) Utils + .getJSONObject(linear, "linear_regression"); + + JSONObject fields = (JSONObject) Utils.getJSONObject( + linearInfo, "fields", new JSONObject()); + + if (inputFields == null) { + inputFields = new JSONArray(); + String[] inputFieldsArray = new String[fields.values().size()]; + for (Object fieldId : fields.keySet()) { + int columnNumber = ((Number) Utils.getJSONObject( + fields, fieldId + ".column_number")).intValue(); + inputFieldsArray[columnNumber] = (String) fieldId; + } + inputFields.addAll(Arrays.asList(inputFieldsArray)); + } + + coefficients = (JSONArray) Utils.getJSONObject( + linearInfo, "coefficients", new JSONArray()); + + bias = (Boolean) 
Utils.getJSONObject(linearInfo, "bias", true); + + // initialize ModelFields + super.initialize((JSONObject) fields, null, null, null, + true, true, true); + + Object fieldCodingsObj = (Object) Utils.getJSONObject( + linearInfo, "field_codings"); + if (fieldCodingsObj!=null && fieldCodingsObj instanceof JSONArray) { + formatFieldCodings((JSONArray) fieldCodingsObj); + } else { + fieldCodings = (JSONObject) Utils.getJSONObject( + linearInfo, "field_codings", new JSONObject()); + } + + for (Object field : fieldCodings.keySet()) { + String fieldId = (String) field; + if (!fields.containsKey(fieldId) && + this.invertedFields.containsKey(fieldId)) { + + JSONObject fieldObj = (JSONObject) fieldCodings.get(fieldId); + fieldObj.put(this.invertedFields.get(fieldId), + fieldCodings.get(fieldId)); + fieldCodings.remove(fieldId); + } + } + + this.numberOfParameters = (Long) Utils.getJSONObject( + linearInfo, "number_of_parameters"); + + stats = (JSONObject) Utils.getJSONObject( + linearInfo, "stats", new JSONObject()); + + if (stats != null && stats.containsKey("xtx_inverse")) { + this.invXtx = (JSONArray) Utils.getJSONObject(stats, "xtx_inverse"); + this.meanSquaredError = (Double) Utils.getJSONObject(stats, "mean_squared_error"); + this.numberOfSamples = (Long) Utils.getJSONObject(stats, "number_of_samples"); + + // to be used in predictions + TDistribution tdist = new TDistribution( + this.numberOfSamples - this.numberOfParameters); + + this.tcrit = tdist.inverseCumulativeProbability(ALPHA_FACTOR); + } + + } else { + throw new Exception( + "The linear regression isn't finished yet"); + } + + } else { + throw new Exception(String + .format("Cannot create the LinearRegression instance. 
" + + "Could not find the 'linear_regression' key in " + + "the resource:\n\n%s", linear)); + } + + } + + /** + * Returns the resourceId + */ + public String getResourceId() { + return linearRegressionId; + } + + + /** + * Changes the field codings format to the dict notation + * + */ + private void formatFieldCodings(JSONArray fieldCodingsArray) { + fieldCodings = new JSONObject(); + for (int i=0; i getTermsArray(List terms, + Map uniqueTerms, JSONObject field, String fieldId) { + + ArrayList termsArray = new ArrayList(); + + Double[] termsArrayAux = new Double[terms.size()]; + Arrays.fill(termsArrayAux, 0.0); + termsArray.addAll(Arrays.asList(termsArrayAux)); + + try { + Double frequency = (Double) uniqueTerms.get(fieldId); + int index = terms.indexOf(fieldId); + termsArray.set(index, frequency); + } catch (Exception e) { + if (uniqueTerms.get(fieldId) instanceof HashMap) { + HashMap map = (HashMap) uniqueTerms.get(fieldId); + for (Object key: map.keySet()) { + Double frequency = ((Number) map.get((String) key)).doubleValue(); + int index = terms.indexOf((String) key); + termsArray.set(index, frequency); + } + } else { + JSONObject jsonObject = (JSONObject) uniqueTerms.get(fieldId); + for (Object key: jsonObject.keySet()) { + Double frequency = ((Number) jsonObject.get((String) key)).doubleValue(); + int index = terms.indexOf((String) key); + termsArray.set(index, frequency); + } + } + } + + return termsArray; + } + + + /** + * Returns the prediction and the confidence intervals + * + * input_data: Input data to be predicted + */ + private ArrayList categoricalEncoding( + ArrayList newInputs, String fieldId, boolean compact) { + + JSONObject fieldCoding = (JSONObject) fieldCodings.get(fieldId); + JSONArray projections = (JSONArray) Utils.getJSONObject(fieldCoding, CONTRAST); + if (projections == null) { + projections = (JSONArray) Utils.getJSONObject(fieldCoding, OTHER); + } + + if (projections != null) { + JSONArray inputs = new JSONArray(); + 
inputs.add(newInputs); + + ArrayList> dots = MathOps.dot(projections, inputs); + for (List dot: dots) { + newInputs.add(dot.get(0)); + } + } + + if (compact && fieldCoding.get(DUMMY) != null) { + String dummyClass = (String) fieldCoding.get(DUMMY); + int index = ((List) this.categories.get(fieldId)).indexOf(dummyClass); + + ArrayList catNewInputs = new ArrayList(newInputs.subList(0, index)); + if (newInputs.size() > (index+1)) { + catNewInputs.addAll(newInputs.subList(index + 1, newInputs.size())); + } + newInputs = catNewInputs; + } + + return newInputs; + } + + + /** + * Computes the confidence interval for the prediction + */ + private HashMap confidenceBounds(ArrayList inputArray) { + HashMap confidenceBoounds = new HashMap(); + + JSONArray inputs = new JSONArray(); inputs.add(inputArray); - - double product = MathOps.dot(MathOps.dot(inputs, this.invXtx), inputs).get(0).get(0); - - double confidenceInterval = 0; - double predictionInterval = 0; - - try { - if (this.meanSquaredError != 0) { - confidenceInterval = this.tcrit * Math.sqrt(this.meanSquaredError * product); - predictionInterval = this.tcrit * Math.sqrt(this.meanSquaredError * (product + 1)); - } - } catch (Exception e) {} - - confidenceBoounds.put("confidenceInterval", confidenceInterval); - confidenceBoounds.put("predictionInterval", predictionInterval); - return confidenceBoounds; - } - - - /** - * Creates an input array with the values in inputData and - * uniqueTerms and the following rules: + + double product = ((List)MathOps.dot(MathOps.dot(inputs, this.invXtx), inputs).get(0)).get(0); + + double confidenceInterval = 0; + double predictionInterval = 0; + + try { + if (this.meanSquaredError != 0) { + confidenceInterval = this.tcrit * Math.sqrt(this.meanSquaredError * product); + predictionInterval = this.tcrit * Math.sqrt(this.meanSquaredError * (product + 1)); + } + } catch (Exception e) {} + + confidenceBoounds.put("confidenceInterval", confidenceInterval); + 
confidenceBoounds.put("predictionInterval", predictionInterval); + return confidenceBoounds; + } + + + /** + * Creates an input array with the values in inputData and + * uniqueTerms and the following rules: * - fields are ordered as input_fields * - numeric fields contain the value or 0 if missing * - categorial fields are one-hot encoded and classes are sorted as @@ -390,140 +397,140 @@ private HashMap confidenceBounds(ArrayList inputArray) { * - text and items fields are expanded into their elements as found * in the corresponding summmary information and their values treated * as numerics. - * - */ - private ArrayList expandInput(JSONObject inputData, - Map uniqueTerms, boolean compact) { - - ArrayList inputArray = new ArrayList(); - - for (Object fieldIdent : inputFields) { - String fieldId = (String) fieldIdent; - JSONObject field = (JSONObject) fields.get(fieldId); - String optType = (String) Utils.getJSONObject(field, "optype"); - - boolean missings = false; - ArrayList newInputs = new ArrayList(); - - if ("numeric".equals(optType)) { - Double value = 0.0; - if (inputData.keySet().contains(fieldId)) { - value = ((Number) Utils.getJSONObject(inputData, fieldId, - 0)).doubleValue(); - } else { - missings = true; - value = 0.0; - } - newInputs.add(value); - } else { - List terms = null; - if ("categorical".equals(optType)) { - terms = (List) this.categories.get(fieldId); - } - if ("text".equals(optType)) { - terms = this.tagClouds.get(fieldId); - } - if ("items".equals(optType)) { - terms = this.items.get(fieldId); - } - - if (uniqueTerms.keySet().contains(fieldId)) { - newInputs = getTermsArray(terms, uniqueTerms, field, - fieldId); - } else { - Double[] newInputsAux = new Double[terms.size()]; - Arrays.fill(newInputsAux, 0.0); - newInputs.addAll(Arrays.asList(newInputsAux)); - missings = true; - } - } - - Integer missingCount = ((Number) Utils.getJSONObject( - (JSONObject) field, "summary.missing_count", 0)).intValue(); - JSONObject fieldCoding = 
(JSONObject) fieldCodings.get(fieldId); - - if (missingCount > 0 || - (optType.equals("categorical") && fieldCoding.get("dummy") == null )) { - newInputs.add(missings ? 1.0 : 0.0); - } - - if ("categorical".equals(optType)) { - newInputs = categoricalEncoding(newInputs, fieldId, compact); - } - - inputArray.addAll(newInputs); - } - - if (this.bias) { - inputArray.add(1.0); - } - - return inputArray; - } - - - /** - * Returns the prediction and the confidence intervals - * - * @param inputData Input data to be predicted - * @param full - * Boolean that controls whether to include the prediction's + * + */ + private ArrayList expandInput(JSONObject inputData, + Map uniqueTerms, boolean compact) { + + ArrayList inputArray = new ArrayList(); + + for (Object fieldIdent : inputFields) { + String fieldId = (String) fieldIdent; + JSONObject field = (JSONObject) fields.get(fieldId); + String optType = (String) Utils.getJSONObject(field, "optype"); + + boolean missings = false; + ArrayList newInputs = new ArrayList(); + + if ("numeric".equals(optType)) { + Double value = 0.0; + if (inputData.keySet().contains(fieldId)) { + value = ((Number) Utils.getJSONObject(inputData, fieldId, + 0)).doubleValue(); + } else { + missings = true; + value = 0.0; + } + newInputs.add(value); + } else { + List terms = null; + if ("categorical".equals(optType)) { + terms = (List) this.categories.get(fieldId); + } + if ("text".equals(optType)) { + terms = this.tagClouds.get(fieldId); + } + if ("items".equals(optType)) { + terms = this.items.get(fieldId); + } + + if (uniqueTerms.keySet().contains(fieldId)) { + newInputs = getTermsArray(terms, uniqueTerms, field, + fieldId); + } else { + Double[] newInputsAux = new Double[terms.size()]; + Arrays.fill(newInputsAux, 0.0); + newInputs.addAll(Arrays.asList(newInputsAux)); + missings = true; + } + } + + Integer missingCount = ((Number) Utils.getJSONObject( + (JSONObject) field, "summary.missing_count", 0)).intValue(); + JSONObject fieldCoding = 
(JSONObject) fieldCodings.get(fieldId); + + if (missingCount > 0 || + (optType.equals("categorical") && fieldCoding.get("dummy") == null )) { + newInputs.add(missings ? 1.0 : 0.0); + } + + if ("categorical".equals(optType)) { + newInputs = categoricalEncoding(newInputs, fieldId, compact); + } + + inputArray.addAll(newInputs); + } + + if (this.bias) { + inputArray.add(1.0); + } + + return inputArray; + } + + + /** + * Returns the prediction and the confidence intervals + * + * @param inputData Input data to be predicted + * @param full + * Boolean that controls whether to include the prediction's * attributes. By default, only the prediction is produced. If set * to True, the rest of available information is added in a * dictionary format. The dictionary keys can be: * - prediction: the prediction value * - unused_fields: list of fields in the input data that - * - */ - public HashMap predict( - JSONObject inputData, Boolean full) { - - if (full == null) { - full = false; - } - - // Checks and cleans inputData leaving the fields used in the model + * + */ + public HashMap predict( + JSONObject inputData, Boolean full) { + + if (full == null) { + full = false; + } + + // Checks and cleans inputData leaving the fields used in the model inputData = filterInputData(inputData, full); - - List unusedFields = (List) - inputData.get("unusedFields"); - inputData = (JSONObject) inputData.get("newInputData"); - - // Strips affixes for numeric values and casts to the final field type + + List unusedFields = (List) + inputData.get("unusedFields"); + inputData = (JSONObject) inputData.get("newInputData"); + + // Strips affixes for numeric values and casts to the final field type Utils.cast(inputData, fields); - + // In case that the training data has no missings, input data shouldn't Utils.checkNoTrainingMissings( - inputData, this.fields, this.weightField, - this.objectiveField); - + inputData, this.fields, this.weightField, + this.objectiveField); + // Computes text and 
categorical field expansion Map uniqueTerms = uniqueTerms(inputData); - + // Creates an input vector with the values for all expanded fields. ArrayList inputArray = expandInput(inputData, uniqueTerms, false); ArrayList compactInputArray = expandInput(inputData, uniqueTerms, true); - + JSONArray coefficientsList = new JSONArray(); coefficientsList.add(Utils.flattenList(this.coefficients)); JSONArray inputs = new JSONArray(); inputs.add(inputArray); - - ArrayList> dots = MathOps.dot(coefficientsList, inputs); - double prediction = dots.get(0).get(0); - + + ArrayList> dots = MathOps.dot(coefficientsList, inputs); + double prediction = dots.get(0).get(0); + HashMap result = new HashMap(); result.put("prediction", prediction); - + if (full) { - result.put("unused_fields", unusedFields); + result.put("unused_fields", unusedFields); } - + if (full && this.invXtx != null) { - result.put("confidence_bounds", confidenceBounds(compactInputArray)); + result.put("confidence_bounds", confidenceBounds(compactInputArray)); } - + return result; - } + } } diff --git a/src/main/java/org/bigml/binding/LocalPredictiveModel.java b/src/main/java/org/bigml/binding/LocalPredictiveModel.java index ffbd0d3..3e062eb 100755 --- a/src/main/java/org/bigml/binding/LocalPredictiveModel.java +++ b/src/main/java/org/bigml/binding/LocalPredictiveModel.java @@ -64,7 +64,7 @@ Example usage (assuming that you have previously set up the BIGML_USERNAME public class LocalPredictiveModel extends BaseModel implements PredictionConverter, SupervisedModelInterface { private static final long serialVersionUID = 1L; - + /** * Logging */ @@ -100,14 +100,14 @@ public class LocalPredictiveModel extends BaseModel implements PredictionConvert private static final String[] OPERATING_POINT_KINDS = { "probability", "confidence" }; - + private JSONObject root; private Tree tree; private BoostedTree boostedTree; private Map idsMap; private Map> terms = new HashMap>(); private int maxBins = 0; - + private Boolean 
regression = false; private JSONObject boosting = null; private List classNames = new ArrayList(); @@ -120,13 +120,13 @@ public class LocalPredictiveModel extends BaseModel implements PredictionConvert */ public LocalPredictiveModel(JSONObject model) throws Exception { super(model); - + try { if (model.containsKey("object") && model.get("object") instanceof JSONObject) { model = (JSONObject) model.get("object"); } - + // boosting models are to be handled using the BoostedTree // class boolean boostedEnsemble = (Boolean) Utils.getJSONObject( @@ -135,18 +135,18 @@ public LocalPredictiveModel(JSONObject model) throws Exception { this.boosting = (JSONObject) Utils.getJSONObject( model, "boosting", null); } - + String optype = (String) Utils.getJSONObject( fields, objectiveField + ".optype"); - - this.regression = + + this.regression = (!isBoosting() && "numeric".equals(optype) ) || (isBoosting() && boosting.get("objective_class") == null); - + this.root = (JSONObject) Utils.getJSONObject(model, "model.root"); this.idsMap = new HashMap(); - + if (isBoosting()) { this.boostedTree = new BoostedTree( root, this.fields, objectiveField); @@ -159,7 +159,7 @@ public LocalPredictiveModel(JSONObject model) throws Exception { treeInfo.put("max_bins", maxBins); this.tree = new Tree(root, this.fields, objectiveField, distribution, null, idsMap, true, treeInfo); - + if (this.tree.isRegression()) { this.maxBins = this.tree.getMaxBins(); } else { @@ -168,40 +168,40 @@ public LocalPredictiveModel(JSONObject model) throws Exception { classNames.add((String) ((JSONArray) dist).get(0)); } Collections.sort(classNames); - + JSONArray categories = (JSONArray) Utils.getJSONObject( - (JSONObject) fields.get(objectiveField), + (JSONObject) fields.get(objectiveField), "summary.categories", new JSONArray()); - + for (Object category: categories) { objectiveCategories.add((String) ((JSONArray) category).get(0)); } } } - + } catch (Exception e) { e.printStackTrace(); logger.error("Invalid model 
structure", e); throw new InvalidModelException(); } } - - + + /** * Returns the class names */ public List getClassNames() { return classNames; } - - + + /** * Correction term based on the training dataset distribution - * + * */ private HashMap laplacianTerm() { HashMap categoryMap = new HashMap(); - + JSONArray rootDist = (JSONArray) this.tree.getDistribution(); if (this.tree.getWeighted()) { for (Object dist: rootDist) { @@ -209,13 +209,13 @@ private HashMap laplacianTerm() { String cat = (String) category.get(0); categoryMap.put(cat, 0.0); } - + } else { double total = 0.0; for (Object dist: rootDist) { total += ((Number) ((JSONArray) dist).get(1)).doubleValue(); } - + for (Object dist: rootDist) { JSONArray category = (JSONArray) dist; String cat = (String) category.get(0); @@ -223,52 +223,52 @@ private HashMap laplacianTerm() { categoryMap.put(cat, value / total); } } - + return categoryMap; } - + /** * Describes and return the fields for this model. */ public JSONObject fields() { return isBoosting() ? boostedTree.listFields() : tree.listFields(); } - + /** * Sets the fields for this model. */ public void setFields(JSONObject fields) { this.fields = fields; } - + /** * Sets the classNames for this model. */ public void setClassNames(List classNames) { this.classNames = classNames; } - + /** * Checks if the tree is a regression problem */ public boolean isRegression() { return tree.isRegression(); } - + /** * Checks if the tree is a boosting problem */ public boolean isBoosting() { return this.boosting != null && this.boosting.size() > 0; } - + /** * Checks if the tree is a boosting problem */ public JSONObject getBoosting() { return this.boosting; } - + /** * Returns a list that includes all the leaves of the model. * @@ -277,7 +277,7 @@ public JSONObject getBoosting() { public List getLeaves() { return this.tree.getLeaves(null); } - + /** * Returns a list that includes all the leaves of the model. 
* @@ -289,7 +289,7 @@ public List getLeaves() { public List getLeaves(TreeNodeFilter filter) { return this.tree.getLeaves(filter); } - + /** * Returns a list that includes all the leaves of the model. * @@ -298,7 +298,7 @@ public List getLeaves(TreeNodeFilter filter) { public List getBoostedLeaves() { return this.boostedTree.getLeaves(); } - + /** * Returns True if the gini impurity of the node distribution * goes above the impurity threshold. @@ -313,7 +313,7 @@ public List getImpureLeaves(Double impurityThreshold) { "This method is available for non-boosting " + "categorization models only."); } - + final Double impurityThresholdToUse = (impurityThreshold == null ? DEFAULT_IMPURITY : impurityThreshold); @@ -325,8 +325,8 @@ public boolean filter(Tree node) { } }); } - - + + /** * Makes a prediction based on a number of field values. * @@ -337,7 +337,7 @@ public Prediction predict(final String args) throws InputDataParseException { return predict(args); } - + /** * Makes a prediction based on a number of field values. * @@ -345,12 +345,12 @@ public Prediction predict(final String args) */ public Prediction predict(final JSONObject args) throws Exception { - + return (Prediction) predict(args, MissingStrategy.LAST_PREDICTION, null, null, true); } - + /** - * Makes a prediction based on a number of field values using the + * Makes a prediction based on a number of field values using the * specified Missing Strategy * * The input fields must be keyed by field name. 
@@ -359,21 +359,21 @@ public Prediction predict(final JSONObject args, MissingStrategy strategy) throws Exception { return predict(args, strategy, null, null, true, null); } - - + + /** - * Makes a prediction based on a number of field values using a + * Makes a prediction based on a number of field values using a * Last Prediction Strategy * - * By default the input fields must be keyed by field name but you + * By default the input fields must be keyed by field name but you * can use `byName` to input them directly keyed by id. * */ @Deprecated - public Prediction predict(final String args, Boolean byName) + public Prediction predict(final String args, Boolean byName) throws InputDataParseException { - + if (byName == null) { byName = true; } @@ -387,7 +387,7 @@ public Prediction predict(final String args, Boolean byName) } /** - * Makes a prediction based on a number of field values using a + * Makes a prediction based on a number of field values using a * Last Prediction Strategy * * The input fields must be keyed by field name. @@ -399,7 +399,7 @@ public Prediction predict(final JSONObject args, Boolean byName) } /** - * Makes a prediction based on a number of field values using the + * Makes a prediction based on a number of field values using the * specified Missing Strategy * * The input fields must be keyed by field name. @@ -409,19 +409,19 @@ public Prediction predict(final JSONObject args, Boolean byName, MissingStrategy throws Exception { return predict(args, strategy, null, null, true, null); } - + /** * Makes a multiple predictions based on a number of field values using the Last Prediction strategy * * The input fields must be keyed by field name. 
- * + * * @deprecated */ public List predict(final JSONObject args, Boolean byName, Object multiple) throws InputDataParseException { return predict(args, byName, MissingStrategy.LAST_PREDICTION, multiple); } - + /** * Makes a multiple predictions based on a number of field values using the Last Prediction strategy * @@ -431,7 +431,7 @@ public List predict(final JSONObject args, Object multiple) throws InputDataParseException { return predict(args, MissingStrategy.LAST_PREDICTION, multiple); } - + /** * Convenience version of predict that take as inputs a map from field ids * or names to their values as Java objects. See also predict(String, @@ -446,7 +446,7 @@ public Prediction predictWithMap( .toJSONString(inputs)); return predict(inputObj, MissingStrategy.LAST_PREDICTION, null, null, true); } - + @Deprecated public Prediction predictWithMap( final Map inputs, Boolean byName, MissingStrategy missingStrategy) @@ -456,7 +456,7 @@ public Prediction predictWithMap( .toJSONString(inputs)); return predict(inputObj, missingStrategy, null, null, true, null); } - + public Prediction predictWithMap( final Map inputs, MissingStrategy missingStrategy) throws Exception { @@ -465,24 +465,24 @@ public Prediction predictWithMap( .toJSONString(inputs)); return predict(inputObj, missingStrategy, null, null, true, null); } - + @Deprecated public Prediction predictWithMap( final Map inputs, Boolean byName) throws Exception { - + return predictWithMap(inputs, byName, MissingStrategy.LAST_PREDICTION); } - + public Prediction predictWithMap( final Map inputs) throws Exception { - + JSONObject inputObj = (JSONObject) JSONValue.parse(JSONValue .toJSONString(inputs)); return predict(inputObj, MissingStrategy.LAST_PREDICTION, null, null, true); } - - + + /** * Makes a prediction based on a number of field values. 
* @@ -519,7 +519,7 @@ public List predict(final JSONObject args, Boolean byName, MissingSt throws InputDataParseException { return predict(args, strategy, multiple); } - + /** * Makes a prediction based on a number of field values. @@ -630,18 +630,18 @@ public List predict(final JSONObject args, MissingStrategy strategy, return outputs; } } - + public Prediction predict( - JSONObject inputData, MissingStrategy missingStrategy, - JSONObject operatingPoint, String operatingKind, Boolean full) + JSONObject inputData, MissingStrategy missingStrategy, + JSONObject operatingPoint, String operatingKind, Boolean full) throws Exception { - return predict(inputData, missingStrategy, operatingPoint, + return predict(inputData, missingStrategy, operatingPoint, operatingKind, full, null); } - + /** * Makes a prediction based on a number of field values. - * + * * @param inputData Input data to be predicted * @param missingStrategy LAST_PREDICTION|PROPORTIONAL missing strategy for * missing fields @@ -661,11 +661,11 @@ public Prediction predict( * or * {"positive_class": "Iris-setosa", * "confidence_threshold": 0.5} - * @param operatingKind - * "probability" or "confidence". Sets the property that - * decides the prediction. Used only if no operating_point + * @param operatingKind + * "probability" or "confidence". Sets the property that + * decides the prediction. Used only if no operating_point * is used - * + * * @param full * Boolean that controls whether to include the prediction's * attributes. By default, only the prediction is produced. 
If set @@ -683,34 +683,34 @@ public Prediction predict( * - max: maximum value of the training instances in the * predicted node * - median: median of the values of the training instances - * in the predicted node + * in the predicted node * - unused_fields: list of fields in the input data that * are not being used in the model */ public Prediction predict( - JSONObject inputData, MissingStrategy missingStrategy, - JSONObject operatingPoint, String operatingKind, Boolean full, + JSONObject inputData, MissingStrategy missingStrategy, + JSONObject operatingPoint, String operatingKind, Boolean full, List unusedFields) throws Exception { - + if (missingStrategy == null) { missingStrategy = MissingStrategy.LAST_PREDICTION; } - + if (full == null) { full = false; } - + // Checks and cleans inputData leaving the fields used in the model inputData = filterInputData(inputData, full); - + if (unusedFields == null) { unusedFields = (List) inputData.get("unusedFields"); } inputData = (JSONObject) inputData.get("newInputData"); - + // Strips affixes for numeric values and casts to the final field type Utils.cast(inputData, fields); - + // When operating_point is used, we need the probabilities // (or confidences) of all possible classes to decide, so se use // the `predict_probability` or `predict_confidence` methods @@ -720,135 +720,137 @@ public Prediction predict( "The operating_point argument can only be" + " used in classifications."); } - + return predictOperating(inputData, missingStrategy, operatingPoint); } - + if (operatingKind != null) { if (regression) { throw new IllegalArgumentException( "The operating_kind argument can only be" + " used in classifications."); } - + return predictOperatingKind(inputData, missingStrategy, operatingKind); } - + Prediction prediction = isBoosting() ? 
- this.boostedTree.predict(inputData, null, missingStrategy) : + this.boostedTree.predict(inputData, null, missingStrategy) : this.tree.predict(inputData, null, missingStrategy); - + if (isBoosting() && missingStrategy == MissingStrategy.PROPORTIONAL) { // output has to be recomputed and comes in a different format - + HashMap pred = (HashMap) prediction.get("prediction"); - + Double gSum = (Double) pred.get("g_sum"); Double hSum = (Double) pred.get("h_sum"); Long population = ((Number) prediction.get("count")).longValue(); List path = (List) prediction.get("path"); - + Long lambda = (Long) this.boosting.get("lambda"); - + prediction = new Prediction( (- gSum / (hSum + lambda)), population, path, null); } - + // next List children = (List) prediction.get("children"); - String field = (children == null || children.size() == 0 ? + String field = (children == null || children.size() == 0 ? null : ((AbstractTree) children.get(0)).getPredicate().getField()); if( field != null && fields.containsKey(field) ) { field = fieldsNameById.get(field); } prediction.setNext(field); prediction.remove("children"); - + if (!isBoosting() && !isRegression()) { String pred = (String) prediction.get("prediction"); HashMap probabilities = probabilities( (JSONArray) prediction.get("distribution")); prediction.put("probability", probabilities.get(pred)); } - + if (full) { prediction.put("unused_fields", unusedFields); } - + return prediction; } - - + + /** * Computes the probability of a distribution using a Laplacian correction */ private HashMap probabilities(JSONArray distribution) { HashMap categoryMap = laplacianTerm(); double total = this.tree.getWeighted() ? 
0 : 1; - for (Object item : distribution) { - JSONArray distInfo = (JSONArray) item; - String cat = (String) distInfo.get(0); - Double value = ((Number) distInfo.get(1)).doubleValue(); - - categoryMap.put(cat, categoryMap.get(cat) + value); - total += value; - } - - for (String key : categoryMap.keySet()) { + if (distribution != null) { + for (Object item : distribution) { + JSONArray distInfo = (JSONArray) item; + String cat = (String) distInfo.get(0); + Double value = ((Number) distInfo.get(1)).doubleValue(); + + categoryMap.put(cat, categoryMap.get(cat) + value); + total += value; + } + + for (String key : categoryMap.keySet()) { categoryMap.put(key, categoryMap.get(key) / total); - } + } + } return categoryMap; } - - + + /** - * + * */ private JSONArray toOutput(HashMap categoryMap, String key) { JSONArray output = new JSONArray(); - + for (String name: classNames) { Prediction element = new Prediction(); element.put("category", name); element.put(key, Utils.roundOff(categoryMap.get(name), Constants.PRECISION)); output.add(element); } - + return output; } - - + + /** * For classification models, Predicts a probability for * each possible output class, based on input values. The input * fields must be a dictionary keyed by field name or field ID. - * + * * For regressions, the output is a single element list * containing the prediction. 
- * + * * @param inputData Input data to be predicted * @param missingStrategy LAST_PREDICTION|PROPORTIONAL missing strategy * for missing fields */ public JSONArray predictProbability( - JSONObject inputData, MissingStrategy missingStrategy) + JSONObject inputData, MissingStrategy missingStrategy) throws Exception { JSONArray output = new JSONArray(); - + Prediction prediction = null; if (isBoosting() || isRegression()) { - prediction = predict(inputData, missingStrategy, + prediction = predict(inputData, missingStrategy, null, null, true); output.add(prediction); } else { - prediction = predict(inputData, missingStrategy, + prediction = predict(inputData, missingStrategy, null, null, true); HashMap categoryMap = probabilities( (JSONArray) prediction.get("distribution")); output = toOutput(categoryMap, "probability"); } - + return output; } @@ -856,7 +858,7 @@ public JSONArray predictProbability( * For classification models, Predicts a confidence for * each possible output class, based on input values. The input * fields must be a dictionary keyed by field name or field ID. - * + * * For regressions, the output is a single element list * containing the prediction. 
* @@ -865,14 +867,14 @@ public JSONArray predictProbability( * for missing fields */ public JSONArray predictConfidence( - JSONObject inputData, MissingStrategy missingStrategy) + JSONObject inputData, MissingStrategy missingStrategy) throws Exception { - + JSONArray output = new JSONArray(); - + Prediction prediction = null; if (isRegression()) { - prediction = predict(inputData, missingStrategy, + prediction = predict(inputData, missingStrategy, null, null, true); output.add(prediction); } else { @@ -882,51 +884,51 @@ public JSONArray predictConfidence( " models only."); } } - + HashMap categoryMap = new HashMap(); JSONArray distribution = tree.getDistribution(); for (Object item : distribution) { JSONArray distInfo = (JSONArray) item; categoryMap.put((String) distInfo.get(0), 0.0); } - - prediction = predict(inputData, missingStrategy, + + prediction = predict(inputData, missingStrategy, null, null, true); distribution = (JSONArray) prediction.get("distribution"); - + for (Object item : distribution) { JSONArray distInfo = (JSONArray) item; String name = (String) distInfo.get(0); categoryMap.put(name, Tree.wsConfidence(name, distribution)); } - + return toOutput(categoryMap, "confidence"); } - + /** * Computes the prediction based on a user-given operating point. 
*/ private Prediction predictOperating( - JSONObject inputData, MissingStrategy missingStrategy, + JSONObject inputData, MissingStrategy missingStrategy, JSONObject operatingPoint) throws Exception { - + if (missingStrategy == null) { missingStrategy = MissingStrategy.LAST_PREDICTION; } - + Object[] operating = Utils.parseOperatingPoint( operatingPoint, OPERATING_POINT_KINDS, classNames); String kind = (String) operating[0]; Double threshold = (Double) operating[1]; String positiveClass = (String) operating[2]; - - JSONArray predictions = null; + + JSONArray predictions = null; if (kind.equals("probability")) { predictions = predictProbability(inputData, missingStrategy); } else { predictions = predictConfidence(inputData, missingStrategy); } - + for (Object pred: predictions) { Prediction prediction = (Prediction) pred; String category = (String) prediction.get("category"); @@ -937,7 +939,7 @@ private Prediction predictOperating( return prediction; } } - + Prediction prediction = (Prediction) predictions.get(0); String category = (String) prediction.get("category"); if (category.equals(positiveClass)) { @@ -945,46 +947,46 @@ private Prediction predictOperating( } prediction.put("prediction", prediction.get("category")); prediction.remove("category"); - + return prediction; } - - + + /** * Computes the prediction based on a user-given operating kind. 
*/ private Prediction predictOperatingKind( - JSONObject inputData, MissingStrategy missingStrategy, + JSONObject inputData, MissingStrategy missingStrategy, String operatingKind) throws Exception { - + if (missingStrategy == null) { missingStrategy = MissingStrategy.LAST_PREDICTION; } - + String kind = operatingKind.toLowerCase(); if (!Arrays.asList(OPERATING_POINT_KINDS).contains(kind)) { throw new IllegalArgumentException( String.format("Allowed operating kinds are %", OPERATING_POINT_KINDS)); } - - JSONArray predictions = null; + + JSONArray predictions = null; if (kind.equals("probability")) { predictions = predictProbability(inputData, missingStrategy); } else { predictions = predictConfidence(inputData, missingStrategy); } - + sortPredictions(predictions, kind); - + Prediction prediction = (Prediction) predictions.get(0); prediction.put("prediction", prediction.get("category")); prediction.remove("category"); - - return prediction; + + return prediction; } - - - + + + /** * Builds the list of ids that go from a given id to the tree root */ @@ -1020,7 +1022,7 @@ public String rules() { throw new IllegalArgumentException( "This method is not available for boosting models. "); } - + return tree.rules(Predicate.RuleLanguage.PSEUDOCODE); } @@ -1032,7 +1034,7 @@ public String rules(Predicate.RuleLanguage language) { throw new IllegalArgumentException( "This method is not available for boosting models. "); } - + return tree.rules(language); } @@ -1044,11 +1046,11 @@ public String rules(Predicate.RuleLanguage language, final String filterId, bool throw new IllegalArgumentException( "This method is not available for boosting models. "); } - + List idsPath = getIdsPath(filterId); return tree.rules(language, idsPath, subtree); } - + /** * Given a prediction string, returns its value in the required type * @@ -1059,7 +1061,7 @@ public Object toPrediction(String valueAsString, Locale locale) { locale = (locale != null ? 
locale : BigMLClient.DEFAUL_LOCALE); String objectiveFieldName = isBoosting() ? - boostedTree.getObjectiveField() : + boostedTree.getObjectiveField() : tree.getObjectiveField(); if( "numeric".equals(Utils.getJSONObject(fields, objectiveFieldName + ".optype")) ) { String dataTypeStr = (String) Utils.getJSONObject(fields, objectiveFieldName + ".'datatype'"); @@ -1140,7 +1142,7 @@ private List getTreeArray(boolean leavesOnly) { throw new IllegalArgumentException( "This method is not available for boosting models. "); } - + List headerNames = new ArrayList(); // Adding the objective field name @@ -1171,14 +1173,14 @@ private List getTreeArray(boolean leavesOnly) { /** * Outputs the node structure to in CSV file, including the */ - public void exportTreeCSV(String outputFilePath, boolean leavesOnly) + public void exportTreeCSV(String outputFilePath, boolean leavesOnly) throws IOException { - + if (isBoosting()) { throw new IllegalArgumentException( "This method is not available for boosting models. "); } - + List rows = getTreeArray(leavesOnly); Writer treeFile = null; @@ -1207,7 +1209,7 @@ public void exportTreeCSV(String outputFilePath, boolean leavesOnly) } } - + /** * Groups in categories or bins the predicted data * @@ -1224,12 +1226,12 @@ public void exportTreeCSV(String outputFilePath, boolean leavesOnly) * - impurity */ public Map getGroupPrediction() { - + if (isBoosting()) { throw new IllegalArgumentException( "This method is not available for boosting models. 
"); } - + Map groups = new HashMap(); JSONArray distribution = tree.getDistribution(); @@ -1275,7 +1277,7 @@ private void addToGroups(Map groups, List pa * * Used by getGroupPrediction() */ - private long getDepthFirstSearch(Map groups, + private long getDepthFirstSearch(Map groups, Tree tree, List path) { if( path == null ) { path= new ArrayList(); @@ -1321,12 +1323,12 @@ private long getDepthFirstSearch(Map groups, * Returns training data distribution */ public JSONArray getDataDistribution() { - + if (isBoosting()) { throw new IllegalArgumentException( "This method is not available for boosting models. "); } - + JSONArray distribution = new JSONArray(); distribution.addAll(tree.getDistribution()); @@ -1352,12 +1354,12 @@ public int compare(JSONArray o1, JSONArray o2) { * Returns model predicted distribution */ public JSONArray getPredictionDistribution() { - + if (isBoosting()) { throw new IllegalArgumentException( "This method is not available for boosting models. "); } - + return getPredictionDistribution(null); } @@ -1365,12 +1367,12 @@ public JSONArray getPredictionDistribution() { * Returns model predicted distribution */ public JSONArray getPredictionDistribution(Map groups) { - + if (isBoosting()) { throw new IllegalArgumentException( "This method is not available for boosting models. "); } - + if( groups == null ) { groups = getGroupPrediction(); } @@ -1411,12 +1413,12 @@ public int compare(JSONArray o1, JSONArray o2) { * */ public String summarize(Boolean addFieldImportance) throws IOException { - + if (isBoosting()) { throw new IllegalArgumentException( "This method is not available for boosting models. 
"); } - + StringBuilder summarize = new StringBuilder(); if( addFieldImportance == null ) { @@ -1596,7 +1598,7 @@ private String confidenceError(Object value, Double impurity) { // TODO: tableau - + private enum DataTypeEnum { DOUBLE, FLOAT, INTEGER, INT8, INT16, INT32, INT64, @@ -1690,12 +1692,12 @@ public void setImpurity(double impurity) { this.impurity = impurity; } } - - + + /** * Sorts the categories in the predicted node according to the * given criteria - * + * */ private void sortPredictions(JSONArray predictions, final String property) { Collections.sort(predictions, new Comparator() { @@ -1703,12 +1705,12 @@ private void sortPredictions(JSONArray predictions, final String property) { public int compare(Prediction o1, Prediction o2) { Double o1p = (Double) o1.get(property); Double o2p = (Double) o2.get(property); - + if (o1p.doubleValue() == o2p.doubleValue()) { return ((String) o1.get("category")). compareTo(((String) o2.get("category"))); } - + return o2p.compareTo(o1p); } }); diff --git a/src/main/java/org/bigml/binding/MultiVote.java b/src/main/java/org/bigml/binding/MultiVote.java index 58f7268..4079684 100755 --- a/src/main/java/org/bigml/binding/MultiVote.java +++ b/src/main/java/org/bigml/binding/MultiVote.java @@ -20,43 +20,43 @@ * Uses a number of predictions to generate a combined prediction. 
*/ public class MultiVote implements Serializable { - + private static final long serialVersionUID = 1L; - + /** * Logging */ static Logger LOGGER = LoggerFactory.getLogger(MultiVote.class.getName()); - - public final static String[] PREDICTION_HEADERS = new String[] { + + public final static String[] PREDICTION_HEADERS = new String[] { "prediction", "confidence", "order", "distribution", "count" }; - + private final static String[] COMBINATION_WEIGHTS = new String[] { null , "confidence", "probability", null, "weight" }; - - + + private final static String[][] WEIGHT_KEYS = new String[][] { {}, { "confidence" }, { "distribution", "count" }, {}, { "weight" } }; - + private final static String[] WEIGHT_LABELS = new String[] { "plurality", "confidence", "probability", "threshold" }; final static int BINS_LIMIT = 32; - + final static String BOOSTING_CLASS = "class"; - + public HashMap[] predictions; public boolean boosting = false; public JSONArray boostingOffsets; - - + + /** * MultiVote: combiner class for ensembles voting predictions. */ public MultiVote() { this(null, null); } - + /** * MultiVote: combiner class for ensembles voting predictions. 
* @@ -69,10 +69,10 @@ public MultiVote(HashMap[] predictionsArr, JSONArray boostingOff predictionsArr = new HashMap[0]; } predictions = predictionsArr; - + boosting = boostingOffsets != null && !boostingOffsets.isEmpty(); this.boostingOffsets = boostingOffsets; - + boolean allOrdered = true; for (i = 0, len = predictions.length; i < len; i++) { if (!predictions[i].containsKey("order")) { @@ -86,11 +86,11 @@ public MultiVote(HashMap[] predictionsArr, JSONArray boostingOff } } } - + public HashMap[] getPredictions() { return predictions; } - + /** * Check if this is a regression model * @@ -99,7 +99,7 @@ public HashMap[] getPredictions() { private boolean isRegression() { int index, len; HashMap prediction; - + if (boosting) { for (index = 0, len = this.predictions.length; index < len; index++) { prediction = this.predictions[index]; @@ -109,7 +109,7 @@ private boolean isRegression() { } return false; } - + for (index = 0, len = this.predictions.length; index < len; index++) { prediction = this.predictions[index]; if (!(prediction.get("prediction") instanceof Number)) { @@ -118,7 +118,7 @@ private boolean isRegression() { } return true; }; - + /** * Return the next order to be assigned to a prediction * @@ -136,8 +136,8 @@ private int nextOrder() { return 0; } - - + + /** * Adds a new prediction into a list of predictions * @@ -175,7 +175,7 @@ public MultiVote append(HashMap predictionInfo) { return this; } - + /** * Adds a new prediction into a list of predictions * @@ -241,7 +241,7 @@ public MultiVote appendRow(List predictionRow, return this; } - + /** * Given a multi vote instance (a list of predictions), extends the list * with another list of predictions and adds the order information. @@ -271,7 +271,7 @@ public void extend(MultiVote votes) { predictions = (HashMap[]) predictionsList.toArray( new HashMap[predictionsList.size()] ); } } - + /** * Given a list of predictions, extends the list with another list of * predictions and adds the order information. 
For instance, @@ -302,7 +302,7 @@ public MultiVote extend(List> predictionsInfo) { } return this; } - + /** * Given a list of predictions, extends the list with another list of * predictions and adds the order information. For instance, @@ -346,8 +346,8 @@ public MultiVote extendRows(List> predictionsRows, return this; } - - + + /** * Singles out the votes for a chosen category and returns a prediction * for this category iff the number of votes reaches at least the given @@ -397,8 +397,8 @@ protected MultiVote singleOutCategory(Integer threshold, String category) { new HashMap[categoryPredictions.size()]), null); } } - - + + /** * Checks the presence of each of the keys in each of the predictions * @@ -422,8 +422,8 @@ private static boolean checkKeys(HashMap[] predictions, } return true; } - - + + /** * Normalizes error to a [0, top_range] range and builds probabilities * @@ -472,8 +472,8 @@ public Double normalizeError(Double topRange) { } return normalizeFactor; }; - - + + /** * Wilson score interval computation of the distribution for the prediction * @@ -489,7 +489,7 @@ public Double normalizeError(Double topRange) { */ protected static double wsConfidence(Object prediction, HashMap distribution, Integer n, Double z) { - + double norm, z2, n2, wsSqrt, p = distribution.get(prediction) .doubleValue(), zDefault = 1.96d; if (z == null) { @@ -520,11 +520,11 @@ protected static double wsConfidence(Object prediction, z2 = z * z; n2 = n * n; wsSqrt = Math.sqrt((p * (1 - p) / n) + (z2 / (4 * n2))); - + return Utils.roundOff((p + (z2 / (2 * n)) - (z * wsSqrt)) / (1 + (z2 / n)), Constants.PRECISION); } - - + + /** * Average for regression models' predictions * @@ -539,12 +539,12 @@ private HashMap avg() { for (i = 0, len = this.predictions.length; i < len; i++) { result += ((Number) this.predictions[i].get("prediction")) .doubleValue(); - + if (this.predictions[i].containsKey("median")) { medianResult += ((Number) this.predictions[i].get("median")) .doubleValue(); } - + 
confidence += ((Number) this.predictions[i].get("confidence")) .doubleValue(); @@ -566,7 +566,7 @@ private HashMap avg() { average.put("count", instances); return average; } - + /** * Returns the prediction combining votes using error to compute weight * @@ -593,12 +593,12 @@ public HashMap errorWeighted() { result += ((Number) prediction.get("prediction")).doubleValue() * ((Number) prediction.get("errorWeight")).doubleValue(); - + if (prediction.get("median") != null) { medianResult += ((Number) prediction.get("median")).doubleValue() * ((Number) prediction.get("errorWeight")).doubleValue(); } - + instances += ((Number) prediction.get("count")).longValue(); combinedError += ((Number) prediction.get("confidence")) @@ -617,8 +617,8 @@ public HashMap errorWeighted() { return newPrediction; }; - - + + /** * Average for regression models' predictions * @@ -626,43 +626,43 @@ public HashMap errorWeighted() { private Double weightedSum(HashMap[] predictions, String key) { Map prediction = new HashMap(); double weightedSum = 0; - + int index, len; for (index = 0, len = predictions.length; index < len; index++) { prediction = predictions[index]; Double pred = (Double) prediction.get("prediction"); Double weight = (Double) prediction.get(key); - + weightedSum += pred * weight; } - + return weightedSum; } - - + + /** - * Returns the softmax values from a distribution given as a + * Returns the softmax values from a distribution given as a * dictionary like: * {"category": {"probability": probability, "order": order}} */ private HashMap softmax(HashMap predictions) { double total = 0; - + HashMap normalized = new HashMap(); for (Map.Entry entry : predictions.entrySet()) { String key = (String) entry.getKey(); HashMap catInfo = (HashMap) entry.getValue(); - + Double probability = Math.exp((Double) catInfo.get("probability")); - + HashMap pred = new HashMap(); pred.put("probability", probability); pred.put("order", (Integer) catInfo.get("order")); normalized.put(key, pred); - + 
total += probability; } - + if (total != 0) { for (Map.Entry entry : normalized.entrySet()) { String key = (String) entry.getKey(); @@ -671,11 +671,11 @@ private HashMap softmax(HashMap predictions) { } return normalized; } - + return new HashMap(); } - - + + /** * Combines the predictions for a boosted classification ensemble * Applies the regression boosting combiner, but per class. Tie breaks @@ -683,7 +683,7 @@ private HashMap softmax(HashMap predictions) { */ private HashMap classifictionBoostingCombiner(Map options) { HashMap prediction = new HashMap(); - + int index, len; Map groupedPredictions = new HashMap(); for (index = 0, len = this.predictions.length; index < len; index++) { @@ -697,7 +697,7 @@ private HashMap classifictionBoostingCombiner(Map options) { ((List>) groupedPredictions.get(objectiveClass)).add(prediction); } } - + List categories = new ArrayList(); for (Object cats: (JSONArray) options.get("categories")) { JSONArray cat = (JSONArray) cats; @@ -705,10 +705,11 @@ private HashMap classifictionBoostingCombiner(Map options) { } HashMap predictions = new HashMap(); + for (Map.Entry entry : groupedPredictions.entrySet()) { String key = entry.getKey(); ArrayList value = (ArrayList) entry.getValue(); - + Double boostingOffset = null; for (Object bOffset: (JSONArray) boostingOffsets) { JSONArray offset = (JSONArray) bOffset; @@ -717,31 +718,30 @@ private HashMap classifictionBoostingCombiner(Map options) { break; } } - + HashMap[] preds = new HashMap[value.size()]; for (index = 0, len = preds.length; index < len; index++) { preds[index] = (HashMap) value.get(index); } - + HashMap pred = new HashMap(); - pred.put("probability", + pred.put("probability", weightedSum(preds, "weight") + boostingOffset); pred.put("order", categories.indexOf(key)); predictions.put(key, pred); } - + predictions = softmax(predictions); - String predictionName = (String) predictions.keySet().toArray()[0]; HashMap predictionInfo = (HashMap) predictions.get(predictionName); - 
+ for (Map.Entry entry : predictions.entrySet()) { String key = (String) entry.getKey(); HashMap predInfo = (HashMap) entry.getValue(); - + Double predProbability = (Double) predInfo.get("probability"); Double predictionProbability = (Double) predictionInfo.get("probability"); - + if (predProbability > predictionProbability) { predictionName = key; predictionInfo = predInfo; @@ -758,11 +758,11 @@ private HashMap classifictionBoostingCombiner(Map options) { prediction.put("prediction", predictionName); prediction.put("probability", Utils.roundOff( (Double) predictionInfo.get("probability"), Constants.PRECISION)); - + return prediction; } - - + + /** * Creates a new predictions array based on the training data probability */ @@ -793,18 +793,18 @@ public HashMap[] probabilityWeight() { } order = (Integer) prediction.get("order"); - + HashMap distribution = (HashMap) prediction.get("distribution"); for (Object key : distribution.keySet()) { - Map newPred = new HashMap(); + Map newPred = new HashMap(); newPred.put("prediction", key); newPred.put("probability", ((Integer) distribution.get(key) / (double) total)); newPred.put("count", distribution.get(key)); newPred.put("order", order); - + predictionsList.add(newPred); } - + } HashMap[] predictions = new HashMap[predictionsList.size()]; for (index = 0, len = predictions.length; index < len; index++) { @@ -813,8 +813,8 @@ public HashMap[] probabilityWeight() { } return predictions; }; - - + + /** * Builds a distribution based on the predictions of the MultiVote * @@ -856,8 +856,8 @@ public Object[] combineDistribution(String weightLabel) { combinedDistribution[1] = total; return combinedDistribution; } - - + + /** * Returns the prediction combining votes by using the given weight * @@ -870,14 +870,14 @@ public Object[] combineDistribution(String weightLabel) { * the confidences of the votes. 
*/ public HashMap combineCategorical(String weightLabel) { - + int index, len; double weight = 1.0; Object category; HashMap prediction = new HashMap(); HashMap mode = new HashMap(); ArrayList tuples = new ArrayList(); - + for (index = 0, len = this.predictions.length; index < len; index++) { prediction = this.predictions[index]; @@ -910,7 +910,7 @@ public HashMap combineCategorical(String weightLabel) { mode.put(category, categoryHash); } - + for (Object key : mode.keySet()) { if (mode.get(key) != null) { Object[] tuple = new Object[] { key, mode.get(key) }; @@ -919,13 +919,13 @@ public HashMap combineCategorical(String weightLabel) { } Collections.sort(tuples, new TupleComparator()); - + Object[] tuple = (Object[]) tuples.get(0); Object predictionName = (Object) tuple[0]; HashMap output = new HashMap(); output.put("prediction", predictionName); - + if (this.predictions[0].get("confidence") != null) { return this.weightedConfidence(predictionName, weightLabel); } @@ -1006,8 +1006,8 @@ public HashMap weightedConfidence( return result; } - - + + /** * Returns a distribution formed by grouping the distributions of each predicted node. 
*/ @@ -1018,7 +1018,7 @@ protected static Map getGroupedDistribution(MultiVote multiVoteI for (HashMap prediction : multiVoteInstance.getPredictions()) { HashMap predictionDist = null; Object distribution = prediction.get("distribution"); - + if( distribution instanceof Map ) { predictionDist = (HashMap) distribution; } else { @@ -1040,8 +1040,8 @@ protected static Map getGroupedDistribution(MultiVote multiVoteI return distributionInfo; } - - + + /** * Reduces a number of predictions voting for classification and averaging * predictions for regression using the PLURALITY method and without confidence @@ -1051,42 +1051,42 @@ protected static Map getGroupedDistribution(MultiVote multiVoteI public HashMap combine() { return combine((PredictionMethod) null, null); } - - + + /** * Reduces a number of predictions voting for classification and * averaging predictions for regression. */ public HashMap combine(PredictionMethod method, Map options) { - + if (method == null) { method = PredictionMethod.PLURALITY; } - + // there must be at least one prediction to be combined if (this.predictions.length == 0) { throw new Error("No predictions to be combined."); } - + // and all predictions should have the weight-related keys String[] keys = WEIGHT_KEYS[method.getCode()]; if (keys.length > 0) { checkKeys(this.predictions, keys); } - + if (this.boosting) { for (HashMap prediction : predictions) { if( !prediction.containsKey("boosting") ) { prediction.put("boosting", 0.0); } } - + if (this.isRegression()) { // sum all gradients weighted by their "weight" plus the // boosting offset HashMap prediction = new HashMap(); - prediction.put("prediction", - weightedSum(predictions, "weight") + + prediction.put("prediction", + weightedSum(predictions, "weight") + (Double) this.boostingOffsets.get(0)); return prediction; } else { @@ -1099,14 +1099,14 @@ public HashMap combine(PredictionMethod method, Map options) { prediction.put("confidence", 0.0); } } - + if (method == 
PredictionMethod.CONFIDENCE) { return this.errorWeighted(); } return this.avg(); } } - + MultiVote multiVote = null; if (method == PredictionMethod.THRESHOLD) { Integer threshold = (Integer) options.get("threshold"); @@ -1118,11 +1118,11 @@ public HashMap combine(PredictionMethod method, Map options) { } else { multiVote = this; } - + return multiVote.combineCategorical(COMBINATION_WEIGHTS[method.getCode()]); } - - + + /** * Comparator */ diff --git a/src/main/java/org/bigml/binding/localmodel/Predicate.java b/src/main/java/org/bigml/binding/localmodel/Predicate.java index 30132bd..163e8db 100755 --- a/src/main/java/org/bigml/binding/localmodel/Predicate.java +++ b/src/main/java/org/bigml/binding/localmodel/Predicate.java @@ -10,12 +10,19 @@ import org.json.simple.JSONArray; import org.json.simple.JSONObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + /** * A predicate to be evaluated in a tree's node. * */ public class Predicate { + static Logger LOGGER = LoggerFactory.getLogger( + Predicate.class.getName()); + private String opType; private String operator; private String field; @@ -316,6 +323,7 @@ public boolean apply(JSONObject inputData, JSONObject fields) { } if( term != null ) { + // LOGGER.info("====================="); JSONObject allForms = (JSONObject) Utils.getJSONObject((JSONObject) fields.get(field), "summary.term_forms", new JSONObject()); JSONArray termForms = (JSONArray) allForms.get(term); @@ -326,7 +334,7 @@ public boolean apply(JSONObject inputData, JSONObject fields) { terms.addAll(termForms); JSONObject options = (JSONObject) Utils.getJSONObject((JSONObject) fields.get(field), - "term_analysis"); + "term_analysis", new JSONObject()); return applyOperator(Utils.termMatches(inputData.get(field).toString(), terms, options)); } diff --git a/src/main/java/org/bigml/binding/localmodel/Tree.java b/src/main/java/org/bigml/binding/localmodel/Tree.java index f9eb2a8..f0eb7a6 100755 --- 
a/src/main/java/org/bigml/binding/localmodel/Tree.java +++ b/src/main/java/org/bigml/binding/localmodel/Tree.java @@ -1,6 +1,6 @@ /* * Tree structure for the BigML local Model - * + * * This module defines an auxiliary Tree structure that is used in the local Model * to make predictions locally or embedded into your application without needing * to send requests to BigML.io. @@ -28,7 +28,7 @@ /** * A tree-like predictive model. - * + * */ public class Tree extends AbstractTree { @@ -36,17 +36,17 @@ public class Tree extends AbstractTree { * Logging */ static Logger LOGGER = LoggerFactory.getLogger(Tree.class.getName()); - + final static String INDENT = " "; final static double DEFAULT_RZ = 1.96; final static int BINS_LIMIT = 32; - + private static final JSONObject languageConversions; static { InputStream input = Tree.class.getResourceAsStream("/org/bigml/binding/localmodel/languageConversions.json"); languageConversions = (JSONObject) JSONValue.parse(new InputStreamReader(input)); } - + // Map operator str to its corresponding java operator static HashMap JAVA_OPERATOR = new HashMap(); static { @@ -91,8 +91,8 @@ public class Tree extends AbstractTree { JAVA_OPERATOR.put(Constants.OPTYPE_DATETIME + "-" + Constants.OPERATOR_GT, "\"{2}\".compareTo({3})>0"); } - - + + private String parentId; private final List children; private JSONObject rootDistribution; @@ -109,18 +109,18 @@ public class Tree extends AbstractTree { private Integer max; private Integer min; private JSONObject treeInfo; - + /** * Constructor */ public Tree(final JSONObject tree, final JSONObject fields, final Object objectiveField, final JSONObject rootDistribution, - final String parentId, final Map idsMap, + final String parentId, final Map idsMap, final boolean subtree, JSONObject treeInfo) { - + super(tree, fields, objectiveField); this.rootDistribution = rootDistribution; - + if( tree.containsKey("id") ) { this.parentId = parentId; @@ -129,31 +129,31 @@ public Tree(final JSONObject tree, final 
JSONObject fields, idsMap.put(id, this); } } - + children = new ArrayList(); JSONArray childrenObj = (JSONArray) tree.get("children"); if (childrenObj != null) { for (int i = 0; i < childrenObj.size(); i++) { JSONObject child = (JSONObject) childrenObj.get(i); - Tree childTree = new Tree(child, fields, objectiveField, + Tree childTree = new Tree(child, fields, objectiveField, null, id, idsMap, subtree, treeInfo); children.add(childTree); } } - + this.regression = isRegression(); - boolean treeRegression = treeInfo.get("regression")!=null ? + boolean treeRegression = treeInfo.get("regression")!=null ? (Boolean) treeInfo.get("regression") : true; treeInfo.put("regression", this.regression && treeRegression); this.regression = (Boolean) treeInfo.get("regression"); - + this.confidence = tree.containsKey("confidence") ? ((Number) tree.get("confidence")).doubleValue(): null; this.distribution = null; this.distributionUnit = null; this.weighted = false; - + JSONArray distributionObj = (JSONArray) tree.get("distribution"); JSONObject summary = null; if (distributionObj != null) { @@ -161,7 +161,7 @@ public Tree(final JSONObject tree, final JSONObject fields, } else if( tree.get("objective_summary") != null ) { summary = (JSONObject) tree.get("objective_summary"); extractDistribution(summary); - + if (tree.get("weighted_objective_summary") != null) { summary = (JSONObject) tree.get( "weighted_objective_summary"); @@ -176,7 +176,7 @@ public Tree(final JSONObject tree, final JSONObject fields, .get("categories"); this.weightedDistributionUnit = "categories"; } - + this.weight = ((Number) tree.get("weight")).doubleValue(); this.weighted = true; } @@ -184,12 +184,12 @@ public Tree(final JSONObject tree, final JSONObject fields, summary = rootDistribution; extractDistribution(summary); } - + if( this.regression ) { - treeInfo.put("max_bins", - Math.max(((Number) treeInfo.get("max_bins")).intValue(), + treeInfo.put("max_bins", + Math.max(((Number) 
treeInfo.get("max_bins")).intValue(), distribution.size())); - + median = null; if( summary != null ) { median = ((Number) summary.get("median")).doubleValue(); @@ -198,7 +198,7 @@ public Tree(final JSONObject tree, final JSONObject fields, if( median == null ) { median = distributionMedian(distribution, count); } - + if (summary.containsKey("maximum")) { max = ((Number) summary.get("maximum")).intValue(); } else { @@ -208,7 +208,7 @@ public Tree(final JSONObject tree, final JSONObject fields, max = Math.max(max, ((Number) dist.get(0)).intValue()); } } - + if (summary.containsKey("minimum")) { min = ((Number) summary.get("minimum")).intValue(); } else { @@ -219,19 +219,19 @@ public Tree(final JSONObject tree, final JSONObject fields, } } } - + if( !this.regression && this.distribution != null ) { impurity = calculateGiniImpurity(); } - + this.treeInfo = treeInfo; } - - + + public String getParentId() { return parentId; } - + public Double getMedian() { return median; } @@ -251,38 +251,38 @@ public JSONArray getDistribution() { public String getDistributionUnit() { return distributionUnit; } - + public Integer getMaxBins() { return (Integer) treeInfo.get("max_bins"); } - + public List getChildren() { return children; } - + public Boolean getWeighted() { return weighted; } - + public Integer getMin() { return min; } - + public Integer getMax() { return max; } - - + + /** * Creates a copy of the current tree node * * @return the copy of the tree node */ protected Tree clone() { - return new Tree(tree, fields, objectiveField, rootDistribution, + return new Tree(tree, fields, objectiveField, rootDistribution, id, null, false, treeInfo); } - + private void extractDistribution(JSONObject summary) { if (summary.get("bins") != null) { this.distribution = (JSONArray) summary.get("bins"); @@ -296,7 +296,7 @@ private void extractDistribution(JSONObject summary) { this.distributionUnit = "categories"; } } - + /** * Checks if the node's value is a category * @@ -329,7 +329,7 @@ 
public boolean isRegression() { return true; } - + /** * Returns the median value for a distribution * @@ -339,7 +339,7 @@ protected Double distributionMedian(JSONArray distribution, Long count) { Double previousValue = null; for (Object binInfo : distribution) { Double value = ((Number) ((JSONArray) binInfo).get(0)).doubleValue(); - + counter += ((Number) ((JSONArray) binInfo).get(1)).intValue(); if( counter > (count / 2) ) { if( (count % 2 == 0) && ((counter - 1) == (count / 2)) && @@ -349,13 +349,13 @@ protected Double distributionMedian(JSONArray distribution, Long count) { return value; } - + previousValue = value; } return null; } - + /** * Returns the gini impurity score associated to the distribution in the node * @@ -373,7 +373,7 @@ protected Double calculateGiniImpurity() { return 1.0 - purity; } - + /** * Computes the variance error * @@ -424,15 +424,15 @@ protected double unbiasedSampleVariance(List distribution, Double dis return Double.NaN; } - - + + /** * Computes the mean of a distribution in the [[point, instances]] syntax */ protected double mean(List distribution) { double addition = 0.0f; double count = 0.0f; - + for (JSONArray bin : distribution) { double point = ((Number) bin.get(0)).doubleValue(); double instances = ((Number) bin.get(1)).doubleValue(); @@ -440,15 +440,15 @@ protected double mean(List distribution) { addition += point * instances; count += instances; } - + if( count > 1 ) { return addition / count; } - + return Double.NaN; } - - + + /** * Wilson score interval computation of the distribution for the prediction * @@ -520,8 +520,8 @@ public static double wsConfidence(Object prediction, wsSqrt = Math.sqrt((p * (1 - p) / n) + (z2 / (4 * n2))); return (p + (z2 / (2 * n)) - (z * wsSqrt)) / (1 + (z2 / n)); } - - + + /** * Returns a list that includes all the leaves of the tree. 
* @@ -561,7 +561,7 @@ protected List getLeaves(List path, TreeNodeFilter filter) { public List getLeaves(TreeNodeFilter filter) { return getLeaves(null, filter); } - + /** * Returns the information associated to each of the tree nodes in rows format */ @@ -625,11 +625,11 @@ public List getNodesInfo(List headers, boolean leavesOnly) { return rows; } - - + + /** * Translates a tree model into a set of IF-THEN rules. - * + * * @param depth * controls the size of indentation */ @@ -703,7 +703,7 @@ protected String generateRules( return rules; } - + /** * Filters the contents of a nodesList. If any of the nodes is in the * ids list, the rest of nodes are removed. If none is in the ids list @@ -731,8 +731,8 @@ protected List filterNodes(List nodesList, List ids, boolean return nodes; } - - + + /** * Prints out an IF-THEN rule version of the tree. */ @@ -769,11 +769,11 @@ public String rules(Predicate.RuleLanguage language, final List idsPath, } return generateRules(0, language, idsPath, subtree); } - - + + /** * Translate the model into a set of "if" java statements. - * + * */ public String getJavaBody(final List idsPath, final boolean subtree) { return getJavaBody(0, "", null, null, idsPath, subtree); @@ -863,24 +863,24 @@ protected String getJavaBody(final int depth, String body, List conditio return instructions; } - - - - - - - - - - - + + + + + + + + + + + /** * Makes a prediction based on a number of field values. - * + * * The input fields must be keyed by Id. - * + * * .predict({"petal length": 1}) - * + * */ public HashMap predict(final JSONObject inputData) { return predict(inputData, null, MissingStrategy.LAST_PREDICTION); @@ -889,11 +889,11 @@ public HashMap predict(final JSONObject inputData) { /** * Makes a prediction based on a number of field values. - * + * * The input fields must be keyed by Id. 
- * + * * .predict({"petal length": 1}) - * + * */ public Prediction predict(final JSONObject inputData, List path, MissingStrategy strategy) { @@ -917,7 +917,7 @@ public Prediction predict(final JSONObject inputData, List path, } } } - + Integer dMin = !this.regression ? null : this.min; Integer dMax = !this.regression ? null : this.max; @@ -929,7 +929,7 @@ public Prediction predict(final JSONObject inputData, List path, TreeHolder lastNode = new TreeHolder(); Map finalDistribution = predictProportional( inputData, lastNode, path, false, false); - + if( isRegression() ) { // singular case: // when the prediction is the one given in a 1-instance node @@ -946,11 +946,11 @@ public Prediction predict(final JSONObject inputData, List path, // when there's more instances, sort elements by their mean JSONArray distribution = Utils.convertDistributionMapToSortedArray(finalDistribution); - + String distributionUnit = (distribution.size() > BINS_LIMIT ? "bins" : "counts"); distribution = Utils.mergeBins(distribution, BINS_LIMIT); - + long totalInstances = calculateTotalInstances(distribution); double prediction = 0.0; @@ -959,11 +959,11 @@ public Prediction predict(final JSONObject inputData, List path, // where there's only one bin, there will be no error, but // we use a correction derived from the parent's error prediction = ((Number) ((JSONArray) distribution.get(0)).get(0)).doubleValue(); - + if (totalInstances < 2) { totalInstances = 1; } - + try { // some strange models can have nodes with no confidence confidence = lastNode.tree.getConfidence(); @@ -975,26 +975,31 @@ public Prediction predict(final JSONObject inputData, List path, confidence = regressionError(unbiasedSampleVariance(distribution, prediction), totalInstances, DEFAULT_RZ); } - + Integer dMin = !this.regression ? null : this.min; Integer dMax = !this.regression ? 
null : this.max; - + return new Prediction(prediction, confidence, totalInstances, distributionMedian(distribution, totalInstances), path, distribution, distributionUnit, - lastNode.getTree().getChildren(), + lastNode.getTree().getChildren(), dMin, dMax); } else { JSONArray distribution = Utils.convertDistributionMapToSortedArray(finalDistribution); long totalInstances = calculateTotalInstances(distribution); - return new Prediction(((JSONArray) distribution.get(0)).get(0), - wsConfidence(((JSONArray) distribution.get(0)).get(0), distribution, - totalInstances, DEFAULT_RZ), - totalInstances, null, - path, distribution, "categorical", - lastNode.getTree().getChildren(), null, null); + if (distribution.size() > 0) { + return new Prediction(((JSONArray) distribution.get(0)).get(0), + wsConfidence(((JSONArray) distribution.get(0)).get(0), distribution, + totalInstances, DEFAULT_RZ), + totalInstances, null, + path, distribution, "categorical", + lastNode.getTree().getChildren(), null, null); + } + else { + return new Prediction(); + } } } else { throw new UnsupportedOperationException( @@ -1018,20 +1023,20 @@ public Prediction predict(final JSONObject inputData, List path, protected Map predictProportional( final JSONObject inputData, final TreeHolder lastNode, List path, Boolean missingFound, Boolean median) { - + if( path == null ) { path = new ArrayList(); } Map finalDistribution = new HashMap(); - + // We are in a leaf node... the only thing we need to do is return distribution of the node as a Map object if( children.isEmpty() ) { lastNode.setTree(this); distribution = !this.weighted ? 
distribution : weightedDistribution; return Utils.mergeDistributions(new HashMap(), Utils.convertDistributionArrayToMap(distribution)); } - + String optype = (String) ((JSONObject) fields.get(split(children))).get("optype"); if( isOneBranch(children, inputData) || optype.equals("text") || optype.equals("items")) { for (Tree child : children) { @@ -1042,7 +1047,7 @@ protected Map predictProportional( } return child.predictProportional(inputData, lastNode, path, missingFound, median); } - } + } } else { // missing value found, the unique path stops missingFound = true; @@ -1050,7 +1055,7 @@ protected Map predictProportional( finalDistribution = Utils.mergeDistributions(finalDistribution, child.predictProportional(inputData, lastNode, path, missingFound, median)); } - + /* minimums = [] maximums = [] @@ -1073,20 +1078,20 @@ protected Map predictProportional( max(maximums) if maximums else None, self, population, self) */ - - + + return finalDistribution; } return null; } - - - - - - + + + + + + protected static class TreeHolder { private Tree tree; diff --git a/src/main/java/org/bigml/binding/utils/Utils.java b/src/main/java/org/bigml/binding/utils/Utils.java index 4742f20..c5c6815 100755 --- a/src/main/java/org/bigml/binding/utils/Utils.java +++ b/src/main/java/org/bigml/binding/utils/Utils.java @@ -47,8 +47,15 @@ import org.json.simple.JSONArray; import org.json.simple.JSONObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + public class Utils { + static Logger LOGGER = LoggerFactory.getLogger( + Utils.class.getName()); + // Headers static String JSON = "application/json; charset=utf-8"; @@ -346,7 +353,7 @@ private static T1 cast(T2 o, T1 d) { public static T getFromJSONOr(JSONObject json, String key, T def) { - T result = def; + T result = def; if (json.containsKey(key)) { Object obj = getJSONObject(json, key); if (obj != null) { @@ -368,6 +375,11 @@ public static JSONObject getFromJSONOr(JSONObject json, */ public static Object 
getJSONObject(JSONObject json, String path, Object defaultValue) { String field = path; + // LOGGER.info("*** path is " + path); + // if (json == null) { + // LOGGER.warn("*** the json IS NULL!!!"); + // } + if (path.indexOf(".") != -1) { field = path.substring(0, path.indexOf(".")); } @@ -399,12 +411,13 @@ public static Object getJSONObject(JSONObject json, String path, Object defaultV path = path.substring(path.indexOf(".") + 1, path.length()); if (path.length() > 0) { - return getJSONObject(json, path); + // LOGGER.info("*** calling getJSONObject with path " + path); + return getJSONObject(json, path, defaultValue); } - + return defaultValue; } - + /** * Inverts a dictionary changing keys per values * @@ -804,11 +817,13 @@ public static StringBuilder printDistribution(JSONArray distribution) { * @param newDistribution */ public static Map mergeDistributions(Map distribution, Map newDistribution) { - for (Object value : newDistribution.keySet()) { - if( !distribution.containsKey(value) ) { - distribution.put(value, 0); + if (newDistribution != null) { + for (Object value : newDistribution.keySet()) { + if( !distribution.containsKey(value) ) { + distribution.put(value, 0); + } + distribution.put(value, distribution.get(value).intValue() + newDistribution.get(value).intValue()); } - distribution.put(value, distribution.get(value).intValue() + newDistribution.get(value).intValue()); } return distribution; @@ -863,16 +878,18 @@ public static JSONArray convertDistributionMapToSortedArray(Map String opType = Constants.OPTYPE_NUMERIC; - for (Object key : distribution.keySet()) { - JSONArray element = new JSONArray(); - element.add(key); - element.add(distribution.get(key)); - newDistribution.add(element); - - if( key instanceof Number ) { - opType = Constants.OPTYPE_NUMERIC; - } else if( key instanceof String ) { - opType = Constants.OPTYPE_TEXT; + if (distribution != null) { + for (Object key : distribution.keySet()) { + JSONArray element = new JSONArray(); + 
element.add(key); + element.add(distribution.get(key)); + newDistribution.add(element); + + if( key instanceof Number ) { + opType = Constants.OPTYPE_NUMERIC; + } else if( key instanceof String ) { + opType = Constants.OPTYPE_TEXT; + } } } @@ -1041,23 +1058,23 @@ public static int fullTermMatch(String text, String fullTerm, boolean caseSensit */ public static int termMatchesTokens(String text, List formsList, boolean caseSensitive) { String expression = String.format("(\\b|_)%s(\\b|_)", Utils.join(formsList, "(\\b|_)|(\\b|_)")); - Pattern pattern = Pattern.compile(expression, (caseSensitive ? Pattern.UNICODE_CASE : + Pattern pattern = Pattern.compile(Pattern.quote(expression), (caseSensitive ? Pattern.UNICODE_CASE : (Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE))); Matcher matcher = pattern.matcher(text); return (matcher.find() ? matcher.groupCount() : 0); } - - + + /** - * Checks the operating point contents and extracts and array with + * Checks the operating point contents and extracts and array with * the three defined variables */ public static Object[] parseOperatingPoint(JSONObject operatingPoint, String[] operatingKinds, List classNames) { - + String kind, positiveClass; Double threshold; - + if (!operatingPoint.containsKey("kind")) { throw new IllegalArgumentException( "Failed to find the kind of operating point."); @@ -1070,7 +1087,7 @@ public static Object[] parseOperatingPoint(JSONObject operatingPoint, StringUtils.join(operatingKinds,","))); } } - + if (!operatingPoint.containsKey("threshold")) { throw new IllegalArgumentException( "Failed to find the threshold of the operating point."); @@ -1080,7 +1097,7 @@ public static Object[] parseOperatingPoint(JSONObject operatingPoint, throw new IllegalArgumentException( "The threshold value should be in the 0 to 1 range."); } - + if (!operatingPoint.containsKey("positive_class")) { throw new IllegalArgumentException( "The operating point needs to have a positive_class" + @@ -1094,20 +1111,20 @@ public 
static Object[] parseOperatingPoint(JSONObject operatingPoint, StringUtils.join(classNames,","))); } } - + return new Object[] {kind, threshold, positiveClass}; } - + /** * Checks whether some numeric fields are missing in the input data */ public static void checkNoMissingNumerics( JSONObject inputData, JSONObject fields, String weightField) { - + for (Object fieldId : fields.keySet()) { JSONObject field = (JSONObject) fields.get(fieldId); String optype = (String) Utils.getJSONObject(field, "optype"); - if ("numeric".equals(optype) && + if ("numeric".equals(optype) && !inputData.containsKey((String) fieldId) && (weightField == null || !weightField.equals((String) fieldId))) { throw new IllegalArgumentException( @@ -1116,8 +1133,8 @@ public static void checkNoMissingNumerics( } } } - - + + /** * Checks whether some input fields are missing in the input data * while not training data has no missings in that field @@ -1181,22 +1198,22 @@ public static JSONArray inverseMatrix(JSONArray matrix) { /** * Sorts list of predictions - * + * */ - public static void sortPredictions(JSONArray predictions, + public static void sortPredictions(JSONArray predictions, final String primaryKey, final String secondaryKey) { - + Collections.sort(predictions, new Comparator() { @Override public int compare(JSONObject o1, JSONObject o2) { Double o1p = (Double) o1.get(primaryKey); Double o2p = (Double) o2.get(primaryKey); - + if (o1p.doubleValue() == o2p.doubleValue()) { return ((String) o1.get(secondaryKey)). 
compareTo(((String) o2.get(secondaryKey))); } - + return o2p.compareTo(o1p); } }); diff --git a/src/test/java/org/bigml/binding/AnomaliesStepdefs.java b/src/test/java/org/bigml/binding/AnomaliesStepdefs.java index 46a487d..8c21730 100755 --- a/src/test/java/org/bigml/binding/AnomaliesStepdefs.java +++ b/src/test/java/org/bigml/binding/AnomaliesStepdefs.java @@ -3,7 +3,6 @@ import cucumber.annotation.en.Given; import cucumber.annotation.en.Then; import cucumber.annotation.en.When; -import org.bigml.binding.resources.AbstractResource; import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.json.simple.JSONValue; @@ -29,12 +28,12 @@ public class AnomaliesStepdefs { @Autowired private ContextRepository context; + @Given("^I create an anomaly detector from a dataset list$") public void I_create_an_anomaly_from_a_dataset_list() throws AuthenticationException { JSONObject args = new JSONObject(); args.put("tags", Arrays.asList("unitTest")); -// args.put("missing_splits", false); - + assertTrue("No datasets found!", context.datasets != null && context.datasets.size() > 0); List datasetsIds = new ArrayList(); @@ -50,7 +49,6 @@ public void I_create_an_anomaly_from_a_dataset_list() throws AuthenticationExcep commonSteps.the_resource_has_been_created_with_status(context.status); } - @Then("^I create an anomaly detector of (\\d+) anomalies from a dataset$") public void i_create_an_anomaly_with_top_n_from_dataset(int topN) throws AuthenticationException { @@ -73,17 +71,7 @@ public void i_create_an_anomaly_with_top_n_from_dataset(int topN) public void I_create_a_local_anomaly_detector() throws Exception { localAnomaly = new LocalAnomaly(context.anomaly); } - - @Given("^I wait until the anomaly detector is ready less than (\\d+) secs and I return it$") - public JSONObject I_wait_until_the_anomaly_is_ready_less_than_secs_and_return( - int secs) throws Throwable { - commonSteps.I_wait_until_resource_status_code_is( - "anomaly detector", - 
AbstractResource.FINISHED, - AbstractResource.FAULTY, secs); - return context.anomaly; - } - + @Given("^I check the anomaly detector stems from the original dataset list$") public void i_check_anomaly_dataset_and_datasets_list () throws AuthenticationException { String[] datasetIds = (String[]) context.datasets.toArray(new String[context.datasets.size()]); @@ -104,44 +92,7 @@ public void i_check_anomaly_dataset_and_datasets_ids () throws AuthenticationExc assertEquals(datasetId, anomalyDatasetId); } - - @Given("^I create an anomaly detector with \"(.*)\"$") - public void I_create_an_anomaly_with_params(String args) throws Throwable { - String datasetId = (String) context.dataset.get("resource"); - JSONObject argsJSON = (JSONObject) JSONValue.parse(args); - - if( argsJSON != null ) { - if (argsJSON.containsKey("tags")) { - ((JSONArray) argsJSON.get("tags")).add("unitTest"); - } else { - argsJSON.put("tags", Arrays.asList("unitTest")); - } - -// if( !argsJSON.containsKey("missing_splits") ) { -// argsJSON.put("missing_splits", false); -// } - } else { - argsJSON = new JSONObject(); - argsJSON.put("tags", Arrays.asList("unitTest")); -// argsJSON.put("missing_splits", false); - } - - JSONObject resource = context.api.createAnomaly(datasetId, - argsJSON, 5, null); - context.status = (Integer) resource.get("code"); - context.location = (String) resource.get("location"); - context.anomaly = (JSONObject) resource.get("object"); - commonSteps.the_resource_has_been_created_with_status(context.status); - } - - @When("^I download the created batch anomaly score file to \"([^\"]*)\"$") - public void I_download_the_created_batch_anomaly_score_file_to(String fileTo) - throws Throwable { - context.api.downloadBatchAnomalyScore( - context.batchAnomalyScore, fileTo); - - } - + @When("^I create an anomaly score for \"(.*)\"$") public void I_create_an_anomaly_score(String data) throws Throwable { diff --git a/src/test/java/org/bigml/binding/BatchPredictionsStepdefs.java 
b/src/test/java/org/bigml/binding/BatchPredictionsStepdefs.java index 0b0a17a..502f8ef 100755 --- a/src/test/java/org/bigml/binding/BatchPredictionsStepdefs.java +++ b/src/test/java/org/bigml/binding/BatchPredictionsStepdefs.java @@ -30,47 +30,18 @@ public class BatchPredictionsStepdefs { @Autowired private ContextRepository context; - - @When("^I create a batch prediction for the dataset with the model$") - public void I_create_a_batch_prediction_for_the_dataset_with_the_model() - throws Throwable { - String modelId = (String) context.model.get("resource"); - I_create_a_batch_prediction_for_the_dataset_with(modelId); - } - - @When("^I create a batch prediction for the dataset with the ensemble$") - public void I_create_a_batch_prediction_for_the_dataset_with_the_ensemble() - throws Throwable { - String ensembleId = (String) context.ensemble.get("resource"); - I_create_a_batch_prediction_for_the_dataset_with(ensembleId); - } - - @When("^I create a batch prediction for the dataset with the logistic regression$") - public void I_create_a_batch_prediction_for_the_dataset_with_the_logistic_regression() - throws Throwable { - String logisticRegresionId = (String) context.logisticRegression.get("resource"); - I_create_a_batch_prediction_for_the_dataset_with(logisticRegresionId); - } - @When("^I create a batch prediction for the dataset with the linear regression$") - public void I_create_a_batch_prediction_for_the_dataset_with_the_linear_regression() - throws Throwable { - String linearRegresionId = (String) context.linearRegression.get("resource"); - I_create_a_batch_prediction_for_the_dataset_with(linearRegresionId); - } + private String downloadedFile; - @When("^I create a batch prediction for the dataset with the fusion$") - public void I_create_a_batch_prediction_for_the_dataset_with_the_fusion() - throws Throwable { - String fusionId = (String) context.fusion.get("resource"); - I_create_a_batch_prediction_for_the_dataset_with(fusionId); - } - - - public void 
I_create_a_batch_prediction_for_the_dataset_with(String resourceId) - throws Throwable { - String datasetId = (String) context.dataset.get("resource"); + @When("^I create a batch prediction for the dataset with the (model|ensemble|logisticregression|linearregression|fusion)$") + public void I_create_a_batch_prediction_for_the_with_the_model(String resourceName) + throws Throwable { + + String datasetId = (String) context.dataset.get("resource"); + String resourceId = (String) + commonSteps.getResource(resourceName).get("resource"); + JSONObject args = new JSONObject(); args.put("tags", Arrays.asList("unitTest")); @@ -83,14 +54,6 @@ public void I_create_a_batch_prediction_for_the_dataset_with(String resourceId) } - - @When("^I download the created predictions file to \"([^\"]*)\"$") - public void I_download_the_created_predictions_file_to(String fileTo) - throws Throwable { - context.api.downloadBatchPrediction( - context.batchPrediction, fileTo); - } - @Then("^the batch prediction file \"([^\"]*)\" is like \"([^\"]*)\"$") public void the_batch_prediction_file_is_like(String downloadedFile, String checkFile) throws Throwable { @@ -105,9 +68,38 @@ public void the_batch_prediction_file_is_like(String downloadedFile, if (!localCvs.equals(checkCvs)) { throw new Exception(); } + } + + @Then("^I create a batch prediction for \"(.*)\" and save it in \"(.*)\"$") + public void I_create_a_batch_prediction_and_save_it_in(String dataInput, + String path) throws Throwable { + + if( !new File(path).exists() ) { + new File(path).mkdirs(); + } + JSONArray inputDataList = dataInput != null ? 
+ (JSONArray) JSONValue.parse(dataInput) : null; + + context.multiModel.batchPredict(inputDataList, path); } + + @Then("^I create a source from the batch prediction$") + public void I_create_a_source_from_the_batch_prediction() throws Throwable { + + String batchPredictionId = (String) context.batchPrediction.get("resource"); + assertNotNull("A batch prediction id is needed.", batchPredictionId); + + JSONObject source = context.api.createSourceFromBatchPrediction( + batchPredictionId, new JSONObject()); + + Integer code = (Integer) source.get("code"); + assertEquals(AbstractResource.HTTP_CREATED, code.intValue()); + context.location = (String) source.get("location"); + context.source = (JSONObject) source.get("object"); + } + @When("^I create a batch anomaly score$") public void I_create_a_batch_prediction_with_anomaly() throws Throwable { @@ -128,35 +120,78 @@ public void I_create_a_batch_prediction_with_anomaly() throws Throwable { context.batchAnomalyScore = (JSONObject) resource.get("object"); commonSteps.the_resource_has_been_created_with_status(context.status); } + + + @When("^I create a batch centroid for the dataset$") + public void I_create_a_batch_centroid_for_the_dataset() throws Throwable { + String clusterId = (String) context.cluster.get("resource"); + String datasetId = (String) context.dataset.get("resource"); + JSONObject args = new JSONObject(); + args.put("tags", Arrays.asList("unitTest")); - @Then("^I create a batch prediction for \"(.*)\" and save it in \"(.*)\"$") - public void I_create_a_batch_prediction_and_save_it_in(String dataInput, - String path) throws Throwable { - - if( !new File(path).exists() ) { - new File(path).mkdirs(); - } + JSONObject resource = context.api.createBatchCentroid( + clusterId, datasetId, args, 5, null); + context.status = (Integer) resource.get("code"); + context.location = (String) resource.get("location"); + context.batchCentroid = (JSONObject) resource.get("object"); + 
commonSteps.the_resource_has_been_created_with_status(context.status); + } + + @When("^I create a batch projection for the dataset with the pca$") + public void I_create_a_batch_projection_for_the_dataset_with_the_pca() + throws Throwable { + String pcaId = (String) context.pca.get("resource"); + String datasetId = (String) context.dataset.get("resource"); - JSONArray inputDataList = dataInput != null ? (JSONArray) JSONValue.parse(dataInput) - : null; + JSONObject args = new JSONObject(); + args.put("tags", Arrays.asList("unitTest")); - context.multiModel.batchPredict(inputDataList, path); + JSONObject resource = context.api.createBatchProjection( + pcaId, datasetId, args, 5, 3); + context.status = (Integer) resource.get("code"); + context.location = (String) resource.get("location"); + context.batchProjection = (JSONObject) resource.get("object"); + commonSteps.the_resource_has_been_created_with_status(context.status); + } + + + @When("^I download the created (predictions|batch anomaly score|centroid|projections) file to \"([^\"]*)\"$") + public void I_download_the_created_projections_file_to( + String resource, String fileTo) throws Throwable { + downloadedFile = fileTo; + + if (resource.equals("predictions")) { + context.api.downloadBatchPrediction( + context.batchPrediction, fileTo); + } + if (resource.equals("batch anomaly score")) { + context.api.downloadBatchAnomalyScore( + context.batchAnomalyScore, fileTo); + } + if (resource.equals("centroid")) { + context.api.downloadBatchCentroid(context.batchCentroid, + fileTo); + } + if (resource.equals("projections")) { + context.api.downloadBatchProjection( + context.batchProjection, fileTo); + } + } - @Then("^I create a source from the batch prediction$") - public void I_create_a_source_from_the_batch_prediction() throws Throwable { - - String batchPredictionId = (String) context.batchPrediction.get("resource"); - assertNotNull("A batch prediction id is needed.", batchPredictionId); + @Then("^the batch 
(centroid|projection) file is like \"([^\"]*)\"$") + public void the_batch_projection_file_is_like( + String resource, String checkFile) throws Throwable { + FileInputStream downloadFis = new FileInputStream(new File( + downloadedFile)); + FileInputStream checkFis = new FileInputStream(new File(checkFile)); - JSONObject source = context.api.createSourceFromBatchPrediction( - batchPredictionId, new JSONObject()); + String localCvs = Utils.inputStreamAsString(downloadFis, "UTF-8"); + String checkCvs = Utils.inputStreamAsString(checkFis, "UTF-8"); - Integer code = (Integer) source.get("code"); - assertEquals(AbstractResource.HTTP_CREATED, code.intValue()); - context.location = (String) source.get("location"); - context.source = (JSONObject) source.get("object"); + if (!localCvs.equals(checkCvs)) { + throw new Exception(); + } } - } \ No newline at end of file diff --git a/src/test/java/org/bigml/binding/ClustersStepdefs.java b/src/test/java/org/bigml/binding/ClustersStepdefs.java index 1a7c471..c64a49a 100755 --- a/src/test/java/org/bigml/binding/ClustersStepdefs.java +++ b/src/test/java/org/bigml/binding/ClustersStepdefs.java @@ -3,8 +3,6 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; -import java.io.File; -import java.io.FileInputStream; import java.math.BigDecimal; import java.math.RoundingMode; import java.util.Arrays; @@ -13,7 +11,6 @@ import java.util.GregorianCalendar; import org.bigml.binding.resources.AbstractResource; -import org.bigml.binding.utils.Utils; import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.json.simple.JSONValue; @@ -36,8 +33,6 @@ public class ClustersStepdefs { @Autowired private ContextRepository context; - private String downloadedFile; - @Given("^I create a cluster with options \"(.*)\"$") public void I_create_a_cluster_with_options(String options) throws AuthenticationException { String datasetId = (String) context.dataset.get("resource"); @@ -80,16 +75,6 @@ public 
void I_create_a_local_cluster() throws Exception { context.localCluster = new LocalCluster(context.cluster); } - @Given("^I wait until the cluster is ready less than (\\d+) secs and I return it$") - public JSONObject I_wait_until_the_cluster_is_ready_less_than_secs_and_return( - int secs) throws Throwable { - commonSteps.I_wait_until_resource_status_code_is( - "cluster", - AbstractResource.FINISHED, - AbstractResource.FAULTY, secs); - return context.cluster; - } - @Given("^I get the cluster \"(.*)\"") public void I_get_the_cluster(String clusterId) throws AuthenticationException { @@ -181,45 +166,6 @@ public void the_centroid_is(String result) throws AuthenticationException { assertEquals(result, context.centroid.get("centroid_name")); } - @When("^I create a batch centroid for the dataset$") - public void I_create_a_batch_centroid_for_the_dataset() throws Throwable { - String clusterId = (String) context.cluster.get("resource"); - String datasetId = (String) context.dataset.get("resource"); - - JSONObject args = new JSONObject(); - args.put("tags", Arrays.asList("unitTest")); - - JSONObject resource = context.api.createBatchCentroid( - clusterId, datasetId, args, 5, null); - context.status = (Integer) resource.get("code"); - context.location = (String) resource.get("location"); - context.batchCentroid = (JSONObject) resource.get("object"); - commonSteps.the_resource_has_been_created_with_status(context.status); - } - - @When("^I download the created centroid file to \"([^\"]*)\"$") - public void I_download_the_created_centroid_file_to(String fileTo) - throws Throwable { - downloadedFile = fileTo; - - context.api.downloadBatchCentroid(context.batchCentroid, - fileTo); - } - - @Then("^the batch centroid file is like \"([^\"]*)\"$") - public void the_batch_centroid_file_is_like(String checkFile) - throws Throwable { - FileInputStream downloadFis = new FileInputStream(new File( - downloadedFile)); - FileInputStream checkFis = new FileInputStream(new File(checkFile)); - 
- String localCvs = Utils.inputStreamAsString(downloadFis, "UTF-8"); - String checkCvs = Utils.inputStreamAsString(checkFis, "UTF-8"); - - if (!localCvs.equals(checkCvs)) { - throw new Exception(); - } - } @Then("^the data point in the cluster closest to \"(.*)\" is \"(.*)\"$") public void closest_in_cluster(String reference, String closest) throws Throwable { diff --git a/src/test/java/org/bigml/binding/CommonStepdefs.java b/src/test/java/org/bigml/binding/CommonStepdefs.java index d006a26..131f5d6 100755 --- a/src/test/java/org/bigml/binding/CommonStepdefs.java +++ b/src/test/java/org/bigml/binding/CommonStepdefs.java @@ -27,7 +27,7 @@ public class CommonStepdefs { - private static final Map RES_NAMES = new HashMap(); + public static final Map RES_NAMES = new HashMap(); static { RES_NAMES.put("anomaly detector", "anomaly"); RES_NAMES.put("anomaly score", "anomalyScore"); @@ -116,12 +116,12 @@ private Method getClientMethod(String operation, String resourceName) { return method; } - private JSONObject getResource(String resourceName) + protected JSONObject getResource(String resourceName) throws IllegalAccessException { return (JSONObject) getField(resourceName).get(context); } - private void setResource(String resourceName, JSONObject resource) + protected void setResource(String resourceName, JSONObject resource) throws IllegalAccessException { getField(resourceName).set(context, resource); } diff --git a/src/test/java/org/bigml/binding/DatasetsStepdefs.java b/src/test/java/org/bigml/binding/DatasetsStepdefs.java index f551d08..ec508ad 100755 --- a/src/test/java/org/bigml/binding/DatasetsStepdefs.java +++ b/src/test/java/org/bigml/binding/DatasetsStepdefs.java @@ -157,7 +157,6 @@ public void I_create_a_dataset_extracting_a_sample(double rate) String datasetId = (String) context.dataset.get("resource"); - JSONObject args = new JSONObject(); args.put("tags", Arrays.asList("unitTest")); args.put("sample_rate", rate); @@ -262,7 +261,6 @@ public void 
the_dataset_file_is_like(String downloadedFile, } - @Then("^I create a dataset associated to centroid \"(.*)\"$") public void I_create_a_dataset_associated_to_centroid(String centroidId) throws Throwable { diff --git a/src/test/java/org/bigml/binding/EvaluationsStepdefs.java b/src/test/java/org/bigml/binding/EvaluationsStepdefs.java index 9768970..f7ee678 100755 --- a/src/test/java/org/bigml/binding/EvaluationsStepdefs.java +++ b/src/test/java/org/bigml/binding/EvaluationsStepdefs.java @@ -1,20 +1,21 @@ package org.bigml.binding; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; import java.util.Arrays; -import org.bigml.binding.utils.Utils; import org.json.simple.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; -import cucumber.annotation.en.Given; import cucumber.annotation.en.Then; import cucumber.annotation.en.When; +import org.bigml.binding.CommonStepdefs; +import org.bigml.binding.utils.Utils; + + public class EvaluationsStepdefs { // Logging @@ -25,78 +26,20 @@ public class EvaluationsStepdefs { @Autowired private ContextRepository context; - - @Given("^I create a evaluation$") - public void I_create_a_evaluation() throws AuthenticationException { - String modelId = (String) context.model.get("resource"); - String datasetId = (String) context.dataset.get("resource"); - - JSONObject args = new JSONObject(); - args.put("tags", Arrays.asList("unitTest")); - - JSONObject resource = context.api.createEvaluation( - modelId, datasetId, args, 5, 3); - context.status = (Integer) resource.get("code"); - context.location = (String) resource.get("location"); - context.evaluation = (JSONObject) resource.get("object"); - commonSteps.the_resource_has_been_created_with_status(context.status); - } - - @When("^I create an evaluation for the model with the dataset$") - public void I_create_an_evaluation_for_the_model_with_the_dataset() - throws Throwable { 
- I_create_a_evaluation(); - } - - @When("^I create an evaluation for the ensemble with the dataset$") - public void I_create_an_evaluation_for_the_ensemble_with_the_dataset() - throws Throwable { - String ensembleId = (String) context.ensemble.get("resource"); - String datasetId = (String) context.dataset.get("resource"); - - JSONObject args = new JSONObject(); - args.put("tags", Arrays.asList("unitTest")); - - JSONObject resource = context.api.createEvaluation( - ensembleId, datasetId, args, 5, 3); - context.status = (Integer) resource.get("code"); - context.location = (String) resource.get("location"); - context.evaluation = (JSONObject) resource.get("object"); - commonSteps.the_resource_has_been_created_with_status(context.status); - } - - - @When("^I create an evaluation for the logistic regression with the dataset$") - public void I_create_an_evaluation_for_the_logistic_regression_with_the_dataset() - throws Throwable { - - String logisticRegressionId = (String) context.logisticRegression.get("resource"); - String datasetId = (String) context.dataset.get("resource"); - - JSONObject args = new JSONObject(); - args.put("tags", Arrays.asList("unitTest")); - - JSONObject resource = context.api.createEvaluation( - logisticRegressionId, datasetId, args, 5, 3); - context.status = (Integer) resource.get("code"); - context.location = (String) resource.get("location"); - context.evaluation = (JSONObject) resource.get("object"); - commonSteps.the_resource_has_been_created_with_status(context.status); - } - - @When("^I create an evaluation for the linear regression with the dataset$") - public void I_create_an_evaluation_for_the_linear_regression_with_the_dataset() + @When("^I create an evaluation for the (model|ensemble|logisticregression|linearregression|fusion) with the dataset$") + public void I_create_an_evaluation_with_the_dataset(String resourceName) throws Throwable { - - String linearRegressionId = (String) context.linearRegression.get("resource"); - String 
datasetId = (String) context.dataset.get("resource"); - - JSONObject args = new JSONObject(); + + String datasetId = (String) context.dataset.get("resource"); + String modelId = (String) + commonSteps.getResource(resourceName).get("resource"); + + JSONObject args = new JSONObject(); args.put("tags", Arrays.asList("unitTest")); JSONObject resource = context.api.createEvaluation( - linearRegressionId, datasetId, args, 5, 3); + modelId, datasetId, args, 5, 3); context.status = (Integer) resource.get("code"); context.location = (String) resource.get("location"); context.evaluation = (JSONObject) resource.get("object"); @@ -104,37 +47,12 @@ public void I_create_an_evaluation_for_the_linear_regression_with_the_dataset() } - @When("^I create an evaluation for the fusion with the dataset$") - public void I_create_an_evaluation_for_the_fusion_with_the_dataset() - throws Throwable { - - String fusionId = (String) context.fusion.get("resource"); - String datasetId = (String) context.dataset.get("resource"); - - JSONObject args = new JSONObject(); - args.put("tags", Arrays.asList("unitTest")); - - JSONObject resource = context.api.createEvaluation( - fusionId, datasetId, args, 5, 3); - context.status = (Integer) resource.get("code"); - context.location = (String) resource.get("location"); - context.evaluation = (JSONObject) resource.get("object"); - commonSteps.the_resource_has_been_created_with_status(context.status); - } - - @Then("^the measured \"([^\"]*)\" is (\\d+)$") - public void the_measured_is(String measure, float value) throws Throwable { - Long measureLong = (Long) Utils.getJSONObject(context.evaluation, - "result.model." + measure); - assertTrue(measureLong.floatValue() == value); - } - @Then("^the measured \"([^\"]*)\" is equals to ([\\d,.]+)$") public void the_measured_is_equals_to(String measure, double value) throws Throwable { - double measureLong = (Double) Utils.getJSONObject(context.evaluation, - "result.model." 
+ measure); + double measureLong = ((Number) Utils.getJSONObject(context.evaluation, + "result.model." + measure)).doubleValue(); assertEquals(measureLong, value, 0.00001); } diff --git a/src/test/java/org/bigml/binding/ExecutionsStepdefs.java b/src/test/java/org/bigml/binding/ExecutionsStepdefs.java index 52b7aa2..8b5dfcd 100755 --- a/src/test/java/org/bigml/binding/ExecutionsStepdefs.java +++ b/src/test/java/org/bigml/binding/ExecutionsStepdefs.java @@ -16,58 +16,66 @@ public class ExecutionsStepdefs { - // Logging - Logger logger = LoggerFactory.getLogger(ExecutionsStepdefs.class); - - @Autowired - CommonStepdefs commonSteps; - - @Autowired - private ContextRepository context; - - @Given("^I create a whizzml script execution from an existing script$") - public void I_create_a_whizzml_script_execution_from_an_existing_script() - throws AuthenticationException { - JSONObject args = new JSONObject(); - args.put("tags", Arrays.asList("unitTest")); - - String scriptId = (String) context.script.get("resource"); - JSONObject resource = context.api.createExecution(scriptId, args, 5, null); - context.status = (Integer) resource.get("code"); - context.location = (String) resource.get("location"); - context.execution = (JSONObject) resource.get("object"); - commonSteps.the_resource_has_been_created_with_status(context.status); - } - - @Given("^I create a whizzml script execution from the last two scripts$") - public void I_create_a_whizzml_script_execution_from_the_last_two_scripts() throws Throwable { - JSONObject args = new JSONObject(); - args.put("tags", Arrays.asList("unitTest")); - JSONObject resource = context.api.createExecution(context.scriptsIds, args, 5, null); - - context.status = (Integer) resource.get("code"); - context.location = (String) resource.get("location"); - context.execution = (JSONObject) resource.get("object"); - - commonSteps.the_resource_has_been_created_with_status(context.status); - } - - @Given("^I reset scripts$") - public void 
I_reset_scripts() throws AuthenticationException { - context.scriptsIds = new ArrayList(); - } - - @Then("^the script id is correct and the result is \"([^\"]*)\"$") - public void the_script_id_is_correct_and_the_result_is(Long expectedResult) throws Throwable { - assertEquals(context.script.get("resource"), context.execution.get("script")); - - Long result = (Long) Utils.getJSONObject(context.execution, "execution.result"); - assertEquals(expectedResult, result); - } - - @Then("^the result is \"([^\"]*)\"$") - public void the_value_of_is_and_the_result_is(String expectedResult) throws Throwable { - JSONArray result = (JSONArray) Utils.getJSONObject(context.execution, "execution.results"); - assertEquals(expectedResult, result.toString()); - } + // Logging + Logger logger = LoggerFactory.getLogger(ExecutionsStepdefs.class); + + @Autowired + CommonStepdefs commonSteps; + + @Autowired + private ContextRepository context; + + @Given("^I create a whizzml script execution from an existing script$") + public void I_create_a_whizzml_script_execution_from_an_existing_script() + throws AuthenticationException { + JSONObject args = new JSONObject(); + args.put("tags", Arrays.asList("unitTest")); + + String scriptId = (String) context.script.get("resource"); + JSONObject resource = context.api.createExecution(scriptId, args, 5, + null); + context.status = (Integer) resource.get("code"); + context.location = (String) resource.get("location"); + context.execution = (JSONObject) resource.get("object"); + commonSteps.the_resource_has_been_created_with_status(context.status); + } + + @Given("^I create a whizzml script execution from the last two scripts$") + public void I_create_a_whizzml_script_execution_from_the_last_two_scripts() + throws Throwable { + JSONObject args = new JSONObject(); + args.put("tags", Arrays.asList("unitTest")); + JSONObject resource = context.api.createExecution(context.scriptsIds, + args, 5, null); + + context.status = (Integer) resource.get("code"); + 
context.location = (String) resource.get("location"); + context.execution = (JSONObject) resource.get("object"); + + commonSteps.the_resource_has_been_created_with_status(context.status); + } + + @Given("^I reset scripts$") + public void I_reset_scripts() throws AuthenticationException { + context.scriptsIds = new ArrayList(); + } + + @Then("^the script id is correct and the result is \"([^\"]*)\"$") + public void the_script_id_is_correct_and_the_result_is(Long expectedResult) + throws Throwable { + assertEquals(context.script.get("resource"), + context.execution.get("script")); + + Long result = (Long) Utils.getJSONObject(context.execution, + "execution.result"); + assertEquals(expectedResult, result); + } + + @Then("^the result is \"([^\"]*)\"$") + public void the_value_of_is_and_the_result_is(String expectedResult) + throws Throwable { + JSONArray result = (JSONArray) Utils.getJSONObject(context.execution, + "execution.results"); + assertEquals(expectedResult, result.toString()); + } } \ No newline at end of file diff --git a/src/test/java/org/bigml/binding/ModelsStepdefs.java b/src/test/java/org/bigml/binding/ModelsStepdefs.java index a56045e..f1872b5 100755 --- a/src/test/java/org/bigml/binding/ModelsStepdefs.java +++ b/src/test/java/org/bigml/binding/ModelsStepdefs.java @@ -128,17 +128,7 @@ public void I_create_a_model() throws AuthenticationException { context.models.add(context.model); commonSteps.the_resource_has_been_created_with_status(context.status); } - - @Given("^I wait until the model is ready less than (\\d+) secs and I return it$") - public JSONObject I_wait_until_the_model_is_ready_less_than_secs_and_return( - int secs) throws Throwable { - commonSteps.I_wait_until_resource_status_code_is( - "model", - AbstractResource.FINISHED, - AbstractResource.FAULTY, secs); - return context.model; - } - + @Given("^I retrieve a list of remote models tagged with \"(.*)\"$") public void I_retrieve_a_list_of_remote_models_tagged_with(String tag) throws Throwable 
{ diff --git a/src/test/java/org/bigml/binding/PcaStepdefs.java b/src/test/java/org/bigml/binding/PcaStepdefs.java index 2d22763..d780a27 100644 --- a/src/test/java/org/bigml/binding/PcaStepdefs.java +++ b/src/test/java/org/bigml/binding/PcaStepdefs.java @@ -3,8 +3,6 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; -import java.io.File; -import java.io.FileInputStream; import java.util.Arrays; import org.bigml.binding.utils.Utils; @@ -30,8 +28,6 @@ public class PcaStepdefs { @Autowired private ContextRepository context; - private String downloadedFile; - LocalPca localPca; @Given("^I create a pca with \"(.*)\"$") @@ -83,48 +79,6 @@ public void the_projection_is(String projection) throws Throwable { assertEquals(expected, actual); } - @When("^I create a batch projection for the dataset with the pca$") - public void I_create_a_batch_projection_for_the_dataset_with_the_pca() - throws Throwable { - String pcaId = (String) context.pca.get("resource"); - String datasetId = (String) context.dataset.get("resource"); - - JSONObject args = new JSONObject(); - args.put("tags", Arrays.asList("unitTest")); - - JSONObject resource = context.api.createBatchProjection( - pcaId, datasetId, args, 5, 3); - context.status = (Integer) resource.get("code"); - context.location = (String) resource.get("location"); - context.batchProjection = (JSONObject) resource.get("object"); - commonSteps.the_resource_has_been_created_with_status(context.status); - } - - - @When("^I download the created projections file to \"([^\"]*)\"$") - public void I_download_the_created_projections_file_to(String fileTo) - throws Throwable { - downloadedFile = fileTo; - - context.api.downloadBatchProjection( - context.batchProjection, fileTo); - } - - @Then("^the batch projection file is like \"([^\"]*)\"$") - public void the_batch_projection_file_is_like(String checkFile) - throws Throwable { - FileInputStream downloadFis = new FileInputStream(new File( - 
downloadedFile)); - FileInputStream checkFis = new FileInputStream(new File(checkFile)); - - String localCvs = Utils.inputStreamAsString(downloadFis, "UTF-8"); - String checkCvs = Utils.inputStreamAsString(checkFis, "UTF-8"); - - if (!localCvs.equals(checkCvs)) { - throw new Exception(); - } - } - @Given("^I create a local pca$") public void I_create_a_local_pca() throws Exception { diff --git a/src/test/java/org/bigml/binding/RunCukesTest.java b/src/test/java/org/bigml/binding/RunCukesTest.java index 0d03ff0..f52c2b8 100755 --- a/src/test/java/org/bigml/binding/RunCukesTest.java +++ b/src/test/java/org/bigml/binding/RunCukesTest.java @@ -11,44 +11,28 @@ @Cucumber.Options(format = { "pretty", "html:target/cucumber-html-report" }, glue = {"org.bigml.binding"}, features = { - "src/test/resources/test_01_prediction.feature", - "src/test/resources/test_03_local_prediction.feature", - "src/test/resources/test_04_multivote_prediction.feature", - "src/test/resources/test_05_compare_predictions.feature", - "src/test/resources/test_06_batch_predictions.feature", - "src/test/resources/test_07_multimodel_batch_predictions.feature", - "src/test/resources/test_08_multimodel.feature", - "src/test/resources/test_09_ensemble_prediction.feature", - "src/test/resources/test_10_local_ensemble_prediction.feature", - "src/test/resources/test_11_multimodel_prediction.feature", - "src/test/resources/test_12_public_model_prediction.feature", - "src/test/resources/test_13_public_dataset.feature", - "src/test/resources/test_14_create_evaluations.feature", - "src/test/resources/test_15_download_dataset.feature", - "src/test/resources/test_16_sample_dataset.feature", - "src/test/resources/test_17_split_dataset.feature", - "src/test/resources/test_18_create_anomaly.feature", - "src/test/resources/test_19_missing_and_errors.feature", - "src/test/resources/test_20_rename_duplicated_names.feature", - "src/test/resources/test_21_projects.feature", - 
"src/test/resources/test_24_cluster_derived.feature", - "src/test/resources/test_25_correlation.feature", - "src/test/resources/test_26_statistical_test.feature", - "src/test/resources/test_27_logistic_regression.feature", - "src/test/resources/test_28_associations.feature", - "src/test/resources/test_29_script.feature", - "src/test/resources/test_30_execution.feature", - "src/test/resources/test_31_library.feature", - "src/test/resources/test_32_topic_model_prediction.feature", - "src/test/resources/test_33_compare_predictions.feature", - "src/test/resources/test_34_timeseries.feature", - "src/test/resources/test_35_compare_predictions.feature", - "src/test/resources/test_36_compare_predictions.feature", - "src/test/resources/test_38_organization.feature", - "src/test/resources/test_39_optiml_fusion.feature", - "src/test/resources/test_42_pca.feature", - "src/test/resources/test_43_linear_regression.feature", + "src/test/resources/test_anomaly.feature", + "src/test/resources/test_association.feature", + "src/test/resources/test_batchpredictions.feature", + "src/test/resources/test_cluster.feature", "src/test/resources/test_configurations.feature", + "src/test/resources/test_correlation.feature", + "src/test/resources/test_dataset.feature", + "src/test/resources/test_deepnet.feature", + "src/test/resources/test_ensemble.feature", + "src/test/resources/test_evaluation.feature", + "src/test/resources/test_linearregression.feature", + "src/test/resources/test_logisticregression.feature", + "src/test/resources/test_model.feature", + "src/test/resources/test_optiml_fusion.feature", + //"src/test/resources/test_organization.feature", + "src/test/resources/test_pca.feature", + "src/test/resources/test_project.feature", + "src/test/resources/test_sample_dataset.feature", + "src/test/resources/test_statisticaltest.feature", + "src/test/resources/test_timeseries.feature", + "src/test/resources/test_topicmodel.feature", + "src/test/resources/test_whizzml.feature", 
"src/test/resources/delete_all_test_data.feature" }) public class RunCukesTest { } diff --git a/src/test/resources/test_01_prediction.feature b/src/test/resources/test_01_prediction.feature deleted file mode 100755 index b1dd8c6..0000000 --- a/src/test/resources/test_01_prediction.feature +++ /dev/null @@ -1,120 +0,0 @@ - -Feature: Create Predictions -In order to create a prediction -I need to create a model first - - Scenario Outline: Successfully creating a prediction: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - When I create a prediction for "" - Then the prediction for "" is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | - | data/iris.csv | 10 | 10 | 10 | {"petal width": 0.5} | 000004 | Iris-setosa | - | data/iris_sp_chars.csv | 10 | 10 | 10 | {"pétal&width\u0000": 0.5} | 000004 | Iris-setosa | - - - Scenario Outline: Successfully creating a prediction from a source in a remote location: - Given I create a data source using the url "" - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - When I create a prediction for "" - Then the prediction for "" is "" - Then delete test data - - Examples: - | url | time_1 | time_2 | time_3 | data_input | objective | prediction | - | s3://bigml-public/csv/iris.csv | 10 | 10 | 10 | {"petal width": 0.5} | 000004 | Iris-setosa | - - -# Scenario Outline: Successfully creating a prediction from a asynchronous uploaded file: -# Given I create a data source uploading a "" file in asynchronous mode -# And I wait until the source has been created less than secs -# And I wait until the source is 
ready less than secs -# And I create a dataset -# And I wait until the dataset is ready less than secs -# And I create a model -# And I wait until the model is ready less than secs -# When I create a prediction for "" -# Then the prediction for "" is "" -# Then delete test data - -# Examples: -# | data | time_1 | time_2 | time_3 | time_4 | data_input | objective | prediction | -# | data/iris.csv | 10 | 10 | 10 | 10 | {"petal width": 0.5} | 000004 | Iris-setosa | - - - Scenario Outline: Successfully creating a prediction from inline data source: - Given I create a data source from inline data slurped from "" - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - When I create a prediction for "" - Then the prediction for "" is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | - | data/iris.csv | 10 | 10 | 10 | {"petal width": 0.5} | 000004 | Iris-setosa | - - - Scenario Outline: Successfully creating a centroid and the associated dataset: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a cluster - And I wait until the cluster is ready less than secs - When I create a centroid for "" - And I check the centroid is ok - Then the centroid is "" - And I create a dataset from the cluster and the centroid - And I wait until the dataset is ready less than secs - And I check that the dataset is created for the cluster and the centroid - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | centroid | - | data/diabetes.csv | 10 | 10 | 30 | {"pregnancies": 0, "plasma glucose": 118, "blood pressure": 84, "triceps skin thickness": 47, "insulin": 230, "bmi": 45.8, "diabetes 
pedigree": 0.551, "age": 31, "diabetes": "true"} | Cluster 3 | - - - Scenario Outline: Successfully creating an anomaly score: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create an anomaly detector from a dataset - And I wait until the anomaly detector is ready less than secs - When I create an anomaly score for "" - Then the anomaly score is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | score | - | data/tiny_kdd.csv | 10 | 10 | 100 | {"src_bytes": 350} | 0.92846 | - | data/iris_sp_chars.csv | 10 | 10 | 100 | {"pétal&width\u0000": 300} | 0.89313 | - - - Scenario Outline: Successfully creating a topic model: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I update the source with "" waiting less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - When I create topic model from a dataset - Then I wait until the topic model is ready less than secs - - Examples: - | data | time_1 | time_2 | time_3 | options | - | data/movies.csv | 10 | 10 | 100 | {"fields": {"000007": {"optype": "items", "item_analysis": {"separator": "$"}}, "000006": {"optype": "text"}}} | diff --git a/src/test/resources/test_03_local_prediction.feature b/src/test/resources/test_03_local_prediction.feature deleted file mode 100755 index de55593..0000000 --- a/src/test/resources/test_03_local_prediction.feature +++ /dev/null @@ -1,41 +0,0 @@ -Feature: Create Predictions - In order to create a prediction - I need to create a model first - - Scenario Outline: Successfully creating a prediction from a local model in a json file: - Given I create a local model from a "" file - And the local prediction for "" is "" - And the confidence of the local prediction for "" is - Then delete test data - - Examples: - | 
model | data_input | prediction | confidence | - | data/iris_model.json | {"petal length": 0.5} | Iris-setosa | 0.90594 | - - - Scenario Outline: Successfully creating a multiple prediction from a local model in a json file: - Given I create a local model from a "" file - And the multiple local prediction for "" is "" - Then delete test data - - Examples: - | model | data_input | prediction | - | data/iris_model.json | {"petal length": 3} | [{"probability":0.5060240963855421,"confidence":0.4006020980792863,"prediction":"Iris-versicolor","count":42},{"probability":0.4939759036144578,"confidence":0.3890868795664999,"prediction":"Iris-virginica","count":41}] | - - - - Scenario Outline: Successfully creating a prediction from local model - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - And I create a local model - Then the local prediction for "" is "" - Then the local prediction for "" is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | objective1 | prediction1 | objective2 | prediction2 | - | data/iris.csv | 15 | 15 | 15 | {"petal width": 0.5} | Iris-setosa | {"000003": 0.5} | Iris-setosa | diff --git a/src/test/resources/test_04_multivote_prediction.feature b/src/test/resources/test_04_multivote_prediction.feature deleted file mode 100755 index e43075f..0000000 --- a/src/test/resources/test_04_multivote_prediction.feature +++ /dev/null @@ -1,30 +0,0 @@ -Feature: Compute MultiVote predictions - In order compute combined predictions - I need to create a MultiVote object - - Scenario Outline: Successfully computing predictions combinations: - Given I create a MultiVote for the set of predictions in file - When I compute the prediction with confidence using method "" - Then the combined prediction is "" - And the confidence for 
the combined prediction is - Then delete test data - - Examples: - | predictions | method | prediction | confidence | - | data/predictions_c.json| 0 | a | 0.450471270879 | - | data/predictions_c.json| 1 | a | 0.552021302649 | - | data/predictions_c.json| 2 | a | 0.40363 | - - - Scenario Outline: Successfully computing predictions combinations: - Given I create a MultiVote for the set of predictions in file - When I compute the prediction with confidence using method "" - Then the numerical combined prediction is - And the confidence for the combined prediction is - Then delete test data - - Examples: - | predictions | method | prediction | confidence | - | data/predictions_r.json| 0 | 1.55555556667 | 0.400079152063 | - | data/predictions_r.json| 1 | 1.59376845074 | 0.248366474212 | - | data/predictions_r.json| 2 | 1.55555556667 | 0.400079152063 | diff --git a/src/test/resources/test_07_multimodel_batch_predictions.feature b/src/test/resources/test_07_multimodel_batch_predictions.feature deleted file mode 100755 index 61b8165..0000000 --- a/src/test/resources/test_07_multimodel_batch_predictions.feature +++ /dev/null @@ -1,26 +0,0 @@ -Feature: Create Batch Predictions from Multi Models - In order to create a prediction from a multi model - I need to create a multi model first - - Scenario Outline: Successfully creating a batch prediction from a multi model: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model with "" - And I wait until the model is ready less than secs - And I create a model with "" - And I wait until the model is ready less than secs - And I create a model with "" - And I wait until the model is ready less than secs - And I retrieve a list of remote models tagged with "" - And I create a local multi model - When I create a batch prediction for "" and save it in "" - And I combine the votes in "" 
- Then the plurality combined predictions are "" - And the confidence weighted predictions are "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | params | tag | data_input | path | predictions | - | data/iris.csv | 10 | 10 | 10 | {"tags":["mytag"]} | mytag | [{"petal width": 0.5}, {"petal length": 6, "petal width": 2}, {"petal length": 4, "petal width": 1.5}] | data | ["Iris-setosa", "Iris-virginica", "Iris-versicolor"] | diff --git a/src/test/resources/test_08_multimodel.feature b/src/test/resources/test_08_multimodel.feature deleted file mode 100755 index ed315bb..0000000 --- a/src/test/resources/test_08_multimodel.feature +++ /dev/null @@ -1,37 +0,0 @@ -Feature: Create a model from a dataset list - In order to create a model from a list of datasets - I need to create some datasets first - - Scenario Outline: Successfully creating a model from a dataset list: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I store the dataset id in a list - And I create a dataset - And I wait until the dataset is ready less than secs - And I store the dataset id in a list - Then I create a model from a dataset list - And I wait until the model is ready less than secs - And I check the model stems from the original dataset list - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | - | data/iris.csv | 30 | 30 | 30 | 30 | - - - Scenario Outline: Successfully creating a model from a dataset list and predicting with it using median: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - Then I create a model - And I wait until the model is ready less than secs - And I create a local multi model - And I create a local mm median batch prediction using "" with 
prediction - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | input_data | prediction | - | data/grades.csv | 30 | 30 | 30 | {"Tutorial": 99.47, "Midterm": 53.12, "TakeHome": 87.96} | 63.33 | diff --git a/src/test/resources/test_09_ensemble_prediction.feature b/src/test/resources/test_09_ensemble_prediction.feature deleted file mode 100755 index dc39d95..0000000 --- a/src/test/resources/test_09_ensemble_prediction.feature +++ /dev/null @@ -1,37 +0,0 @@ -Feature: Create Predictions from Ensembles - In order to create a prediction from an ensemble - I need to create an ensemble first - - Scenario Outline: Successfully creating a prediction from an ensemble: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create an ensemble of models - And I wait until the ensemble is ready less than secs - When I create a prediction with ensemble for "" - And I wait until the prediction is ready less than secs - Then the prediction for "" is "" - Then delete test data - - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | number_of_models | data_input | objective | prediction | - | data/iris.csv | 10 | 10 | 100 | 20 | 5 | {"petal width": 0.5} | 000004 | Iris-versicolor | - | data/iris_sp_chars.csv | 10 | 10 | 100 | 20 | 5 | {"pétal&width\u0000": 0.5} | 000004 | Iris-versicolor | - - - Scenario Outline: Successfully creating a prediction from an ensemble: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create an ensemble of models - And I wait until the ensemble is ready less than secs - When I create a prediction with ensemble for "" - And I wait until the prediction is ready less than secs - Then the numerical prediction for "" is - Then delete test data - - | 
data/grades.csv | 10 | 10 | 150 | 20 | 10 |{"Assignment": 81.22, "Tutorial": 91.95, "Midterm": 79.38, "TakeHome": 105.93} | 000005 | 84.556 | - | data/grades.csv | 10 | 10 | 150 | 20 | 10 |{"Assignment": 97.33, "Tutorial": 106.74, "Midterm": 76.88, "TakeHome": 108.89} | 000005 | 73.13558 | diff --git a/src/test/resources/test_10_local_ensemble_prediction.feature b/src/test/resources/test_10_local_ensemble_prediction.feature deleted file mode 100755 index f77f4fe..0000000 --- a/src/test/resources/test_10_local_ensemble_prediction.feature +++ /dev/null @@ -1,110 +0,0 @@ -Feature: Create Predictions locally from Ensembles - In order to create a local prediction from an ensemble - I need to create an Ensemble first - - Scenario Outline: Successfully creating a local prediction from an Ensemble: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create an ensemble of models - And I wait until the ensemble is ready less than secs - And I create a local ensemble - When the local ensemble prediction for "" is "" with confidence - #And the local probabilities are "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | number_of_models | data_input |prediction | confidence | - | data/iris.csv | 50 | 50 | 50 | 5 | {"petal width": 0.5} | Iris-versicolor | 0.3687 | ["0.3403","0.4150","0.2447"] | - - - Scenario Outline: Successfully obtaining field importance from an Ensemble: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model with "" - And I wait until the model is ready less than secs - And I create a model with "" - And I wait until the model is ready less than secs - And I create a model with "" - And I wait until the model is ready less than secs - When I 
create a local Ensemble with the last models - Then the field importance text is "" - Then delete test data - - Examples: - | data | time_1 | time_2 |parms1 | time_3 |parms2 | time_4 |parms3| time_5 |number_of_models |field_importance | - | data/iris.csv | 50 | 50 |{"input_fields": ["000000", "000001","000003", "000004"]} |20 |{"input_fields": ["000000", "000001","000002", "000004"]} | 20 |{"input_fields": ["000000", "000001","000002", "000003", "000004"]} | 20 | 3 |[["000002", 0.5269933333333333], ["000003", 0.38936], ["000000", 0.04662333333333333], ["000001", 0.037026666666666666]] | - - - Scenario Outline: Successfully creating a local prediction from an Ensemble adding confidence: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create an ensemble of models - And I wait until the ensemble is ready less than secs - And I create a local ensemble - #When I create a local ensemble prediction for "" in JSON adding confidence - When the local ensemble prediction for "" is "" with confidence - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | number_of_models | data_input |prediction | confidence | - | data/iris.csv | 50 | 50 | 50 | 5 | {"petal width": 0.5} | Iris-versicolor | 0.3687 | - - - # THIS SCENARIO IS NOT NECESSARY IN THE JAVA BINDING BECAUSE WE ONLY CAN CREATE LOCALENSEMBLES - # USING A LIST OF MODEL IDs. THIS IS ENOUGH TO CREATE THE LOCAL ENSEMBLES. 
- # - # Scenario Outline: Successfully obtaining field importance from an Ensemble created from local models: - # Given I create a data source uploading a "" file - # And I wait until the source is ready less than secs - # And I create a dataset - # And I wait until the dataset is ready less than secs - # And I create a model with "" - # And I wait until the model is ready less than secs - # And I create a model with "" - # And I wait until the model is ready less than secs - # And I create a model with "" - # And I wait until the model is ready less than secs - # When I create a local Ensemble with the last local models - # Then the field importance text is "" - # Then delete test data - # - # Examples: - # | data | time_1 | time_2 |parms1 | time_3 |parms2 | time_4 |parms3| time_5 |number_of_models |field_importance | - # | data/iris.csv | 50 | 50 |{"input_fields": ["000000", "000001","000003", "000004"]} |20 |{"input_fields": ["000000", "000001","000002", "000004"]} | 20 |{"input_fields": ["000000", "000001","000002", "000003", "000004"]} | 20 | 3 |[["000002", 0.5269933333333333], ["000003", 0.38936], ["000000", 0.04662333333333333], ["000001", 0.037026666666666666]] | - - - Scenario Outline: Successfully creating a local prediction from an Ensemble: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create an ensemble of models - And I wait until the ensemble is ready less than secs - And I create a local ensemble - When the local ensemble prediction using median with confidence for "" is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | number_of_models | data_input |prediction | - | data/grades.csv | 50 | 50 | 50 | 2 | {} | 69.0934 | - - - Scenario Outline: Successfully creating a local prediction from an Ensemble with max models: - Given I create a data source uploading a "" file - And I 
wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create an ensemble of models - And I wait until the ensemble is ready less than secs - And I create a local ensemble with max models - When the local ensemble prediction for "" is "" - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | number_of_models | max_models | data_input |prediction | - | data/iris.csv | 50 | 50 | 50 | 20 | 5 | 2 | {"petal width": 0.5} | Iris-versicolor | diff --git a/src/test/resources/test_11_multimodel_prediction.feature b/src/test/resources/test_11_multimodel_prediction.feature deleted file mode 100755 index 5987b4a..0000000 --- a/src/test/resources/test_11_multimodel_prediction.feature +++ /dev/null @@ -1,44 +0,0 @@ -Feature: Create Predictions from Multi Models - In order to create a prediction from a multi model - I need to create a multi model first - - Scenario Outline: Successfully creating a prediction from a multi model: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model with "" - And I wait until the model is ready less than secs - And I create a model with "" - And I wait until the model is ready less than secs - And I create a model with "" - And I wait until the model is ready less than secs - And I retrieve a list of remote models tagged with "" - And I create a local multi model - Then the local multi prediction for "" is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | params | tag | data_input | prediction | - | data/iris.csv | 10 | 10 | 10 | {"tags":["mytag"]} | mytag | {"petal width": 0.5} | Iris-setosa | - - - Scenario Outline: Successfully creating a local batch prediction from a multi model: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs 
- And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model with "" - And I wait until the model is ready less than secs - And I create a model with "" - And I wait until the model is ready less than secs - And I create a model with "" - And I wait until the model is ready less than secs - And I retrieve a list of remote models tagged with "" - And I create a local multi model - Then I create a batch multimodel prediction for "" and predictions "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | params | tag | data_inputs | predictions | - | data/iris.csv | 10 | 10 | 10 | {"tags":["mytag"]} | mytag | [{"petal width": 0.5}, {"petal length": 6, "petal width": 2}] | ["Iris-setosa", "Iris-virginica"] | diff --git a/src/test/resources/test_12_public_model_prediction.feature b/src/test/resources/test_12_public_model_prediction.feature deleted file mode 100755 index 4f09de4..0000000 --- a/src/test/resources/test_12_public_model_prediction.feature +++ /dev/null @@ -1,44 +0,0 @@ -Feature: Create Predictions from public Model - In order to create a prediction from a public model - I need to create a public model - - Scenario Outline: Successfully creating a prediction using a public model: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - And I make the model public - And I wait until the model is ready less than secs - And I check the model status using the model's public url - When I create a prediction for "" - Then the prediction for "" is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | - | data/iris.csv | 10 | 10 | 10 | {"petal width": 0.5} | 000004 | Iris-setosa | - - - - Scenario Outline: Successfully creating a prediction 
using a shared model: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - And I make the model shared - And I wait until the model is ready less than secs - And I get the model sharing info - And I check the model status using the model's shared url - And I check the model status using the model's shared key - And I create a local model - When I create a prediction for "" - Then the prediction for "" is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | - | data/iris.csv | 10 | 10 | 10 | {"petal width": 0.5} | 000004 | Iris-setosa | \ No newline at end of file diff --git a/src/test/resources/test_13_public_dataset.feature b/src/test/resources/test_13_public_dataset.feature deleted file mode 100755 index 46b7a97..0000000 --- a/src/test/resources/test_13_public_dataset.feature +++ /dev/null @@ -1,18 +0,0 @@ -Feature: Create and read a public dataset - In order to read a public dataset - I need to create a public dataset - - Scenario Outline: Successfully creating and reading a public dataset: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I make the dataset public - And I wait until the dataset is ready less than secs - When I get the dataset status using the dataset's public url - Then the dataset's status is FINISHED - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | - | data/iris.csv | 20 | 20 | 20 | diff --git a/src/test/resources/test_15_download_dataset.feature b/src/test/resources/test_15_download_dataset.feature deleted file mode 100755 index e649bcd..0000000 --- a/src/test/resources/test_15_download_dataset.feature +++ 
/dev/null @@ -1,16 +0,0 @@ -Feature: Create and read a public dataset - In order to read a public dataset - I need to create a public dataset - - Scenario Outline: Successfully exporting a dataset: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I download the dataset file to "" - Then the dataset file "" is like "" - Then delete test data - - Examples: - | data | time_1 | time_2 | local_file | - | data/iris.csv | 30 | 30 | data/exported_iris.csv | diff --git a/src/test/resources/test_17_split_dataset.feature b/src/test/resources/test_17_split_dataset.feature deleted file mode 100755 index ec27e8e..0000000 --- a/src/test/resources/test_17_split_dataset.feature +++ /dev/null @@ -1,18 +0,0 @@ -Feature: Create a split dataset - In order to create a split dataset - I need to create an origin dataset - - Scenario Outline: Successfully creating a split dataset: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a dataset extracting a sample - And I wait until the dataset is ready less than secs - When I compare the datasets' instances - Then the proportion of instances between datasets is - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | rate | - | data/iris.csv | 10 | 10 | 10 | 0.8 | diff --git a/src/test/resources/test_18_create_anomaly.feature b/src/test/resources/test_18_create_anomaly.feature deleted file mode 100755 index 9979c56..0000000 --- a/src/test/resources/test_18_create_anomaly.feature +++ /dev/null @@ -1,41 +0,0 @@ -Feature: Create an anomaly detector from a dataset or dataset list - In order to create an anomaly detector from a list of datasets - I need to create some datasets first - - Scenario Outline: Successfully creating an anomaly 
detector from a dataset and a dataset list: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - Then I create an anomaly detector from a dataset - And I wait until the anomaly detector is ready less than secs - And I check the anomaly detector stems from the original dataset - And I store the dataset id in a list - And I create a dataset - And I wait until the dataset is ready less than secs - And I store the dataset id in a list - Then I create an anomaly detector from a dataset list - And I wait until the anomaly detector is ready less than secs - And I check the anomaly detector stems from the original dataset list - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | - | data/iris.csv | 40 | 40 | 40 | 100 | - | data/tiny_kdd.csv | 40 | 40 | 40 | 100 | - - - Scenario Outline: Successfully creating an anomaly detector from a dataset and generating the anomalous dataset: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - Then I create an anomaly detector of anomalies from a dataset - And I wait until the anomaly detector is ready less than secs - And I create a dataset with only the anomalies - And I wait until the dataset is ready less than secs - And I check that the dataset has rows - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | rows| - | data/iris.csv | 40 | 40 | 40 | 100 | 1 | diff --git a/src/test/resources/test_19_missing_and_errors.feature b/src/test/resources/test_19_missing_and_errors.feature deleted file mode 100755 index b4a9c96..0000000 --- a/src/test/resources/test_19_missing_and_errors.feature +++ /dev/null @@ -1,30 +0,0 @@ -Feature: Obtain missing values and errors counters - In order to get the missing values and errors - I need to 
create a dataset first - - Scenario Outline: Successfully obtaining missing values counts: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I update the source with "" waiting less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - When I ask for the missing values counts in the fields - Then the missing values counts dict is "" - - Examples: - | data | time_1 | params | time_2 | missing_values | - | data/iris_missing.csv | 30 | {"fields": {"000000": {"optype": "numeric"}}} |30 | {"000000": 1} | - - - Scenario Outline: Successfully obtaining parsing error counts: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I update the source with "" waiting less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - When I ask for the error counts in the fields - Then the error counts dict is "" - - Examples: - | data | time_1 | params | time_2 |error_values | - | data/iris_missing.csv | 30 | {"fields": {"000000": {"optype": "numeric"}}} |30 |{"000000": 1} | diff --git a/src/test/resources/test_20_rename_duplicated_names.feature b/src/test/resources/test_20_rename_duplicated_names.feature deleted file mode 100755 index a174118..0000000 --- a/src/test/resources/test_20_rename_duplicated_names.feature +++ /dev/null @@ -1,20 +0,0 @@ -Feature: Rename duplicated names - In order rename the duplicated field names - I need to create a model first - Then I need to create a local model - - Scenario Outline: Successfully changing duplicated field names: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset with "" - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - And I create a local model - Then "" field's name is changed to "" - 
Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | options | field_id | new_name | - | data/iris.csv | 20 | 20 | 30 | {"tags": ["unitTest"], "fields": {"000001": {"name": "species"}}} | 000001 | species1 | - | data/iris.csv | 20 | 20 | 30 | {"tags": ["unitTest"], "fields": {"000001": {"name": "petal width"}}} | 000001 | petal width3 | diff --git a/src/test/resources/test_24_cluster_derived.feature b/src/test/resources/test_24_cluster_derived.feature deleted file mode 100755 index 6c6a576..0000000 --- a/src/test/resources/test_24_cluster_derived.feature +++ /dev/null @@ -1,51 +0,0 @@ -Feature: Clusters - - Scenario Outline: Successfully creating datasets for first centroid of a cluster: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a cluster - And I wait until the cluster is ready less than secs - When I create a dataset associated to centroid "" - And I wait until the dataset is ready less than secs - Then the dataset is associated to the centroid "" of the cluster - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | centroid_id | - | data/iris.csv | 10 | 10 | 40 | 10 | 000001 | - - - Scenario Outline: Successfully creating models for first centroid of a cluster: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I wait until the dataset is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a cluster with options "" - And I wait until the cluster is ready less than secs - When I create a model associated to centroid "" - And I wait until the model is ready less than secs - - Then the model is associated to the centroid "" of the cluster - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | options | centroid_id | - | data/iris.csv | 10 | 10 | 40 | 10 | 
{"model_clusters": true} | 000001 | - - - Scenario Outline: Successfully getting the closest point in a cluster - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a cluster - And I wait until the cluster is ready less than secs - And I create a local cluster - Then the data point in the cluster closest to "" is "" - Then the data point in the cluster closest to "" is "" - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | reference | closest | - | data/iris.csv | 10 | 10 | 40 | 20 | {"petal length": 1.4, "petal width": 0.2, "sepal width": 3.0, "sepal length": 4.89, "species": "Iris-setosa"} | {"distance": 0.001894153207990619, "data": {"petal length": 1.4, "petal width": 0.2, "sepal width": 3.0, "sepal length": 4.9, "species": "Iris-setosa"}} | - diff --git a/src/test/resources/test_27_logistic_regression.feature b/src/test/resources/test_27_logistic_regression.feature deleted file mode 100755 index e5b8829..0000000 --- a/src/test/resources/test_27_logistic_regression.feature +++ /dev/null @@ -1,17 +0,0 @@ -Feature: Testing logisticregressions REST api calls - - Scenario Outline: Successfully creating a logisticregression from a dataset: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a logisticregression from a dataset - And I wait until the logisticregression is ready less than secs - And I update the logisticregression name to "" - When I wait until the logisticregression is ready less than secs - Then the logisticregression name is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | logisticregression_name | - | data/iris.csv | 10 | 10 | 20 | 20 | my new logisticregression name | diff --git 
a/src/test/resources/test_29_script.feature b/src/test/resources/test_29_script.feature deleted file mode 100755 index 2a17f56..0000000 --- a/src/test/resources/test_29_script.feature +++ /dev/null @@ -1,13 +0,0 @@ -Feature: Testing Whizzml Script REST api calls - - Scenario Outline: Scenario: Successfully creating a whizzml library: - Given I create a whizzml script from a excerpt of code "" - And I wait until the script is ready less than secs - And I update the script name to "" - When I wait until the script is ready less than secs - Then the script name is "" - Then the script code is "" - - Examples: - | source_code | time_1 | time_2 | script_name | - | (+ 1 1) | 10 | 10 | my script | diff --git a/src/test/resources/test_31_library.feature b/src/test/resources/test_31_library.feature deleted file mode 100755 index c17e07c..0000000 --- a/src/test/resources/test_31_library.feature +++ /dev/null @@ -1,13 +0,0 @@ -Feature: Testing Whizzml Library REST api calls - - Scenario Outline: Scenario: Successfully creating a whizzml library: - Given I create a whizzml library from a excerpt of code "" - And I wait until the library is ready less than secs - And I update the library name to "" - When I wait until the library is ready less than secs - Then the library name is "" - Then the library code is "" - - Examples: - | source_code | time_1 | time_2 | library_name | - | (define (mu x) (+ x 1)) | 10 | 10 | my library | diff --git a/src/test/resources/test_32_topic_model_prediction.feature b/src/test/resources/test_32_topic_model_prediction.feature deleted file mode 100755 index 3f50031..0000000 --- a/src/test/resources/test_32_topic_model_prediction.feature +++ /dev/null @@ -1,29 +0,0 @@ -Feature: Create Topic Distributions - In order to create a topic distribution - I need to create a topic model first - - #Scenario Outline: Successfully creating a local Topic Distribution - # Given I have a block of text and a topic model - # And I use the model to predict the topic 
distribution - # Then the value of the distribution matches the expected distribution - # - # Examples: - # | model | text | expected_distribution | - # | {...} | "hello, world!" | [0.5, 0.3, 0.2] | - - - Scenario Outline: Successfully creating Topic Model from a dataset: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I update the source with "" waiting less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create topic model from a dataset - And I wait until the topic model is ready less than secs - And I update the topic model name to "" - When I wait until the topic model is ready less than secs - Then the topic model name is "" - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | topic_model_name | params | - | data/spam.csv | 100 | 100 | 200 | 500 | my new topic model name | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": true, "stem_words": true, "use_stopwords": false, "language": "en"}}}} | diff --git a/src/test/resources/test_34_timeseries.feature b/src/test/resources/test_34_timeseries.feature deleted file mode 100755 index 03e1590..0000000 --- a/src/test/resources/test_34_timeseries.feature +++ /dev/null @@ -1,18 +0,0 @@ -Feature: Create TimeSeries - - Scenario Outline: Successfully creating forecasts from a dataset: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a time series from a dataset - And I wait until the time series is ready less than secs - And I update the time series name to "" - When I wait until the time series is ready less than secs - Then the time series name is "" - And I create a forecast for "" - Then the forecasts are "" - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | time_series_name | input_data | forecast_points | - | 
data/grades.csv | 100 | 100 | 200 | 500 | my new time series name | {"000005": {"horizon": 5}} | {"000005": [{"point_forecast": [73.96192, 74.04106, 74.12029, 74.1996, 74.27899], "model": "M,M,N"}]} | diff --git a/src/test/resources/test_36_compare_predictions.feature b/src/test/resources/test_36_compare_predictions.feature deleted file mode 100755 index 85a3bf6..0000000 --- a/src/test/resources/test_36_compare_predictions.feature +++ /dev/null @@ -1,200 +0,0 @@ -Feature: Testing Deepnet REST api calls - In order to create an deepnet - I need to create a dataset first - - Scenario Outline: Successfully comparing predictions for models with operating point - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - And I create a local model - When I create a prediction with model with operating point "" for "" - Then the prediction for "" is "" - When I create a local prediction with model with operating point "" for "" - Then the local model prediction is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | operating_point | - | data/iris.csv | 50 | 50 | 30000 | {"petal width": 4} | 000004 | Iris-setosa | {"kind": "probability", "threshold": 0.1, "positive_class": "Iris-setosa"} | - | data/iris.csv | 50 | 50 | 30000 | {"petal width": 4} | 000004 | Iris-versicolor | {"kind": "probability", "threshold": 0.9, "positive_class": "Iris-setosa"} | - | data/iris.csv | 50 | 50 | 30000 | {"sepal length": 4.1, "sepal width": 2.4} | 000004 | Iris-setosa | {"kind": "confidence", "threshold": 0.1, "positive_class": "Iris-setosa"} | - | data/iris.csv | 50 | 50 | 30000 | {"sepal length": 4.1, "sepal width": 2.4}| 000004 | Iris-versicolor | {"kind": "confidence", "threshold": 0.9, "positive_class": "Iris-setosa"} | - - 
- Scenario Outline: Successfully comparing predictions for models with operating kind - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - And I create a local model - When I create a prediction with model with operating kind "" for "" - Then the prediction for "" is "" - When I create a local prediction with model with operating kind "" for "" - Then the local model prediction is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | operating_kind | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2.46, "sepal length": 5} | 000004 | Iris-versicolor | probability | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2.46, "sepal length": 5} | 000004 | Iris-versicolor | confidence | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | probability | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | confidence | - - - Scenario Outline: Successfully comparing predictions for deepnets - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a deepnet with objective "" and params "" - And I wait until the deepnet is ready less than secs - And I create a local deepnet - When I create a deepnet prediction for "" - Then the deepnet prediction for objective "" is "" - And I create a local deepnet prediction for "" - Then the local deepnet prediction is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | params | - | data/iris.csv | 50 | 50 | 30000 | {"petal width": 4} | 000004 | Iris-virginica | {} | - | data/iris.csv | 50 | 50 | 
30000 | {"sepal length": 4.1, "sepal width": 2.4} | 000004 | Iris-setosa | {} | - | data/iris_missing2.csv | 50 | 50 | 30000 | {} | 000004 | Iris-setosa | {} | - | data/grades.csv | 50 | 50 | 30000 | {} | 000005 | 42.15473 | {} | - | data/spam.csv | 50 | 50 | 30000 | {} | 000000 | ham | {} | - | data/diabetes.csv | 50 | 50 | 30000 | {} | 000008 | false | {"search": true, "number_of_model_candidates": 10, "max_training_time": 600} | - | data/diabetes.csv | 50 | 50 | 30000 | {} | 000008 | false | {"learn_residuals": true, "number_of_model_candidates": 10, "max_training_time": 600} | - - - Scenario Outline: Successfully comparing predictions for deepnets with operating point - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a deepnet with objective "" and params "" - And I wait until the deepnet is ready less than secs - And I create a local deepnet - When I create a prediction with deepnet with operating point "" for "" - Then the prediction for "" is "" - When I create a local prediction with deepnet with operating point "" for "" - Then the local deepnet prediction is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | params | operating_point | - | data/iris.csv | 50 | 50 | 30000 | {"petal width": 4} | 000004 | Iris-versicolor | {} | {"kind": "probability", "threshold": 1, "positive_class": "Iris-virginica"} | - - - Scenario Outline: Successfully comparing predictions for deepnets with operating kind - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a deepnet with objective "" and params "" - And I wait until the deepnet is ready less than secs - And I create a local deepnet - When I create a prediction 
with deepnet with operating kind "" for "" - Then the prediction for "" is "" - When I create a local prediction with deepnet with operating kind "" for "" - Then the local deepnet prediction is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | params | operating_kind | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2.46} | 000004 | Iris-setosa | {} | probability | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | {} | probability | - - - Scenario Outline: Successfully comparing predictions for ensembles with operating point - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create an ensemble - And I wait until the ensemble is ready less than secs - And I create a local ensemble - When I create a prediction with ensemble with operating point "" for "" - Then the prediction for "" is "" - When I create a local prediction with ensemble with operating point "" for "" - Then the local ensemble prediction is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | operating_point | - | data/iris.csv | 50 | 50 | 30000 | {"petal width": 4} | 000004 | Iris-setosa | {"kind": "probability", "threshold": 0.1, "positive_class": "Iris-setosa"} | - | data/iris.csv | 50 | 50 | 30000 | {"petal width": 4} | 000004 | Iris-virginica | {"kind": "probability", "threshold": 0.9, "positive_class": "Iris-setosa"} | - | data/iris.csv | 50 | 50 | 30000 | {"sepal length": 4.1, "sepal width": 2.4} | 000004 | Iris-setosa | {"kind": "confidence", "threshold": 0.1, "positive_class": "Iris-setosa"} | - | data/iris.csv | 50 | 50 | 30000 | {"sepal length": 4.1, "sepal width": 2.4}| 000004 | Iris-versicolor | {"kind": "confidence", "threshold": 0.9, "positive_class": "Iris-setosa"} | - - - 
Scenario Outline: Successfully comparing predictions for ensembles with operating kind - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create an ensemble - And I wait until the ensemble is ready less than secs - And I create a local ensemble - When I create a prediction with ensemble with operating kind "" for "" - Then the prediction for "" is "" - When I create a local prediction with ensemble with operating kind "" for "" - Then the local ensemble prediction is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | operating_kind | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2.46} | 000004 | Iris-versicolor | probability | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | probability | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2.46} | 000004 | Iris-versicolor | confidence | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | confidence | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2.46} | 000004 | Iris-versicolor | votes | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | votes | - - - Scenario Outline: Successfully comparing predictions for logistic regressions with operating kind - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a logisticregression from a dataset - And I wait until the logisticregression is ready less than secs - And I create a local logisticregression - When I create a prediction with logisticregression with operating kind "" for "" - Then the prediction for "" is "" - When I create a local prediction with logisticregression with operating kind "" for "" - Then 
the local logisticregression prediction is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | operating_kind | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 5} | 000004 | Iris-versicolor | probability | - | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | probability | - - - Scenario Outline: Successfully comparing predictions for linear regression - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a linearregression with objective "" and params "" - And I wait until the linearregression is ready less than secs - When I create a linearregression prediction for "" - Then the linearregression prediction is "" - And I create a local linearregression - And I create a local linearregression prediction for "" - Then the local linearregression prediction is "" - - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | params | - | data/grades.csv | 50 | 50 | 30000 | {"000000": 1, "000001": 1, "000002": 1} | 000005 | 29.63024 | {"input_fields": ["000000", "000001", "000002"]} | - | data/iris.csv | 50 | 50 | 30000 | {"000000": 1, "000001": 1, "000004": "Iris-virginica"} | 000003 | 1.21187 | {"input_fields": ["000000", "000001", "000004"]} | - | data/movies.csv | 50 | 50 | 30000 | {"000007": "Action"} | 000009 | 4.33333 | {"input_fields": ["000007"]} | - - \ No newline at end of file diff --git a/src/test/resources/test_42_pca.feature b/src/test/resources/test_42_pca.feature deleted file mode 100755 index 1dfbe7d..0000000 --- a/src/test/resources/test_42_pca.feature +++ /dev/null @@ -1,54 +0,0 @@ -Feature: Testing REST api calls - - Scenario Outline: Successfully creating a PCA from a dataset: - Given I create a data source uploading a "" file - And I wait until the 
source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a pca from a dataset - And I wait until the pca is ready less than secs - And I update the pca name to "" - When I wait until the pca is ready less than secs - Then the pca name is "" - Then I delete the pca - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | pca_name | - | data/iris.csv | 50 | 50 | 100 | 100 | my new pca name | - - - Scenario Outline: Successfully creating a projection: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a pca from a dataset - And I wait until the pca is ready less than secs - When I create a projection for "" - Then the projection is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | data_input | projection | - | data/iris.csv | 50 | 50 | 100 | 100 | {"petal width": 0.5} | {"PC2": 0.1593, "PC3": -0.01286, "PC1": 0.91648, "PC6": 0.27284, "PC4": 1.29255, "PC5": 0.75196} | - - - Scenario Outline: Successfully creating a batch projection: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a pca from a dataset - And I wait until the pca is ready less than secs - - When I create a batch projection for the dataset with the pca - And I wait until the batch projection is ready less than secs - - And I download the created projections file to "" - Then the batch projection file is like "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | local_file | projections_file | - | data/iris.csv | 30 | 30 | 50 | 50 | data/batch_projections.csv | data/batch_projections.csv | \ No newline at end of file diff --git 
a/src/test/resources/test_anomaly.feature b/src/test/resources/test_anomaly.feature new file mode 100755 index 0000000..b548503 --- /dev/null +++ b/src/test/resources/test_anomaly.feature @@ -0,0 +1,75 @@ +Feature: Anomaly + + Scenario Outline: Successfully creating an anomaly detector from a dataset and a dataset list: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + Then I create an anomaly detector from a dataset + And I wait until the anomaly detector is ready less than secs + And I check the anomaly detector stems from the original dataset + And I store the dataset id in a list + And I create a dataset + And I wait until the dataset is ready less than secs + And I store the dataset id in a list + Then I create an anomaly detector from a dataset list + And I wait until the anomaly detector is ready less than secs + And I check the anomaly detector stems from the original dataset list + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | time_4 | + | data/iris.csv | 40 | 40 | 40 | 100 | + | data/tiny_kdd.csv | 40 | 40 | 40 | 100 | + + + Scenario Outline: Successfully creating an anomaly detector from a dataset and generating the anomalous dataset: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + Then I create an anomaly detector of anomalies from a dataset + And I wait until the anomaly detector is ready less than secs + And I create a dataset with only the anomalies + And I wait until the dataset is ready less than secs + And I check that the dataset has rows + + Examples: + | data | time_1 | time_2 | time_3 | time_4 | rows| + | data/iris.csv | 40 | 40 | 40 | 100 | 1 | + + + Scenario Outline: Successfully creating an anomaly score: + Given I create a data source 
uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create an anomaly detector from a dataset + And I wait until the anomaly detector is ready less than secs + When I create an anomaly score for "" + Then the anomaly score is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | score | + | data/tiny_kdd.csv | 10 | 10 | 100 | {"src_bytes": 350} | 0.92846 | + | data/iris_sp_chars.csv | 10 | 10 | 100 | {"pétal&width\u0000": 300} | 0.89313 | + + + Scenario Outline: Successfully comparing scores from anomaly detectors: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create an anomaly detector from a dataset + And I wait until the anomaly detector is ready less than secs + And I create a local anomaly detector + When I create an anomaly score for "" + Then the anomaly score is "" + And I create a local anomaly score for "" + Then the local anomaly score is + + Examples: + | data | time_1 | time_2 | time_3 | data_input | score | + | data/tiny_kdd.csv | 20 | 20 | 30 | {"000020": 255.0, "000004": 183.0, "000016": 4.0, "000024": 0.04, "000025": 0.01, "000026": 0.0, "000019": 0.25, "000017": 4.0, "000018": 0.25, "00001e": 0.0, "000005": 8654.0, "000009": "0", "000023": 0.01, "00001f": 123.0} | 0.69802 | + \ No newline at end of file diff --git a/src/test/resources/test_28_associations.feature b/src/test/resources/test_association.feature similarity index 95% rename from src/test/resources/test_28_associations.feature rename to src/test/resources/test_association.feature index 1047b32..ada100e 100755 --- a/src/test/resources/test_28_associations.feature +++ b/src/test/resources/test_association.feature @@ -1,6 +1,4 @@ -Feature: Testing Assocaitions REST api calls - In order to create an 
association - I need to create a dataset first +Feature: Association Scenario Outline: Successfully creating associations from a dataset: Given I create a data source uploading a "" file diff --git a/src/test/resources/test_06_batch_predictions.feature b/src/test/resources/test_batchpredictions.feature similarity index 52% rename from src/test/resources/test_06_batch_predictions.feature rename to src/test/resources/test_batchpredictions.feature index ec48929..b0dc747 100755 --- a/src/test/resources/test_06_batch_predictions.feature +++ b/src/test/resources/test_batchpredictions.feature @@ -1,8 +1,6 @@ -Feature: Create Batch Predictions - In order to create a batch prediction - I need to create a model and a dataset first - - Scenario Outline: Successfully creating a batch prediction: +Feature: Batch Predictions + + Scenario Outline: Successfully creating a batch prediction for a model: Given I create a data source uploading a "" file And I wait until the source is ready less than secs And I create a dataset @@ -19,8 +17,53 @@ Feature: Create Batch Predictions Examples: | data | time_1 | time_2 | time_3 | time_4 | local_file | predictions_file | | data/iris.csv | 30 | 30 | 50 | 50 | data/downloaded_batch_predictions.csv | data/batch_predictions.csv | + + + Scenario Outline: Successfully creating a batch prediction from a multi model: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model with "" + And I wait until the model is ready less than secs + And I create a model with "" + And I wait until the model is ready less than secs + And I create a model with "" + And I wait until the model is ready less than secs + And I retrieve a list of remote models tagged with "" + And I create a local multi model + When I create a batch prediction for "" and save it in "" + And I combine the votes in "" + Then the plurality
combined predictions are "" + And the confidence weighted predictions are "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | params | tag | data_input | path | predictions | + | data/iris.csv | 10 | 10 | 10 | {"tags":["mytag"]} | mytag | [{"petal width": 0.5}, {"petal length": 6, "petal width": 2}, {"petal length": 4, "petal width": 1.5}] | data | ["Iris-setosa", "Iris-virginica", "Iris-versicolor"] | + + + + Scenario Outline: Successfully creating a source from a batch prediction: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + When I create a batch prediction for the dataset with the model + And I wait until the batch prediction is ready less than secs + Then I create a source from the batch prediction + And I wait until the source is ready less than secs + Then delete test data - Scenario Outline: Successfully creating a batch prediction for an ensemble: + Examples: + | data | time_1 | time_2 | time_3 | time_4 | + | data/iris.csv | 30 | 30 | 50 | 50 | + | data/diabetes.csv | 30 | 30 | 50 | 50 | + + + Scenario Outline: Successfully creating a batch prediction for an ensemble: Given I create a data source uploading a "" file And I wait until the source is ready less than secs And I create a dataset @@ -36,45 +79,67 @@ Feature: Create Batch Predictions Examples: | data | time_1 | time_2 | number_of_models | time_3 | time_4 | local_file | predictions_file | | data/iris.csv | 30 | 30 | 5 | 80 | 50 | data/downloaded_batch_predictions_e.csv | data/batch_predictions_e.csv | - - - Scenario Outline: Successfully creating a batch centroid from a cluster: + + + Scenario Outline: Successfully creating a batch prediction for a logistic regression: Given I create a data source uploading a "" file And I wait until the source is ready less 
than secs And I create a dataset And I wait until the dataset is ready less than secs - And I create a cluster - And I wait until the cluster is ready less than secs - When I create a batch centroid for the dataset - And I wait until the batch centroid is ready less than secs - And I download the created centroid file to "" - Then the batch centroid file is like "" - Then delete test data + And I create a logisticregression from a dataset + And I wait until the logisticregression is ready less than secs + When I create a batch prediction for the dataset with the logisticregression + And I wait until the batch prediction is ready less than secs + And I download the created predictions file to "" + Then the batch prediction file "" is like "" Examples: - | data | time_1 | time_2 | time_3 | time_4 | local_file | predictions_file | - | data/diabetes.csv | 50 | 50 | 50 | 50 | data/downloaded_batch_predictions_c.csv |data/batch_predictions_c.csv | - + | data | time_1 | time_2 | time_3 | time_4 | local_file | predictions_file | + | data/iris.csv | 30 | 30 | 50 | 50 | data/downloaded_batch_predictions_lr.csv | data/batch_predictions_lr.csv | + + + Scenario Outline: Successfully creating a batch prediction for a linear regression: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a linearregression from a dataset + And I wait until the linearregression is ready less than secs + When I create a batch prediction for the dataset with the linearregression + And I wait until the batch prediction is ready less than secs + And I download the created predictions file to "" + Then the batch prediction file "" is like "" - Scenario Outline: Successfully creating a source from a batch prediction: + Examples: + | data | time_1 | time_2 | time_3 | time_4 | local_file | predictions_file | + | data/grades.csv | 30 | 30 | 50 | 50 | 
data/downloaded_batch_predictions_linear.csv | data/batch_predictions_linear.csv | + + + Scenario Outline: Successfully creating a fusion Given I create a data source uploading a "" file And I wait until the source is ready less than secs And I create a dataset And I wait until the dataset is ready less than secs And I create a model And I wait until the model is ready less than secs - When I create a batch prediction for the dataset with the model + And I create a model + And I wait until the model is ready less than secs + And I create a model + And I wait until the model is ready less than secs + And I create a fusion from models + And I wait until the fusion is ready less than secs + When I create a batch prediction for the dataset with the fusion And I wait until the batch prediction is ready less than secs - Then I create a source from the batch prediction - And I wait until the source is ready less than secs + And I download the created predictions file to "" + Then the batch prediction file "" is like "" Then delete test data - + Examples: - | data | time_1 | time_2 | time_3 | time_4 | - | data/iris.csv | 30 | 30 | 50 | 50 | - | data/diabetes.csv | 30 | 30 | 50 | 50 | - - + | data | time_1 | time_2 | time_3 | time_4 | local_file | predictions_file | + | data/iris.csv | 50 | 50 | 50 | 50 | data/downloaded_batch_predictions.csv | data/batch_predictions_fs.csv | + + + Scenario Outline: Successfully creating a batch anomaly score from an anomaly detector: Given I create a data source uploading a "" file And I wait until the source is ready less than secs @@ -91,20 +156,46 @@ Feature: Create Batch Predictions Examples: | data | time_1 | time_2 | time_3 | time_4 | local_file | predictions_file | | data/tiny_kdd.csv | 30 | 30 | 50 | 50 | data/downloaded_batch_predictions_a.csv | data/batch_predictions_a.csv | + + + Scenario Outline: Successfully creating a batch centroid from a cluster: + Given I create a data source uploading a "" file + And I wait until the 
source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a cluster + And I wait until the cluster is ready less than secs + When I create a batch centroid for the dataset + And I wait until the batch centroid is ready less than secs + + + + And I download the created centroid file to "" + Then the batch centroid file is like "" + Then delete test data - - Scenario Outline: Successfully creating a batch prediction for a logistic regression: + Examples: + | data | time_1 | time_2 | time_3 | time_4 | local_file | predictions_file | + | data/diabetes.csv | 50 | 50 | 50 | 50 | data/downloaded_batch_predictions_c.csv |data/batch_predictions_c.csv | + + + + Scenario Outline: Successfully creating a batch projection: Given I create a data source uploading a "" file And I wait until the source is ready less than secs And I create a dataset And I wait until the dataset is ready less than secs - And I create a logisticregression from a dataset - And I wait until the logisticregression is ready less than secs - When I create a batch prediction for the dataset with the logistic regression - And I wait until the batch prediction is ready less than secs - And I download the created predictions file to "" - Then the batch prediction file "" is like "" + And I create a pca from a dataset + And I wait until the pca is ready less than secs + + When I create a batch projection for the dataset with the pca + And I wait until the batch projection is ready less than secs + + And I download the created projections file to "" + Then the batch projection file is like "" + Then delete test data Examples: - | data | time_1 | time_2 | time_3 | time_4 | local_file | predictions_file | - | data/iris.csv | 30 | 30 | 50 | 50 | data/downloaded_batch_predictions_lr.csv | data/batch_predictions_lr.csv | + | data | time_1 | time_2 | time_3 | time_4 | local_file | projections_file | + | data/iris.csv | 30 | 30 | 50 | 50 | 
data/batch_projections.csv | data/batch_projections.csv | + \ No newline at end of file diff --git a/src/test/resources/test_33_compare_predictions.feature b/src/test/resources/test_cluster.feature similarity index 54% rename from src/test/resources/test_33_compare_predictions.feature rename to src/test/resources/test_cluster.feature index 875d605..92ce8db 100755 --- a/src/test/resources/test_33_compare_predictions.feature +++ b/src/test/resources/test_cluster.feature @@ -1,6 +1,26 @@ -Feature: Compare predictions +Feature: Clusters + + Scenario Outline: Successfully creating a centroid and the associated dataset: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a cluster + And I wait until the cluster is ready less than secs + When I create a centroid for "" + And I check the centroid is ok + Then the centroid is "" + And I create a dataset from the cluster and the centroid + And I wait until the dataset is ready less than secs + And I check that the dataset is created for the cluster and the centroid + Then delete test data - Scenario Outline: Successfully comparing centroids with or without text options: + Examples: + | data | time_1 | time_2 | time_3 | data_input | centroid | + | data/diabetes.csv | 10 | 10 | 30 | {"pregnancies": 0, "plasma glucose": 118, "blood pressure": 84, "triceps skin thickness": 47, "insulin": 230, "bmi": 45.8, "diabetes pedigree": 0.551, "age": 31, "diabetes": "true"} | Cluster 3 | + + + Scenario Outline: Successfully comparing centroids with or without text options: Given I create a data source uploading a "" file And I wait until the source is ready less than secs And I update the source with "" waiting less than secs @@ -49,91 +69,54 @@ Feature: Compare predictions | data | time_1 | time_2 | time_3 | options | data_input | centroid | distance | | data/iris.csv | 20 | 20 | 30 | 
{"summary_fields": ["sepal width"]} |{"petal length": 1, "petal width": 1, "sepal length": 1, "species": "Iris-setosa"} | Cluster 2 | 1.16436 | # | data/iris.csv | 20 | 20 | 30 | {"default_numeric_value": "zero"} |{"petal length": 1} | Cluster 4 | 1.41215 | - - - Scenario Outline: Successfully comparing scores from anomaly detectors: - Given I create a data source uploading a "" file + + + Scenario Outline: Successfully creating datasets for first centroid of a cluster: + Given I create a data source uploading a "" file And I wait until the source is ready less than secs And I create a dataset And I wait until the dataset is ready less than secs - And I create an anomaly detector from a dataset - And I wait until the anomaly detector is ready less than secs - And I create a local anomaly detector - When I create an anomaly score for "" - Then the anomaly score is "" - And I create a local anomaly score for "" - Then the local anomaly score is - - Examples: - | data | time_1 | time_2 | time_3 | data_input | score | - | data/tiny_kdd.csv | 20 | 20 | 30 | {"000020": 255.0, "000004": 183.0, "000016": 4.0, "000024": 0.04, "000025": 0.01, "000026": 0.0, "000019": 0.25, "000017": 4.0, "000018": 0.25, "00001e": 0.0, "000005": 8654.0, "000009": "0", "000023": 0.01, "00001f": 123.0} | 0.69802 | - - - Scenario Outline: Successfully comparing topic distributions: + And I create a cluster + And I wait until the cluster is ready less than secs + When I create a dataset associated to centroid "" + And I wait until the dataset is ready less than secs + Then the dataset is associated to the centroid "" of the cluster + + Examples: + | data | time_1 | time_2 | time_3 | time_4 | centroid_id | + | data/iris.csv | 10 | 10 | 40 | 10 | 000001 | + + + Scenario Outline: Successfully creating models for first centroid of a cluster: Given I create a data source uploading a "" file And I wait until the source is ready less than secs - And I update the source with "" waiting less than secs - 
And I create a dataset And I wait until the dataset is ready less than secs - And I create topic model from a dataset - And I wait until the topic model is ready less than secs - - And I create a local topic model - When I create a local topic distribution for "" - Then the local topic distribution is "" - - When I create a topic distribution for "" - Then the topic distribution is "" - - Examples: - | data | time_1 | time_2 | time_3 | options | data_input | topic_distribution | - | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": true, "stem_words": true, "use_stopwords": false, "language": "en"}}}} | {"Type": "ham", "Message": "Mobile call"} | [0.51133, 0.00388, 0.00574, 0.00388, 0.00388, 0.00388, 0.00388, 0.00388, 0.00388, 0.00388, 0.00388, 0.44801] | - | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": true, "stem_words": true, "use_stopwords": false, "language": "en"}}}} | {"Type": "ham", "Message": "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... 
Cine there got amore wat..."} | [0.39188, 0.00643, 0.00264, 0.00643, 0.08112, 0.00264, 0.37352, 0.0115, 0.00707, 0.00327, 0.00264, 0.11086] | - - - - - Scenario Outline: Successfully comparing predictions for ensembles - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs And I create a dataset And I wait until the dataset is ready less than secs - And I create an ensemble with "" - And I wait until the ensemble is ready less than secs - And I create a local ensemble - When I create a prediction with ensemble for "" - Then the prediction for "" is "" - When the local ensemble prediction for "" is "" + And I create a cluster with options "" + And I wait until the cluster is ready less than secs + When I create a model associated to centroid "" + And I wait until the model is ready less than secs + + Then the model is associated to the centroid "" of the cluster - Examples: - | data | time_1 | time_2 | time_3 | params | data_input | objective | prediction | - | data/iris_unbalanced.csv | 30 | 30 | 120 | {"boosting": {"iterations": 5}, "number_of_models": 5} |{"petal width": 4} | 000004 | Iris-virginica | - | data/grades.csv | 30 | 30 | 120 | {"boosting": {"iterations": 5}, "number_of_models": 5} |{"Midterm": 20} | 000005 | 61.61036 | - - - Scenario Outline: Successfully comparing predictions for ensembles with proportional missing strategy + Examples: + | data | time_1 | time_2 | time_3 | time_4 | options | centroid_id | + | data/iris.csv | 10 | 10 | 40 | 10 | {"model_clusters": true} | 000001 | + + + Scenario Outline: Successfully getting the closest point in a cluster Given I create a data source uploading a "" file And I wait until the source is ready less than secs And I create a dataset And I wait until the dataset is ready less than secs - And I create an ensemble with "" - And I wait until the ensemble is ready less than secs - And I create a local ensemble - When I create a proportional missing strategy 
prediction with ensemble with "" for "" - Then the prediction for "" is "" - And the confidence for the prediction is - And I create a proportional missing strategy local prediction with ensemble with "" for "" - Then the local ensemble prediction is "" - And the local ensemble confidence is - Then delete test data - + And I create a cluster + And I wait until the cluster is ready less than secs + And I create a local cluster + Then the data point in the cluster closest to "" is "" + Then the data point in the cluster closest to "" is "" + Examples: - | data | time_1 | time_2 | time_3 | params | data_input | objective | prediction | options | confidence | - | data/iris.csv | 30 | 30 | 120 | {"boosting": {"iterations": 5}} | {} | 000004 | Iris-virginica | {} | 0.33784 | - | data/iris.csv | 30 | 30 | 120 | {"number_of_models": 5} | {} | 000004 | Iris-versicolor | {"operating_kind": "confidence"} | 0.2923 | - | data/grades.csv | 30 | 30 | 120 | {"number_of_models": 5} | {} | 000005 | 70.50579 | {} | 30.7161 | - | data/grades.csv | 30 | 30 | 120 | {"number_of_models": 5} | {"Midterm": 20} | 000005 | 54.82214 | {"operating_kind": "confidence"} | 25.89672 | - | data/grades.csv | 30 | 30 | 120 | {"number_of_models": 5} | {"Midterm": 20} | 000005 | 45.4573 | {} | 29.58403 | - | data/grades.csv | 30 | 30 | 120 | {"number_of_models": 5} | {"Midterm": 20, "Tutorial": 90, "TakeHome": 100} | 000005 | 42.814 | {} | 31.51804 | - - \ No newline at end of file + | data | time_1 | time_2 | time_3 | time_4 | reference | closest | + | data/iris.csv | 10 | 10 | 40 | 20 | {"petal length": 1.4, "petal width": 0.2, "sepal width": 3.0, "sepal length": 4.89, "species": "Iris-setosa"} | {"distance": 0.001894153207990619, "data": {"petal length": 1.4, "petal width": 0.2, "sepal width": 3.0, "sepal length": 4.9, "species": "Iris-setosa"}} | + diff --git a/src/test/resources/test_configurations.feature b/src/test/resources/test_configurations.feature index ae275aa..fc18940 100755 --- 
a/src/test/resources/test_configurations.feature +++ b/src/test/resources/test_configurations.feature @@ -1,7 +1,5 @@ Feature: Testing Configuration REST api calls - In order to test the Configuration API - I need to create a configuration - + Scenario Outline: Successfully creating a configuration: Given I create a configuration with "" And I wait until the configuration is ready less than secs diff --git a/src/test/resources/test_25_correlation.feature b/src/test/resources/test_correlation.feature similarity index 100% rename from src/test/resources/test_25_correlation.feature rename to src/test/resources/test_correlation.feature diff --git a/src/test/resources/test_dataset.feature b/src/test/resources/test_dataset.feature new file mode 100755 index 0000000..4f2aa7c --- /dev/null +++ b/src/test/resources/test_dataset.feature @@ -0,0 +1,76 @@ +Feature: Dataset REST api calls + + Scenario Outline: Successfully creating and reading a public dataset: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I make the dataset public + And I wait until the dataset is ready less than secs + When I get the dataset status using the dataset's public url + Then the dataset's status is FINISHED + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | + | data/iris.csv | 20 | 20 | 20 | + + + Scenario Outline: Successfully exporting a dataset: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I download the dataset file to "" + Then the dataset file "" is like "" + Then delete test data + + Examples: + | data | time_1 | time_2 | local_file | + | data/iris.csv | 30 | 30 | data/exported_iris.csv | + + + Scenario Outline: Successfully creating a split dataset: + Given I create a data source 
uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a dataset extracting a sample + And I wait until the dataset is ready less than secs + When I compare the datasets' instances + Then the proportion of instances between datasets is + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | rate | + | data/iris.csv | 10 | 10 | 10 | 0.8 | + + + Scenario Outline: Successfully obtaining missing values counts: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I update the source with "" waiting less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + When I ask for the missing values counts in the fields + Then the missing values counts dict is "" + + Examples: + | data | time_1 | params | time_2 | missing_values | + | data/iris_missing.csv | 30 | {"fields": {"000000": {"optype": "numeric"}}} |30 | {"000000": 1} | + + + Scenario Outline: Successfully obtaining parsing error counts: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I update the source with "" waiting less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + When I ask for the error counts in the fields + Then the error counts dict is "" + + Examples: + | data | time_1 | params | time_2 |error_values | + | data/iris_missing.csv | 30 | {"fields": {"000000": {"optype": "numeric"}}} |30 |{"000000": 1} | + + diff --git a/src/test/resources/test_deepnet.feature b/src/test/resources/test_deepnet.feature new file mode 100755 index 0000000..67db295 --- /dev/null +++ b/src/test/resources/test_deepnet.feature @@ -0,0 +1,64 @@ +Feature: Deepnet + + Scenario Outline: Successfully comparing predictions for deepnets + Given I create a data source uploading a "" file + And I wait until the 
source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a deepnet with objective "" and params "" + And I wait until the deepnet is ready less than secs + And I create a local deepnet + When I create a deepnet prediction for "" + Then the deepnet prediction for objective "" is "" + And I create a local deepnet prediction for "" + Then the local deepnet prediction is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | params | + | data/iris.csv | 50 | 50 | 30000 | {"petal width": 4} | 000004 | Iris-virginica | {} | + | data/iris.csv | 50 | 50 | 30000 | {"sepal length": 4.1, "sepal width": 2.4} | 000004 | Iris-setosa | {} | + | data/iris_missing2.csv | 50 | 50 | 30000 | {} | 000004 | Iris-setosa | {} | + | data/grades.csv | 50 | 50 | 30000 | {} | 000005 | 42.15473 | {} | + | data/spam.csv | 50 | 50 | 30000 | {} | 000000 | ham | {} | + | data/diabetes.csv | 50 | 50 | 30000 | {} | 000008 | false | {"search": true, "number_of_model_candidates": 10, "max_training_time": 600} | + | data/diabetes.csv | 50 | 50 | 30000 | {} | 000008 | false | {"learn_residuals": true, "number_of_model_candidates": 10, "max_training_time": 600} | + + + Scenario Outline: Successfully comparing predictions for deepnets with operating point + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a deepnet with objective "" and params "" + And I wait until the deepnet is ready less than secs + And I create a local deepnet + When I create a prediction with deepnet with operating point "" for "" + Then the prediction for "" is "" + When I create a local prediction with deepnet with operating point "" for "" + Then the local deepnet prediction is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | 
data_input | objective | prediction | params | operating_point | + | data/iris.csv | 50 | 50 | 30000 | {"petal width": 4} | 000004 | Iris-versicolor | {} | {"kind": "probability", "threshold": 1, "positive_class": "Iris-virginica"} | + + + Scenario Outline: Successfully comparing predictions for deepnets with operating kind + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a deepnet with objective "" and params "" + And I wait until the deepnet is ready less than secs + And I create a local deepnet + When I create a prediction with deepnet with operating kind "" for "" + Then the prediction for "" is "" + When I create a local prediction with deepnet with operating kind "" for "" + Then the local deepnet prediction is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | params | operating_kind | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2.46} | 000004 | Iris-setosa | {} | probability | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | {} | probability | diff --git a/src/test/resources/test_ensemble.feature b/src/test/resources/test_ensemble.feature new file mode 100755 index 0000000..489b292 --- /dev/null +++ b/src/test/resources/test_ensemble.feature @@ -0,0 +1,211 @@ +Feature: Ensemble + + Scenario Outline: Successfully creating a prediction from an ensemble: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create an ensemble of models + And I wait until the ensemble is ready less than secs + When I create a prediction with ensemble for "" + And I wait until the prediction is ready less than secs + Then the prediction for "" is "" + Then delete test data + + + 
Examples: + | data | time_1 | time_2 | time_3 | time_4 | number_of_models | data_input | objective | prediction | + | data/iris.csv | 10 | 10 | 100 | 20 | 5 | {"petal width": 0.5} | 000004 | Iris-versicolor | + | data/iris_sp_chars.csv | 10 | 10 | 100 | 20 | 5 | {"pétal&width\u0000": 0.5} | 000004 | Iris-versicolor | + + + Scenario Outline: Successfully creating a prediction from an ensemble: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create an ensemble of models + And I wait until the ensemble is ready less than secs + When I create a prediction with ensemble for "" + And I wait until the prediction is ready less than secs + Then the numerical prediction for "" is + Then delete test data + + | data/grades.csv | 10 | 10 | 150 | 20 | 10 |{"Assignment": 81.22, "Tutorial": 91.95, "Midterm": 79.38, "TakeHome": 105.93} | 000005 | 84.556 | + | data/grades.csv | 10 | 10 | 150 | 20 | 10 |{"Assignment": 97.33, "Tutorial": 106.74, "Midterm": 76.88, "TakeHome": 108.89} | 000005 | 73.13558 | + + + Scenario Outline: Successfully comparing predictions for ensembles + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create an ensemble with "" + And I wait until the ensemble is ready less than secs + And I create a local ensemble + When I create a prediction with ensemble for "" + Then the prediction for "" is "" + When the local ensemble prediction for "" is "" + + Examples: + | data | time_1 | time_2 | time_3 | params | data_input | objective | prediction | + | data/iris_unbalanced.csv | 30 | 30 | 120 | {"boosting": {"iterations": 5}, "number_of_models": 5} |{"petal width": 4} | 000004 | Iris-virginica | + | data/grades.csv | 30 | 30 | 120 | {"boosting": {"iterations": 5}, "number_of_models": 
5} |{"Midterm": 20} | 000005 | 61.61036 | + + + Scenario Outline: Successfully comparing predictions for ensembles with proportional missing strategy + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create an ensemble with "" + And I wait until the ensemble is ready less than secs + And I create a local ensemble + When I create a proportional missing strategy prediction with ensemble with "" for "" + Then the prediction for "" is "" + And the confidence for the prediction is + And I create a proportional missing strategy local prediction with ensemble with "" for "" + Then the local ensemble prediction is "" + And the local ensemble confidence is + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | params | data_input | objective | prediction | options | confidence | + | data/iris.csv | 30 | 30 | 120 | {"boosting": {"iterations": 5}} | {} | 000004 | Iris-virginica | {} | 0.33784 | + | data/iris.csv | 30 | 30 | 120 | {"number_of_models": 5} | {} | 000004 | Iris-versicolor | {"operating_kind": "confidence"} | 0.2923 | + | data/grades.csv | 30 | 30 | 120 | {"number_of_models": 5} | {} | 000005 | 70.50579 | {} | 30.7161 | + | data/grades.csv | 30 | 30 | 120 | {"number_of_models": 5} | {"Midterm": 20} | 000005 | 54.82214 | {"operating_kind": "confidence"} | 25.89672 | + | data/grades.csv | 30 | 30 | 120 | {"number_of_models": 5} | {"Midterm": 20} | 000005 | 45.4573 | {} | 29.58403 | + | data/grades.csv | 30 | 30 | 120 | {"number_of_models": 5} | {"Midterm": 20, "Tutorial": 90, "TakeHome": 100} | 000005 | 42.814 | {} | 31.51804 | + + + Scenario Outline: Successfully comparing predictions for ensembles with operating point + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + 
And I create an ensemble + And I wait until the ensemble is ready less than secs + And I create a local ensemble + When I create a prediction with ensemble with operating point "" for "" + Then the prediction for "" is "" + When I create a local prediction with ensemble with operating point "" for "" + Then the local ensemble prediction is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | operating_point | + | data/iris.csv | 50 | 50 | 30000 | {"petal width": 4} | 000004 | Iris-setosa | {"kind": "probability", "threshold": 0.1, "positive_class": "Iris-setosa"} | + | data/iris.csv | 50 | 50 | 30000 | {"petal width": 4} | 000004 | Iris-virginica | {"kind": "probability", "threshold": 0.9, "positive_class": "Iris-setosa"} | + | data/iris.csv | 50 | 50 | 30000 | {"sepal length": 4.1, "sepal width": 2.4} | 000004 | Iris-setosa | {"kind": "confidence", "threshold": 0.1, "positive_class": "Iris-setosa"} | + | data/iris.csv | 50 | 50 | 30000 | {"sepal length": 4.1, "sepal width": 2.4}| 000004 | Iris-versicolor | {"kind": "confidence", "threshold": 0.9, "positive_class": "Iris-setosa"} | + + + Scenario Outline: Successfully comparing predictions for ensembles with operating kind + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create an ensemble + And I wait until the ensemble is ready less than secs + And I create a local ensemble + When I create a prediction with ensemble with operating kind "" for "" + Then the prediction for "" is "" + When I create a local prediction with ensemble with operating kind "" for "" + Then the local ensemble prediction is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | operating_kind | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2.46} | 000004 | 
Iris-versicolor | probability | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | probability | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2.46} | 000004 | Iris-versicolor | confidence | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | confidence | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2.46} | 000004 | Iris-versicolor | votes | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | votes | + + + Scenario Outline: Successfully creating a local prediction from an Ensemble: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create an ensemble of models + And I wait until the ensemble is ready less than secs + And I create a local ensemble + When the local ensemble prediction for "" is "" with confidence + #And the local probabilities are "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | number_of_models | data_input |prediction | confidence | + | data/iris.csv | 50 | 50 | 50 | 5 | {"petal width": 0.5} | Iris-versicolor | 0.3687 | ["0.3403","0.4150","0.2447"] | + + + Scenario Outline: Successfully obtaining field importance from an Ensemble: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model with "" + And I wait until the model is ready less than secs + And I create a model with "" + And I wait until the model is ready less than secs + And I create a model with "" + And I wait until the model is ready less than secs + When I create a local Ensemble with the last models + Then the field importance text is "" + Then delete test data + + Examples: + | data | time_1 | time_2 |parms1 | time_3 |parms2 | time_4 |parms3| time_5 
|number_of_models |field_importance | + | data/iris.csv | 50 | 50 |{"input_fields": ["000000", "000001","000003", "000004"]} |20 |{"input_fields": ["000000", "000001","000002", "000004"]} | 20 |{"input_fields": ["000000", "000001","000002", "000003", "000004"]} | 20 | 3 |[["000002", 0.5269933333333333], ["000003", 0.38936], ["000000", 0.04662333333333333], ["000001", 0.037026666666666666]] | + + + Scenario Outline: Successfully creating a local prediction from an Ensemble adding confidence: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create an ensemble of models + And I wait until the ensemble is ready less than secs + And I create a local ensemble + #When I create a local ensemble prediction for "" in JSON adding confidence + When the local ensemble prediction for "" is "" with confidence + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | number_of_models | data_input |prediction | confidence | + | data/iris.csv | 50 | 50 | 50 | 5 | {"petal width": 0.5} | Iris-versicolor | 0.3687 | + + + Scenario Outline: Successfully creating a local prediction from an Ensemble: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create an ensemble of models + And I wait until the ensemble is ready less than secs + And I create a local ensemble + When the local ensemble prediction using median with confidence for "" is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | number_of_models | data_input |prediction | + | data/grades.csv | 50 | 50 | 50 | 2 | {} | 69.0934 | + + + Scenario Outline: Successfully creating a local prediction from an Ensemble with max models: + Given I create a data source uploading a "" file + And I wait until the source 
is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create an ensemble of models + And I wait until the ensemble is ready less than secs + And I create a local ensemble with max models + When the local ensemble prediction for "" is "" + + Examples: + | data | time_1 | time_2 | time_3 | time_4 | number_of_models | max_models | data_input |prediction | + | data/iris.csv | 50 | 50 | 50 | 20 | 5 | 2 | {"petal width": 0.5} | Iris-versicolor | + \ No newline at end of file diff --git a/src/test/resources/test_14_create_evaluations.feature b/src/test/resources/test_evaluation.feature similarity index 63% rename from src/test/resources/test_14_create_evaluations.feature rename to src/test/resources/test_evaluation.feature index 340678a..ac2f606 100755 --- a/src/test/resources/test_14_create_evaluations.feature +++ b/src/test/resources/test_evaluation.feature @@ -1,8 +1,6 @@ -Feature: Create Evaluations - In order to create an evaluation - I need to create a model and a dataset first +Feature: Evaluation - Scenario Outline: Successfully creating an evaluation: + Scenario Outline: Successfully creating an evaluation for a model: Given I create a data source uploading a "" file And I wait until the source is ready less than secs And I create a dataset @@ -11,14 +9,14 @@ Feature: Create Evaluations And I wait until the model is ready less than secs When I create an evaluation for the model with the dataset And I wait until the evaluation is ready less than secs - Then the measured "" is + Then the measured "" is equals to Then delete test data Examples: | data | time_1 | time_2 | time_3 | time_4 | measure | value | | data/iris.csv | 50 | 50 | 50 | 50 | average_phi | 1 | - - + + Scenario Outline: Successfully creating an evaluation for an ensemble: Given I create a data source uploading a "" file And I wait until the source is ready less than secs @@ -34,8 +32,8 @@ Feature: Create Evaluations Examples: | data | time_1 
| time_2 | number_of_models | time_3 | time_4 | measure | value | | data/iris.csv | 50 | 50 | 5 | 80 | 80 | average_phi | 0.97064 | - - + + Scenario Outline: Successfully creating an evaluation for a linear regression: Given I create a data source uploading a "" file And I wait until the source is ready less than secs @@ -43,7 +41,7 @@ Feature: Create Evaluations And I wait until the dataset is ready less than secs And I create a linearregression from a dataset And I wait until the linearregression is ready less than secs - When I create an evaluation for the linear regression with the dataset + When I create an evaluation for the linearregression with the dataset And I wait until the evaluation is ready less than secs Then the measured "" is equals to Then delete test data @@ -51,3 +49,27 @@ Feature: Create Evaluations Examples: | data | time_1 | time_2 | tlp | time_3 | time_4 | measure | value | | data/iris.csv | 50 | 50 | 5 | 800 | 80 | r_squared | 0.95382 | + + + Scenario Outline: Successfully creating an evaluation for a fusion: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + And I create a model + And I wait until the model is ready less than secs + And I create a model + And I wait until the model is ready less than secs + And I create a fusion from models + And I wait until the fusion is ready less than secs + When I create an evaluation for the fusion with the dataset + And I wait until the evaluation is ready less than secs + Then the measured "" is equals to + Then I delete the fusion + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | time_4 | measure | value | + | data/iris.csv | 50 | 50 | 50 | 50 | average_phi | 1 | diff --git a/src/test/resources/test_43_linear_regression.feature 
b/src/test/resources/test_linearregression.feature similarity index 64% rename from src/test/resources/test_43_linear_regression.feature rename to src/test/resources/test_linearregression.feature index bce6968..64d69fa 100755 --- a/src/test/resources/test_43_linear_regression.feature +++ b/src/test/resources/test_linearregression.feature @@ -31,19 +31,22 @@ Feature: Linear regression | data/grades.csv | 100 | 100 | 200 | {"000000": 0.5, "000001": 1, "000002": 1, "000003": 1} | 2.27312 | - Scenario Outline: Successfully creating a batch prediction for a linear regression: + Scenario Outline: Successfully comparing predictions for linear regression Given I create a data source uploading a "" file And I wait until the source is ready less than secs And I create a dataset And I wait until the dataset is ready less than secs - And I create a linearregression from a dataset + And I create a linearregression with objective "" and params "" And I wait until the linearregression is ready less than secs - - When I create a batch prediction for the dataset with the linear regression - And I wait until the batch prediction is ready less than secs - And I download the created predictions file to "" - Then the batch prediction file "" is like "" + When I create a linearregression prediction for "" + Then the linearregression prediction is "" + And I create a local linearregression + And I create a local linearregression prediction for "" + Then the local linearregression prediction is "" + Then delete test data Examples: - | data | time_1 | time_2 | time_3 | time_4 | local_file | predictions_file | - | data/grades.csv | 30 | 30 | 50 | 50 | data/downloaded_batch_predictions_linear.csv | data/batch_predictions_linear.csv | + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | params | + | data/grades.csv | 50 | 50 | 30000 | {"000000": 1, "000001": 1, "000002": 1} | 000005 | 29.63024 | {"input_fields": ["000000", "000001", "000002"]} | + | data/iris.csv | 50 | 
50 | 30000 | {"000000": 1, "000001": 1, "000004": "Iris-virginica"} | 000003 | 1.21187 | {"input_fields": ["000000", "000001", "000004"]} | + | data/movies.csv | 50 | 50 | 30000 | {"000007": "Action"} | 000009 | 4.33333 | {"input_fields": ["000007"]} | diff --git a/src/test/resources/test_05_compare_predictions.feature b/src/test/resources/test_logisticregression.feature similarity index 63% rename from src/test/resources/test_05_compare_predictions.feature rename to src/test/resources/test_logisticregression.feature index 7742ee2..6d16427 100755 --- a/src/test/resources/test_05_compare_predictions.feature +++ b/src/test/resources/test_logisticregression.feature @@ -1,120 +1,21 @@ -Feature: Create Predictions - In order to compare a remote prediction with a local prediction - I need to create a model first - Then I need to create a local model +Feature: Logistic Regressions + Scenario Outline: Successfully creating a logisticregression from a dataset: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a logisticregression from a dataset + And I wait until the logisticregression is ready less than secs + And I update the logisticregression name to "" + When I wait until the logisticregression is ready less than secs + Then the logisticregression name is "" + Then delete test data - Scenario Outline: Successfully comparing predictions: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - And I create a local model - When I create a prediction for "" - Then the prediction for "" is "" - Then the local prediction for "" is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | 
objective | prediction | - | data/iris.csv | 50 | 50 | 50 | {"petal width": 0.5} | 000004 | Iris-setosa | - | data/iris.csv | 50 | 50 | 50 | {"petal length": 6, "petal width": 2} | 000004 | Iris-virginica | - | data/iris.csv | 50 | 50 | 50 | {"petal length": 4, "petal width": 1.5}| 000004 | Iris-versicolor | - | data/iris_sp_chars.csv | 50 | 50 | 50 | {"pétal.length": 4, "pétal&width\u0000": 1.5}| 000004 | Iris-versicolor | - - - Scenario Outline: Successfully comparing predictions with text options: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I update the source with "" waiting less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - And I create a local model - When I create a prediction for "" - Then the prediction for "" is "" - And the local prediction for "" is "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | options | data_input | objective | prediction | - | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": true, "stem_words": true, "use_stopwords": false, "language": "en"}}}} |{"Message": "Mobile call"} | 000000 | spam | - | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": true, "stem_words": true, "use_stopwords": false, "language": "en"}}}} |{"Message": "A normal message"} | 000000 | ham | - | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": false, "stem_words": false, "use_stopwords": false, "language": "en"}}}} |{"Message": "Mobile calls"} | 000000 | spam | - | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": false, "stem_words": false, "use_stopwords": false, "language": "en"}}}} |{"Message": "A normal message"} 
| 000000 | ham | - | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": false, "stem_words": true, "use_stopwords": true, "language": "en"}}}}|{"Message": "Mobile call"} | 000000 | spam | - | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": false, "stem_words": true, "use_stopwords": true, "language": "en"}}}} |{"Message": "A normal message"} | 000000 | ham | - | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"token_mode": "full_terms_only", "language": "en"}}}} |{"Message": "FREE for 1st week! No1 Nokia tone 4 ur mob every week just txt NOKIA to 87077 Get txting and tell ur mates. zed POBox 36504 W45WQ norm150p/tone 16+"} | 000000 | spam | - | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"token_mode": "full_terms_only", "language": "en"}}}} |{"Message": "Ok"} | 000000 | ham | - #| data/text_missing.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"token_mode": "all", "language": "en"}}, "000000": {"optype": "text", "term_analysis": {"token_mode": "all", "language": "en"}}}} |{} | 000003 | swap | - - - Scenario Outline: Successfully comparing predictions with proportional missing strategy: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - And I create a local model - When I create a proportional missing strategy prediction for "" - Then the prediction for "" is "" - And the confidence for the prediction is - And I create a local model - Then the proportional missing strategy local prediction for "" is "" - Then the confidence of the proportional missing strategy local prediction for "" is - Then delete test data - - Examples: 
- | data | time_1 | time_2 | time_3 | data_input | objective | prediction | confidence | - | data/iris.csv | 50 | 50 | 50 | {} | 000004 | Iris-setosa | 0.2629 | - - - Scenario Outline: Successfully comparing predictions with proportional missing strategy: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - And I create a local model - When I create a proportional missing strategy prediction for "" - Then the numerical prediction for "" is - And the confidence for the prediction is - Then the numerical prediction of proportional missing strategy local prediction for "" is - Then the confidence of the proportional missing strategy local prediction for "" is - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | confidence | - | data/grades.csv | 50 | 50 | 50 | {} | 000005 | 68.62224 | 27.5358 | - | data/grades.csv | 50 | 50 | 50 | {"Midterm": 20} | 000005 | 40.46667 | 54.89713 | - | data/grades.csv | 50 | 50 | 50 | {"Midterm": 20, "Tutorial": 90, "TakeHome": 500} | 000005 | 28.06 | 25.65806 | - - - Scenario Outline: Successfully comparing predictions with proportional missing strategy for missing_splits models: - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model with missing splits - And I wait until the model is ready less than secs - And I create a local model - When I create a proportional missing strategy prediction for "" - Then the prediction for "" is "" - And the confidence for the prediction is - And the proportional missing strategy local prediction for "" is "" - And the confidence of the proportional missing strategy local prediction for 
"" is - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | data_input | objective | prediction | confidence | - | data/iris_missing2.csv | 50 | 50 | 50 | {"petal width": 1} | 000004 | Iris-setosa | 0.8064 | - | data/iris_missing2.csv | 50 | 50 | 50 | {"petal width": 1, "petal length": 4} | 000004 | Iris-versicolor | 0.7847 | - + Examples: + | data | time_1 | time_2 | time_3 | time_4 | logisticregression_name | + | data/iris.csv | 10 | 10 | 20 | 20 | my new logisticregression name | + Scenario Outline: Successfully comparing logistic regression predictions Given I create a data source uploading a "" file @@ -277,4 +178,24 @@ Feature: Create Predictions Examples: | data | time_1 | time_2 | time_3 | data_input | prediction | options | | data/constant_field.csv | 50 | 50 | 50 | {"a": 1, "b": 1, "c": 1} | a | {"fields": {"000000": {"preferred": true}}} | + + + Scenario Outline: Successfully comparing predictions for logistic regressions with operating kind + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a logisticregression from a dataset + And I wait until the logisticregression is ready less than secs + And I create a local logisticregression + When I create a prediction with logisticregression with operating kind "" for "" + Then the prediction for "" is "" + When I create a local prediction with logisticregression with operating kind "" for "" + Then the local logisticregression prediction is "" + Then delete test data + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | operating_kind | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 5} | 000004 | Iris-versicolor | probability | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | probability | + \ No newline at end of file diff --git 
a/src/test/resources/test_model.feature b/src/test/resources/test_model.feature new file mode 100755 index 0000000..97072c0 --- /dev/null +++ b/src/test/resources/test_model.feature @@ -0,0 +1,409 @@ +Feature: Model + + Scenario Outline: Successfully changing duplicated field names: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset with "" + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + And I create a local model + Then "" field's name is changed to "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | options | field_id | new_name | + | data/iris.csv | 20 | 20 | 30 | {"tags": ["unitTest"], "fields": {"000001": {"name": "species"}}} | 000001 | species1 | + | data/iris.csv | 20 | 20 | 30 | {"tags": ["unitTest"], "fields": {"000001": {"name": "petal width"}}} | 000001 | petal width3 | + + + Scenario Outline: Successfully computing predictions combinations: + Given I create a MultiVote for the set of predictions in file + When I compute the prediction with confidence using method "" + Then the combined prediction is "" + And the confidence for the combined prediction is + Then delete test data + + Examples: + | predictions | method | prediction | confidence | + | data/predictions_c.json| 0 | a | 0.450471270879 | + | data/predictions_c.json| 1 | a | 0.552021302649 | + | data/predictions_c.json| 2 | a | 0.40363 | + + + Scenario Outline: Successfully computing predictions combinations: + Given I create a MultiVote for the set of predictions in file + When I compute the prediction with confidence using method "" + Then the numerical combined prediction is + And the confidence for the combined prediction is + Then delete test data + + Examples: + | predictions | method | prediction | confidence | + | data/predictions_r.json| 0 | 1.55555556667 | 0.400079152063 | + | 
data/predictions_r.json| 1 | 1.59376845074 | 0.248366474212 | + | data/predictions_r.json| 2 | 1.55555556667 | 0.400079152063 | + + + Scenario Outline: Successfully creating a prediction: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + When I create a prediction for "" + Then the prediction for "" is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | + | data/iris.csv | 10 | 10 | 10 | {"petal width": 0.5} | 000004 | Iris-setosa | + | data/iris_sp_chars.csv | 10 | 10 | 10 | {"pétal&width\u0000": 0.5} | 000004 | Iris-setosa | + + + Scenario Outline: Successfully creating a prediction from a source in a remote location: + Given I create a data source using the url "" + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + When I create a prediction for "" + Then the prediction for "" is "" + Then delete test data + + Examples: + | url | time_1 | time_2 | time_3 | data_input | objective | prediction | + | s3://bigml-public/csv/iris.csv | 10 | 10 | 10 | {"petal width": 0.5} | 000004 | Iris-setosa | + + + Scenario Outline: Successfully creating a prediction from inline data source: + Given I create a data source from inline data slurped from "" + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + When I create a prediction for "" + Then the prediction for "" is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | + | 
data/iris.csv | 10 | 10 | 10 | {"petal width": 0.5} | 000004 | Iris-setosa | + + + Scenario Outline: Successfully creating a prediction from a local model in a json file: + Given I create a local model from a "" file + And the local prediction for "" is "" + And the confidence of the local prediction for "" is + Then delete test data + + Examples: + | model | data_input | prediction | confidence | + | data/iris_model.json | {"petal length": 0.5} | Iris-setosa | 0.90594 | + + + Scenario Outline: Successfully creating a multiple prediction from a local model in a json file: + Given I create a local model from a "" file + And the multiple local prediction for "" is "" + Then delete test data + + Examples: + | model | data_input | prediction | + | data/iris_model.json | {"petal length": 3} | [{"probability":0.5060240963855421,"confidence":0.4006020980792863,"prediction":"Iris-versicolor","count":42},{"probability":0.4939759036144578,"confidence":0.3890868795664999,"prediction":"Iris-virginica","count":41}] | + + + + Scenario Outline: Successfully creating a prediction from local model + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + And I create a local model + Then the local prediction for "" is "" + Then the local prediction for "" is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | objective1 | prediction1 | objective2 | prediction2 | + | data/iris.csv | 15 | 15 | 15 | {"petal width": 0.5} | Iris-setosa | {"000003": 0.5} | Iris-setosa | + + + Scenario Outline: Successfully comparing predictions: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the 
model is ready less than secs + And I create a local model + When I create a prediction for "" + Then the prediction for "" is "" + Then the local prediction for "" is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | + | data/iris.csv | 50 | 50 | 50 | {"petal width": 0.5} | 000004 | Iris-setosa | + | data/iris.csv | 50 | 50 | 50 | {"petal length": 6, "petal width": 2} | 000004 | Iris-virginica | + | data/iris.csv | 50 | 50 | 50 | {"petal length": 4, "petal width": 1.5}| 000004 | Iris-versicolor | + | data/iris_sp_chars.csv | 50 | 50 | 50 | {"pétal.length": 4, "pétal&width\u0000": 1.5}| 000004 | Iris-versicolor | + + + Scenario Outline: Successfully comparing predictions with text options: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I update the source with "" waiting less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + And I create a local model + When I create a prediction for "" + Then the prediction for "" is "" + And the local prediction for "" is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | options | data_input | objective | prediction | + | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": true, "stem_words": true, "use_stopwords": false, "language": "en"}}}} |{"Message": "Mobile call"} | 000000 | spam | + | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": true, "stem_words": true, "use_stopwords": false, "language": "en"}}}} |{"Message": "A normal message"} | 000000 | ham | + | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": false, "stem_words": false, "use_stopwords": false, "language": "en"}}}} 
|{"Message": "Mobile calls"} | 000000 | spam | + | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": false, "stem_words": false, "use_stopwords": false, "language": "en"}}}} |{"Message": "A normal message"} | 000000 | ham | + | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": false, "stem_words": true, "use_stopwords": true, "language": "en"}}}}|{"Message": "Mobile call"} | 000000 | spam | + | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": false, "stem_words": true, "use_stopwords": true, "language": "en"}}}} |{"Message": "A normal message"} | 000000 | ham | + | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"token_mode": "full_terms_only", "language": "en"}}}} |{"Message": "FREE for 1st week! No1 Nokia tone 4 ur mob every week just txt NOKIA to 87077 Get txting and tell ur mates. 
zed POBox 36504 W45WQ norm150p/tone 16+"} | 000000 | spam | + | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"token_mode": "full_terms_only", "language": "en"}}}} |{"Message": "Ok"} | 000000 | ham | + #| data/text_missing.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"token_mode": "all", "language": "en"}}, "000000": {"optype": "text", "term_analysis": {"token_mode": "all", "language": "en"}}}} |{} | 000003 | swap | + + + Scenario Outline: Successfully comparing predictions with proportional missing strategy: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + And I create a local model + When I create a proportional missing strategy prediction for "" + Then the prediction for "" is "" + And the confidence for the prediction is + And I create a local model + Then the proportional missing strategy local prediction for "" is "" + Then the confidence of the proportional missing strategy local prediction for "" is + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | confidence | + | data/iris.csv | 50 | 50 | 50 | {} | 000004 | Iris-setosa | 0.2629 | + + + Scenario Outline: Successfully comparing predictions with proportional missing strategy: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + And I create a local model + When I create a proportional missing strategy prediction for "" + Then the numerical prediction for "" is + And the confidence for the prediction is + Then the numerical prediction of proportional 
missing strategy local prediction for "" is + Then the confidence of the proportional missing strategy local prediction for "" is + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | confidence | + | data/grades.csv | 50 | 50 | 50 | {} | 000005 | 68.62224 | 27.5358 | + | data/grades.csv | 50 | 50 | 50 | {"Midterm": 20} | 000005 | 40.46667 | 54.89713 | + | data/grades.csv | 50 | 50 | 50 | {"Midterm": 20, "Tutorial": 90, "TakeHome": 500} | 000005 | 28.06 | 25.65806 | + + + Scenario Outline: Successfully comparing predictions with proportional missing strategy for missing_splits models: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model with missing splits + And I wait until the model is ready less than secs + And I create a local model + When I create a proportional missing strategy prediction for "" + Then the prediction for "" is "" + And the confidence for the prediction is + And the proportional missing strategy local prediction for "" is "" + And the confidence of the proportional missing strategy local prediction for "" is + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | confidence | + | data/iris_missing2.csv | 50 | 50 | 50 | {"petal width": 1} | 000004 | Iris-setosa | 0.8064 | + | data/iris_missing2.csv | 50 | 50 | 50 | {"petal width": 1, "petal length": 4} | 000004 | Iris-versicolor | 0.7847 | + + + Scenario Outline: Successfully comparing predictions for models with operating point + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + And I create a local model + When I 
create a prediction with model with operating point "" for "" + Then the prediction for "" is "" + When I create a local prediction with model with operating point "" for "" + Then the local model prediction is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | operating_point | + | data/iris.csv | 50 | 50 | 30000 | {"petal width": 4} | 000004 | Iris-setosa | {"kind": "probability", "threshold": 0.1, "positive_class": "Iris-setosa"} | + | data/iris.csv | 50 | 50 | 30000 | {"petal width": 4} | 000004 | Iris-versicolor | {"kind": "probability", "threshold": 0.9, "positive_class": "Iris-setosa"} | + | data/iris.csv | 50 | 50 | 30000 | {"sepal length": 4.1, "sepal width": 2.4} | 000004 | Iris-setosa | {"kind": "confidence", "threshold": 0.1, "positive_class": "Iris-setosa"} | + | data/iris.csv | 50 | 50 | 30000 | {"sepal length": 4.1, "sepal width": 2.4}| 000004 | Iris-versicolor | {"kind": "confidence", "threshold": 0.9, "positive_class": "Iris-setosa"} | + + + Scenario Outline: Successfully comparing predictions for models with operating kind + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + And I create a local model + When I create a prediction with model with operating kind "" for "" + Then the prediction for "" is "" + When I create a local prediction with model with operating kind "" for "" + Then the local model prediction is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | operating_kind | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2.46, "sepal length": 5} | 000004 | Iris-versicolor | probability | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2.46, "sepal length": 5} | 000004 | Iris-versicolor | 
confidence | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | probability | + | data/iris.csv | 50 | 50 | 30000 | {"petal length": 2} | 000004 | Iris-setosa | confidence | + + + Scenario Outline: Successfully creating a model from a dataset list: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I store the dataset id in a list + And I create a dataset + And I wait until the dataset is ready less than secs + And I store the dataset id in a list + Then I create a model from a dataset list + And I wait until the model is ready less than secs + And I check the model stems from the original dataset list + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | time_4 | + | data/iris.csv | 30 | 30 | 30 | 30 | + + + Scenario Outline: Successfully creating a model from a dataset list and predicting with it using median: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + Then I create a model + And I wait until the model is ready less than secs + And I create a local multi model + And I create a local mm median batch prediction using "" with prediction + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | input_data | prediction | + | data/grades.csv | 30 | 30 | 30 | {"Tutorial": 99.47, "Midterm": 53.12, "TakeHome": 87.96} | 63.33 | + + + Scenario Outline: Successfully creating a prediction from a multi model: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model with "" + And I wait until the model is ready less than secs + And I create a model with "" + And I wait until the 
model is ready less than secs + And I create a model with "" + And I wait until the model is ready less than secs + And I retrieve a list of remote models tagged with "" + And I create a local multi model + Then the local multi prediction for "" is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | params | tag | data_input | prediction | + | data/iris.csv | 10 | 10 | 10 | {"tags":["mytag"]} | mytag | {"petal width": 0.5} | Iris-setosa | + + + Scenario Outline: Successfully creating a local batch prediction from a multi model: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model with "" + And I wait until the model is ready less than secs + And I create a model with "" + And I wait until the model is ready less than secs + And I create a model with "" + And I wait until the model is ready less than secs + And I retrieve a list of remote models tagged with "" + And I create a local multi model + Then I create a batch multimodel prediction for "" and predictions "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | params | tag | data_inputs | predictions | + | data/iris.csv | 10 | 10 | 10 | {"tags":["mytag"]} | mytag | [{"petal width": 0.5}, {"petal length": 6, "petal width": 2}] | ["Iris-setosa", "Iris-virginica"] | + + + Scenario Outline: Successfully creating a prediction using a public model: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + And I make the model public + And I wait until the model is ready less than secs + And I check the model status using the model's public url + When I create a prediction for "" + Then the prediction for "" is "" 
+ Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | + | data/iris.csv | 10 | 10 | 10 | {"petal width": 0.5} | 000004 | Iris-setosa | + + + Scenario Outline: Successfully creating a prediction using a shared model: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a model + And I wait until the model is ready less than secs + And I make the model shared + And I wait until the model is ready less than secs + And I get the model sharing info + And I check the model status using the model's shared url + And I check the model status using the model's shared key + And I create a local model + When I create a prediction for "" + Then the prediction for "" is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | data_input | objective | prediction | + | data/iris.csv | 10 | 10 | 10 | {"petal width": 0.5} | 000004 | Iris-setosa | + + \ No newline at end of file diff --git a/src/test/resources/test_39_optiml_fusion.feature b/src/test/resources/test_optiml_fusion.feature similarity index 81% rename from src/test/resources/test_39_optiml_fusion.feature rename to src/test/resources/test_optiml_fusion.feature index 0b7b091..ab3974d 100755 --- a/src/test/resources/test_39_optiml_fusion.feature +++ b/src/test/resources/test_optiml_fusion.feature @@ -35,40 +35,13 @@ Feature: Testing REST api calls Then the fusion name is "" When I create a prediction for "" Then the prediction for "" is "" - When I create an evaluation for the fusion with the dataset - And I wait until the evaluation is ready less than secs - Then the measured "" is Then I delete the fusion Then delete test data Examples: - | data | time_1 | time_2 | time_3 | time_4 | fusion_name | data_input | objective | prediction | measure | value | - | data/iris.csv | 50 | 50 | 50 | 50 | my new 
fusion name | {"petal width": 1.75, "petal length": 2.45} | 000004 | Iris-setosa | average_phi | 1 | - - - Scenario Outline: Successfully creating a fusion - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a model - And I wait until the model is ready less than secs - And I create a model - And I wait until the model is ready less than secs - And I create a model - And I wait until the model is ready less than secs - And I create a fusion from models - And I wait until the fusion is ready less than secs - When I create a batch prediction for the dataset with the fusion - And I wait until the batch prediction is ready less than secs - And I download the created predictions file to "" - Then the batch prediction file "" is like "" - Then delete test data - - Examples: - | data | time_1 | time_2 | time_3 | time_4 | local_file | predictions_file | - | data/iris.csv | 50 | 50 | 50 | 50 | data/downloaded_batch_predictions.csv | data/batch_predictions_fs.csv | - + | data | time_1 | time_2 | time_3 | time_4 | fusion_name | data_input | objective | prediction | + | data/iris.csv | 50 | 50 | 50 | 50 | my new fusion name | {"petal width": 1.75, "petal length": 2.45} | 000004 | Iris-setosa | + Scenario Outline: Successfully creating fusion from models: Given I create a data source uploading a "" file diff --git a/src/test/resources/test_38_organization.feature b/src/test/resources/test_organization.feature similarity index 96% rename from src/test/resources/test_38_organization.feature rename to src/test/resources/test_organization.feature index 7cf73d1..3776e40 100755 --- a/src/test/resources/test_38_organization.feature +++ b/src/test/resources/test_organization.feature @@ -1,6 +1,6 @@ @beforeOrganizationScenario @afterOganizationScenario -Feature: Testing Organizations +Feature: Organizations Scenario Outline: Successfully 
creating a prediction in an organization: Given I create a data source uploading a "" file diff --git a/src/test/resources/test_pca.feature b/src/test/resources/test_pca.feature new file mode 100755 index 0000000..9e86d01 --- /dev/null +++ b/src/test/resources/test_pca.feature @@ -0,0 +1,76 @@ +Feature: PCA + + Scenario Outline: Successfully creating a PCA from a dataset: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a pca from a dataset + And I wait until the pca is ready less than secs + And I update the pca name to "" + When I wait until the pca is ready less than secs + Then the pca name is "" + Then I delete the pca + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | time_4 | pca_name | + | data/iris.csv | 50 | 50 | 100 | 100 | my new pca name | + + + Scenario Outline: Successfully creating a projection: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a pca from a dataset + And I wait until the pca is ready less than secs + When I create a projection for "" + Then the projection is "" + Then delete test data + + Examples: + | data | time_1 | time_2 | time_3 | time_4 | data_input | projection | + | data/iris.csv | 50 | 50 | 100 | 100 | {"petal width": 0.5} | {"PC2": 0.1593, "PC3": -0.01286, "PC1": 0.91648, "PC6": 0.27284, "PC4": 1.29255, "PC5": 0.75196} | + + + Scenario Outline: Successfully comparing projections for PCAs + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a pca with "" + And I wait until the pca is ready less than secs + And I create a local pca + When I create a projection 
for "" + Then the projection is "" + And I create a local projection for "" + Then the local projection is "" + + Examples: + | data | time_1 | time_2 | time_3 | data_input | projection | params | + | data/iris.csv | 50 | 50 | 120 | {} | {"PC2": 0, "PC3": 0, "PC1": 0, "PC6": 0, "PC4": 5e-05, "PC5": 0} | {} | + | data/iris.csv | 50 | 50 | 120 | {"petal length": 1} | {"PC2": 0.08708, "PC3": 0.20929, "PC1": 1.56084, "PC6": -1.34463, "PC4": 0.7295, "PC5": -1.00876} | {} | + | data/iris.csv | 50 | 50 | 120 | {"species": "Iris-versicolor"} | {"PC2": 1.8602, "PC3": -2.00864, "PC1": -0.61116, "PC6": -0.66983, "PC4": -2.44618, "PC5": 0.43414} | {} | + | data/iris.csv | 50 | 50 | 120 | {"petal length": 1, "sepal length": 0, "petal width": 0, "sepal width": 0, "species": "Iris-versicolor"} | {"PC2": 7.18009, "PC3": 6.51511, "PC1": 2.78155, "PC6": 0.21372, "PC4": -1.94865, "PC5": 0.57646} | {} | + + + Scenario Outline: Successfully comparing projections for PCAs with text options + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I update the source with "" waiting less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a pca with "" + And I wait until the pca is ready less than secs + When I create a projection for "" + Then the projection is "" + And I create a local pca + And I create a local projection for "" + Then the local projection is "" + + Examples: + | data | time_1 | time_2 | time_3 | options | data_input | projection | params | + | data/spam_tiny.csv | 120 | 120 | 120 | {"fields": {"000001": {"optype": "text", "term_analysis": {"token_mode": "all"}}}} | {"Message": "early"} | {"PC40": 0.00416, "PC38": 0.08267, "PC39": 0.00033, "PC18": 0.28094, "PC19": -0.15056, "PC14": 0.20643, "PC15": 0.23931, "PC16": 0.03251, "PC17": 0.02776, "PC10": 0.1424, "PC11": 0.4059, "PC12": -0.1238, "PC13": 0.15131, "PC43": 0.29617, "PC42": 1.0091, "PC41": 0, "PC25": 
0.07164, "PC24": -0.29904, "PC27": -0.1331, "PC26": -0.18572, "PC21": 0.25616, "PC20": 0.30424, "PC23": -0.45775, "PC22": -0.3362, "PC47": -0.13757, "PC49": 0.01864, "PC48": 0.04742, "PC29": -0.16286, "PC28": 0.42207, "PC32": -0.05917, "PC46": -0.05018, "PC31": -0.13973, "PC45": -0.05015, "PC36": 0.03017, "PC44": 0, "PC37": -0.06093, "PC34": 0.25821, "PC35": -0.22194, "PC33": -0.23398, "PC8": 0.01159, "PC9": -0.16042, "PC2": -0.09202, "PC3": 0.14371, "PC1": 0.65114, "PC6": -0.43034, "PC7": -0.02563, "PC4": -0.04947, "PC5": -0.07796, "PC50": -0.00769, "PC30": 0.07813} | {} | + | data/spam_tiny.csv | 120 | 120 | 120 | {"fields": {"000001": {"optype": "text", "term_analysis": {"token_mode": "all"}}}} | {"Message": "mobile call"} | {"PC40": 0.31818, "PC38": 0.06912, "PC39": -0.14342, "PC18": 0.22382, "PC19": 0.18518, "PC14": 0.89231, "PC15": 0.05046, "PC16": -0.00241, "PC17": 0.54501, "PC10": -0.26463, "PC11": 0.30251, "PC12": 1.16327, "PC13": 0.16973, "PC43": 0.11952, "PC42": 1.05499, "PC41": 0.51263, "PC25": 0.02467, "PC24": -0.65128, "PC27": 0.48916, "PC26": -0.45228, "PC21": -0.44167, "PC20": 0.76896, "PC23": 0.29398, "PC22": 0.06425, "PC47": 0.70416, "PC49": -0.30313, "PC48": 0.12976, "PC29": -0.34, "PC28": 0.17406, "PC32": -0.06411, "PC46": 0.69257, "PC31": 0.07523, "PC45": -0.03461, "PC36": 0.29732, "PC44": 0.14516, "PC37": -0.19109, "PC34": 0.58399, "PC35": 0.37608, "PC33": -0.00378, "PC8": -0.88156, "PC9": 0.38233, "PC2": -0.56685, "PC3": 0.56321, "PC1": 0.49171, "PC6": -0.09854, "PC7": -1.24639, "PC4": 1.50134, "PC5": -0.03161, "PC50": 0.17349, "PC30": -1.29612} | {} | + \ No newline at end of file diff --git a/src/test/resources/test_21_projects.feature b/src/test/resources/test_project.feature similarity index 85% rename from src/test/resources/test_21_projects.feature rename to src/test/resources/test_project.feature index f3a06a9..94c2eb1 100755 --- a/src/test/resources/test_21_projects.feature +++ b/src/test/resources/test_project.feature @@ -1,6 +1,4 @@ 
-Feature: Testing projects REST api calls - In order to test the project API - I need to create a project +Feature: Project REST api calls Scenario Outline: Successfully creating a project: Given I create a project with "" diff --git a/src/test/resources/test_16_sample_dataset.feature b/src/test/resources/test_sample_dataset.feature similarity index 96% rename from src/test/resources/test_16_sample_dataset.feature rename to src/test/resources/test_sample_dataset.feature index e1ca16b..282179a 100755 --- a/src/test/resources/test_16_sample_dataset.feature +++ b/src/test/resources/test_sample_dataset.feature @@ -1,6 +1,4 @@ Feature: Download the sample to filesystem - In order to download a sample - I need to create a sample Scenario Outline: Successfully creating a sample from a dataset: Given I create a data source uploading a "" file diff --git a/src/test/resources/test_26_statistical_test.feature b/src/test/resources/test_statisticaltest.feature similarity index 100% rename from src/test/resources/test_26_statistical_test.feature rename to src/test/resources/test_statisticaltest.feature diff --git a/src/test/resources/test_35_compare_predictions.feature b/src/test/resources/test_timeseries.feature old mode 100644 new mode 100755 similarity index 64% rename from src/test/resources/test_35_compare_predictions.feature rename to src/test/resources/test_timeseries.feature index 344f0bd..d5ad577 --- a/src/test/resources/test_35_compare_predictions.feature +++ b/src/test/resources/test_timeseries.feature @@ -1,6 +1,24 @@ -Feature: Testing - - Scenario Outline: Successfully comparing forecasts for Timeseries +Feature: TimeSeries + + Scenario Outline: Successfully creating forecasts from a dataset: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create a time series from a dataset + And I wait until the time series is ready less 
than secs + And I update the time series name to "" + When I wait until the time series is ready less than secs + Then the time series name is "" + And I create a forecast for "" + Then the forecasts are "" + + Examples: + | data | time_1 | time_2 | time_3 | time_4 | time_series_name | input_data | forecast_points | + | data/grades.csv | 100 | 100 | 200 | 500 | my new time series name | {"000005": {"horizon": 5}} | {"000005": [{"point_forecast": [73.96192, 74.04106, 74.12029, 74.1996, 74.27899], "model": "M,M,N"}]} | + + + Scenario Outline: Successfully comparing forecasts Given I create a data source uploading a "" file And I wait until the source is ready less than secs And I create a dataset @@ -21,7 +39,7 @@ Feature: Testing | data/grades.csv | 50 | 50 | 30000 | {"000005": {"horizon": 5}, "000001": {"horizon": 3, "ets_models": {"criterion": "aic", "limit": 2}}} | {"000005": [{"point_forecast": [73.96192, 74.04106, 74.12029, 74.1996, 74.27899], "model": "M,M,N"}], "000001": [{"point_forecast": [55.51577, 89.69111, 82.04935], "model": "A,N,A"}, {"point_forecast": [56.67419, 91.89657, 84.70017], "model": "A,A,A"}]} | {"objective_fields": ["000001", "000005"]} | - Scenario Outline: Successfully comparing forecasts for Timeseries with seasonality "A" + Scenario Outline: Successfully comparing forecasts with seasonality "A" Given I create a data source uploading a "" file And I wait until the source is ready less than secs And I create a dataset @@ -40,7 +58,8 @@ Feature: Testing | data/grades.csv | 50 | 50 | 30000 | {"000005": {"horizon": 5, "ets_models": {"names": ["M,N,A"], "criterion": "aic", "limit": 3}}} | {"000005": [{"point_forecast": [67.43222, 68.24468, 64.14437, 67.5662, 67.79028], "model": "M,N,A"}]} | {"objective_fields": ["000001", "000005"], "period" : 12 } | | data/grades.csv | 50 | 50 | 30000 | {"000005": {"horizon": 5, "ets_models": {"names": ["A,A,A"], "criterion": "aic", "limit": 3}}} | {"000005": [{"point_forecast": [74.73553, 71.6163, 71.90264, 
76.4249, 75.06982], "model": "A,A,A"}]} | {"objective_fields": ["000001", "000005"], "period" : 12 } | - Scenario Outline: Successfully comparing forecasts for Timeseries with seasonality "M" + + Scenario Outline: Successfully comparing forecasts with seasonality "M" Given I create a data source uploading a "" file And I wait until the source is ready less than secs And I create a dataset @@ -60,7 +79,7 @@ Feature: Testing | data/grades.csv | 50 | 50 | 30000 | {"000005": {"horizon": 5, "ets_models": {"names": ["M,M,M"], "criterion": "aic", "limit": 3}}} | {"000005": [{"point_forecast": [71.75055, 80.67195, 70.81368, 79.84999, 78.27634], "model": "M,M,M"}]} | {"objective_fields": ["000001", "000005"], "period": 12 } | - Scenario Outline: Successfully comparing forecasts for Timeseries with trivial models + Scenario Outline: Successfully comparing forecasts with trivial models Given I create a data source uploading a "" file And I wait until the source is ready less than secs And I create a dataset @@ -81,44 +100,4 @@ Feature: Testing | data/grades.csv | 50 | 50 | 30000 | {"000005": {"horizon": 5, "ets_models": {"names": ["mean"]}}} | {"000005": [{"point_forecast": [68.45974, 68.45974, 68.45974, 68.45974, 68.45974], "model": "mean"}]} | {"objective_fields": ["000001", "000005"], "period": 1} | | data/grades.csv | 50 | 50 | 30000 | {"000005": {"horizon": 5, "ets_models": {"names": ["drift"]}}} | {"000005": [{"point_forecast": [61.50545, 61.6209, 61.73635, 61.8518, 61.96725], "model": "drift"}]} | {"objective_fields": ["000001", "000005"], "period": 1} | | data/grades.csv | 50 | 50 | 30000 | {"000005": {"horizon": 5, "ets_models": {"names": ["drift"]}}} | {"000005": [{"point_forecast": [61.50545, 61.6209, 61.73635, 61.8518, 61.96725], "model": "drift"}]} | {"objective_fields": ["000001", "000005"], "period": 2} | - - - Scenario Outline: Successfully comparing projections for PCAs - Given I create a data source uploading a "" file - And I wait until the source is ready 
less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a pca with "" - And I wait until the pca is ready less than secs - And I create a local pca - When I create a projection for "" - Then the projection is "" - And I create a local projection for "" - Then the local projection is "" - - Examples: - | data | time_1 | time_2 | time_3 | data_input | projection | params | - | data/iris.csv | 50 | 50 | 120 | {} | {"PC2": 0, "PC3": 0, "PC1": 0, "PC6": 0, "PC4": 5e-05, "PC5": 0} | {} | - | data/iris.csv | 50 | 50 | 120 | {"petal length": 1} | {"PC2": 0.08708, "PC3": 0.20929, "PC1": 1.56084, "PC6": -1.34463, "PC4": 0.7295, "PC5": -1.00876} | {} | - | data/iris.csv | 50 | 50 | 120 | {"species": "Iris-versicolor"} | {"PC2": 1.8602, "PC3": -2.00864, "PC1": -0.61116, "PC6": -0.66983, "PC4": -2.44618, "PC5": 0.43414} | {} | - | data/iris.csv | 50 | 50 | 120 | {"petal length": 1, "sepal length": 0, "petal width": 0, "sepal width": 0, "species": "Iris-versicolor"} | {"PC2": 7.18009, "PC3": 6.51511, "PC1": 2.78155, "PC6": 0.21372, "PC4": -1.94865, "PC5": 0.57646} | {} | - - - Scenario Outline: Successfully comparing projections for PCAs with text options - Given I create a data source uploading a "" file - And I wait until the source is ready less than secs - And I update the source with "" waiting less than secs - And I create a dataset - And I wait until the dataset is ready less than secs - And I create a pca with "" - And I wait until the pca is ready less than secs - When I create a projection for "" - Then the projection is "" - And I create a local pca - And I create a local projection for "" - Then the local projection is "" - - Examples: - | data | time_1 | time_2 | time_3 | options | data_input | projection | params | - | data/spam_tiny.csv | 120 | 120 | 120 | {"fields": {"000001": {"optype": "text", "term_analysis": {"token_mode": "all"}}}} | {"Message": "early"} | {"PC40": 0.00416, "PC38": 0.08267, "PC39": 
0.00033, "PC18": 0.28094, "PC19": -0.15056, "PC14": 0.20643, "PC15": 0.23931, "PC16": 0.03251, "PC17": 0.02776, "PC10": 0.1424, "PC11": 0.4059, "PC12": -0.1238, "PC13": 0.15131, "PC43": 0.29617, "PC42": 1.0091, "PC41": 0, "PC25": 0.07164, "PC24": -0.29904, "PC27": -0.1331, "PC26": -0.18572, "PC21": 0.25616, "PC20": 0.30424, "PC23": -0.45775, "PC22": -0.3362, "PC47": -0.13757, "PC49": 0.01864, "PC48": 0.04742, "PC29": -0.16286, "PC28": 0.42207, "PC32": -0.05917, "PC46": -0.05018, "PC31": -0.13973, "PC45": -0.05015, "PC36": 0.03017, "PC44": 0, "PC37": -0.06093, "PC34": 0.25821, "PC35": -0.22194, "PC33": -0.23398, "PC8": 0.01159, "PC9": -0.16042, "PC2": -0.09202, "PC3": 0.14371, "PC1": 0.65114, "PC6": -0.43034, "PC7": -0.02563, "PC4": -0.04947, "PC5": -0.07796, "PC50": -0.00769, "PC30": 0.07813} | {} | - | data/spam_tiny.csv | 120 | 120 | 120 | {"fields": {"000001": {"optype": "text", "term_analysis": {"token_mode": "all"}}}} | {"Message": "mobile call"} | {"PC40": 0.31818, "PC38": 0.06912, "PC39": -0.14342, "PC18": 0.22382, "PC19": 0.18518, "PC14": 0.89231, "PC15": 0.05046, "PC16": -0.00241, "PC17": 0.54501, "PC10": -0.26463, "PC11": 0.30251, "PC12": 1.16327, "PC13": 0.16973, "PC43": 0.11952, "PC42": 1.05499, "PC41": 0.51263, "PC25": 0.02467, "PC24": -0.65128, "PC27": 0.48916, "PC26": -0.45228, "PC21": -0.44167, "PC20": 0.76896, "PC23": 0.29398, "PC22": 0.06425, "PC47": 0.70416, "PC49": -0.30313, "PC48": 0.12976, "PC29": -0.34, "PC28": 0.17406, "PC32": -0.06411, "PC46": 0.69257, "PC31": 0.07523, "PC45": -0.03461, "PC36": 0.29732, "PC44": 0.14516, "PC37": -0.19109, "PC34": 0.58399, "PC35": 0.37608, "PC33": -0.00378, "PC8": -0.88156, "PC9": 0.38233, "PC2": -0.56685, "PC3": 0.56321, "PC1": 0.49171, "PC6": -0.09854, "PC7": -1.24639, "PC4": 1.50134, "PC5": -0.03161, "PC50": 0.17349, "PC30": -1.29612} | {} | + \ No newline at end of file diff --git a/src/test/resources/test_topicmodel.feature b/src/test/resources/test_topicmodel.feature new file mode 100755 index 
0000000..253162e --- /dev/null +++ b/src/test/resources/test_topicmodel.feature @@ -0,0 +1,55 @@ +Feature: Topic Model + + Scenario Outline: Successfully creating a topic model: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I update the source with "" waiting less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + When I create topic model from a dataset + Then I wait until the topic model is ready less than secs + + Examples: + | data | time_1 | time_2 | time_3 | options | + | data/movies.csv | 10 | 10 | 100 | {"fields": {"000007": {"optype": "items", "item_analysis": {"separator": "$"}}, "000006": {"optype": "text"}}} | + + + Scenario Outline: Successfully creating Topic Model from a dataset: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I update the source with "" waiting less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create topic model from a dataset + And I wait until the topic model is ready less than secs + And I update the topic model name to "" + When I wait until the topic model is ready less than secs + Then the topic model name is "" + + Examples: + | data | time_1 | time_2 | time_3 | time_4 | topic_model_name | params | + | data/spam.csv | 100 | 100 | 200 | 500 | my new topic model name | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": true, "stem_words": true, "use_stopwords": false, "language": "en"}}}} | + + + Scenario Outline: Successfully comparing topic distributions: + Given I create a data source uploading a "" file + And I wait until the source is ready less than secs + And I update the source with "" waiting less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + And I create topic model from a dataset + And I wait until the topic model is ready less 
than secs + + And I create a local topic model + When I create a local topic distribution for "" + Then the local topic distribution is "" + + When I create a topic distribution for "" + Then the topic distribution is "" + + Examples: + | data | time_1 | time_2 | time_3 | options | data_input | topic_distribution | + | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": true, "stem_words": true, "use_stopwords": false, "language": "en"}}}} | {"Type": "ham", "Message": "Mobile call"} | [0.51133, 0.00388, 0.00574, 0.00388, 0.00388, 0.00388, 0.00388, 0.00388, 0.00388, 0.00388, 0.00388, 0.44801] | + | data/spam.csv | 20 | 20 | 30 | {"fields": {"000001": {"optype": "text", "term_analysis": {"case_sensitive": true, "stem_words": true, "use_stopwords": false, "language": "en"}}}} | {"Type": "ham", "Message": "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."} | [0.39188, 0.00643, 0.00264, 0.00643, 0.08112, 0.00264, 0.37352, 0.0115, 0.00707, 0.00327, 0.00264, 0.11086] | + + \ No newline at end of file diff --git a/src/test/resources/test_30_execution.feature b/src/test/resources/test_whizzml.feature similarity index 59% rename from src/test/resources/test_30_execution.feature rename to src/test/resources/test_whizzml.feature index 754f85d..6cd9c79 100755 --- a/src/test/resources/test_30_execution.feature +++ b/src/test/resources/test_whizzml.feature @@ -1,4 +1,30 @@ -Feature: Testing Whizzml Execution REST api calls +Feature: Whizzml REST api calls + + Scenario Outline: Scenario: Successfully creating a whizzml library: + Given I create a whizzml library from a excerpt of code "" + And I wait until the library is ready less than secs + And I update the library name to "" + When I wait until the library is ready less than secs + Then the library name is "" + Then the library code is "" + + Examples: + | source_code | time_1 | time_2 | library_name | + | 
(define (mu x) (+ x 1)) | 10 | 10 | my library | + + + Scenario Outline: Scenario: Successfully creating a whizzml script: + Given I create a whizzml script from a excerpt of code "" + And I wait until the script is ready less than secs + And I update the script name to "" + When I wait until the script is ready less than secs + Then the script name is "" + Then the script code is "" + + Examples: + | source_code | time_1 | time_2 | script_name | + | (+ 1 1) | 10 | 10 | my script | + Scenario Outline: Scenario: Successfully creating a whizzml script execution: Given I create a whizzml script from a excerpt of code ""