bigmlcom
diff --git a/‎bigml/linear.py‎
Lines changed: 66 additions & 10 deletions b/‎bigml/linear.py‎
Lines changed: 66 additions & 10 deletions
diff --git a/‎bigml/modelfields.py‎
Lines changed: 4 additions & 0 deletions b/‎bigml/modelfields.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎bigml/tests/create_linear_steps.py‎
Lines changed: 104 additions & 0 deletions b/‎bigml/tests/create_linear_steps.py‎
Lines changed: 104 additions & 0 deletions
diff --git a/‎bigml/tests/create_pca_steps_bck.py‎
Lines changed: 93 additions & 0 deletions b/‎bigml/tests/create_pca_steps_bck.py‎
Lines changed: 93 additions & 0 deletions
@@ -45,6 +45,13 @@
 import copy
 import json
 
+try:
+    import numpy as np
+    from scipy.stats import t as student_t
+except ImportError:
+        raise ImportError("Failed to import the numpy and scipy modules needed"
+                          " for this class.")
+
 from functools import cmp_to_key
 
 from bigml.api import FINISHED
@@ -67,6 +74,7 @@
                         "items": "items"}
 
 CATEGORICAL = "categorical"
+CONFIDENCE = 0.95
 
 DUMMY = "dummy"
 CONTRAST = "contrast"
@@ -109,9 +117,12 @@ def __init__(self, linear_regression, api=None):
         self.coefficients = []
         self.data_field_types = {}
         self.field_codings = {}
-        self.numeric_fields = {}
         self.bias = None
-
+        self.xtx = []
+        self.inv_xtx = None
+        self.mean_squared_error = None
+        self.number_of_parameters = None
+        self.number_of_samples = None
 
         self.resource_id, linear_regression = get_resource_dict( \
             linear_regression, "linearregression", api=api)
@@ -143,6 +154,7 @@ def __init__(self, linear_regression, api=None):
                         field_id for field_id, _ in
                         sorted(self.fields.items(),
                                key=lambda x: x[1].get("column_number"))]
+                self.coeff_ids = self.input_fields[:]
                 self.coefficients = linear_regression_info.get( \
                     'coefficients', [])
                 self.bias = linear_regression_info.get('bias', True)
@@ -155,16 +167,26 @@ def __init__(self, linear_regression, api=None):
                     numerics=True)
                 self.field_codings = linear_regression_info.get( \
                   'field_codings', {})
-                print "**before", self.field_codings
                 self.format_field_codings()
-                print "**after", self.field_codings
                 for field_id in self.field_codings:
                     if field_id not in fields and \
                             field_id in self.inverted_fields:
                         self.field_codings.update( \
                             {self.inverted_fields[field_id]: \
                              self.field_codings[field_id]})
                         del self.field_codings[field_id]
+                stats = linear_regression_info["stats"]
+                if stats is not None and "xtx" in stats:
+                    self.xtx = stats["xtx"][:]
+                    self.mean_squared_error = stats["mean_squared_error"]
+                    self.number_of_parameters = stats["number_of_parameters"]
+                    self.number_of_samples = stats["number_of_samples"]
+                    # to be used in predictions
+                    self.t_crit = student_t.interval( \
+                        CONFIDENCE,
+                        self.number_of_samples - self.number_of_parameters)[1]
+                    self.inv_xtx = list(np.linalg.inv(np.array(self.xtx)))
+
             else:
                 raise Exception("The linear regression isn't finished yet")
         else:
@@ -173,7 +195,7 @@ def __init__(self, linear_regression, api=None):
                             " in the resource:\n\n%s" %
                             linear_regression)
 
-    def expand_input(self, input_data, unique_terms):
+    def expand_input(self, input_data, unique_terms, compact=False):
         """ Creates an input array with the values in input_data and
         unique_terms and the following rules:
         - fields are ordered as input_fields
@@ -187,7 +209,7 @@ def expand_input(self, input_data, unique_terms):
           as numerics.
         """
         input_array = []
-        for index, field_id in enumerate(self.input_fields):
+        for index, field_id in enumerate(self.coeff_ids):
             field = self.fields[field_id]
             optype = field["optype"]
             missing = False
@@ -216,7 +238,7 @@ def expand_input(self, input_data, unique_terms):
 
             if optype == CATEGORICAL:
                 new_inputs = self.categorical_encoding( \
-                    new_inputs, field_id)
+                    new_inputs, field_id, compact)
 
             input_array.extend(new_inputs)
 
@@ -225,7 +247,7 @@ def expand_input(self, input_data, unique_terms):
 
         return input_array
 
-    def categorical_encoding(self, inputs, field_id):
+    def categorical_encoding(self, inputs, field_id, compact):
         """Returns the prediction and the confidence intervals
 
         input_data: Input data to be predicted
@@ -235,10 +257,17 @@ def categorical_encoding(self, inputs, field_id):
 
         projections = self.field_codings[field_id].get( \
                 CONTRAST, self.field_codings[field_id].get(OTHER))
-        print "***", projections, new_inputs
         if projections is not None:
             new_inputs = flatten(dot(projections, [new_inputs]))
 
+        if compact and self.field_codings[field_id].get(DUMMY) is not None:
+            dummy_class = self.field_codings[field_id][DUMMY]
+            index = self.categories[field_id].index(dummy_class)
+            cat_new_inputs = new_inputs[0: index]
+            if len(new_inputs) > (index + 1):
+                cat_new_inputs.extend(new_inputs[index + 1 :])
+            new_inputs = cat_new_inputs
+
         return new_inputs
 
     def predict(self, input_data, full=False):
@@ -275,19 +304,46 @@ def predict(self, input_data, full=False):
 
         # Creates an input vector with the values for all expanded fields.
         input_array = self.expand_input(new_data, unique_terms)
+        compact_input_array = self.expand_input(new_data, unique_terms, True)
 
         prediction = dot([flatten(self.coefficients)], [input_array])[0][0]
 
         result = {
             "prediction": prediction}
+        if self.inv_xtx is not None:
+            result.update({"confidence_bounds": self.confidence_bounds( \
+                compact_input_array)})
 
         if full:
-            result.update({'unused_fields': unused_fields})
+            result.update({"unused_fields": unused_fields})
         else:
             result = result["prediction"]
 
         return result
 
+
+    def confidence_bounds(self, input_array):
+        """Computes the confidence and prediction bounds for a prediction
+
+        input_array: expanded input vector used to compute the bounds
+
+        Returns a dict with two keys:
+        - "confidence_interval": t_crit * sqrt(mse * product)
+        - "prediction_interval": t_crit * sqrt(mse * (product + 1))
+        where mse is the stored mean squared error and product is the
+        scalar obtained by combining the input vector with inv_xtx.
+        Both values are 0 when the mean squared error is 0 or when the
+        arithmetic cannot be carried out.
+        """
+        product = dot(dot([input_array], self.inv_xtx),
+                      [input_array])[0][0]
+        confidence_interval, prediction_interval = (0, 0)
+        try:
+            if self.mean_squared_error != 0:
+                confidence_interval = self.t_crit * math.sqrt( \
+                    self.mean_squared_error * product)
+                prediction_interval = self.t_crit * math.sqrt( \
+                    self.mean_squared_error * (product + 1))
+        except (ArithmeticError, TypeError, ValueError):
+            # best effort: numeric failures (e.g. math.sqrt raises
+            # ValueError for a negative argument) fall back to zero-width
+            # bounds. Any other kind of error is no longer silenced.
+            confidence_interval, prediction_interval = (0, 0)
+
+        return {"confidence_interval": confidence_interval,
+                "prediction_interval": prediction_interval}
+
+
     def format_field_codings(self):
         """ Changes the field codings format to the dict notation
 
 
@@ -198,6 +198,10 @@ def add_terms(self, categories=False, numerics=False):
             if categories and field['optype'] == 'categorical':
                 self.categories[field_id] = [category for \
                     [category, _] in field['summary']['categories']]
+            # datetime fields are removed from coeff_ids. The attribute name
+            # must be given to hasattr as a string, and the filtered list
+            # must be stored back in coeff_ids for the removal to apply.
+            if field['optype'] == 'datetime' and \
+                    hasattr(self, "coeff_ids"):
+                self.coeff_ids = [coeff_id for coeff_id in self.coeff_ids \
+                    if coeff_id != field_id]
             if numerics and hasattr(self, "missing_numerics") and \
                     self.missing_numerics and field['optype'] == 'numeric' \
                     and hasattr(self, "numeric_fields"):
 
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/env python
+#
+# Copyright 2019 BigML
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import time
+import json
+import os
+from datetime import datetime, timedelta
+from world import world
+from nose.tools import eq_, assert_less
+
+from bigml.api import HTTP_CREATED
+from bigml.api import HTTP_ACCEPTED
+from bigml.api import FINISHED
+from bigml.api import FAULTY
+from bigml.api import get_status
+
+from read_linear_steps import i_get_the_linear_regression
+
+
+#@step(r'the linear name is "(.*)"')
+def i_check_linear_name(step, name):
+    """Checks that the linear regression stored in `world` has the
+    given name
+
+    """
+    eq_(name, world.linear_regression['name'])
+
+#@step(r'I create a Linear Regression from a dataset$')
+def i_create_a_linear_regression_from_dataset(step):
+    """Creates a linear regression named 'new linear regression' from the
+    dataset stored in `world`, checks that the response code is
+    HTTP_CREATED and stores the new resource info in `world`
+
+    """
+    dataset_id = world.dataset.get('resource')
+    creation_args = {'name': 'new linear regression'}
+    response = world.api.create_linear_regression(dataset_id, creation_args)
+    world.status = response['code']
+    eq_(world.status, HTTP_CREATED)
+    world.location = response['location']
+    world.linear_regression = response['object']
+    world.linear_regressions.append(response['resource'])
+
+
+#@step(r'I create a Linear Regression with params "(.*)"$')
+def i_create_a_linear_regression_with_params(step, params):
+    """Creates a linear regression using the given JSON-encoded params
+    and no explicit objective field
+
+    """
+    i_create_a_linear_regression_with_objective_and_params( \
+        step, objective=None, params=params)
+
+
+#@step(r'I create a Linear Regression with objective and params$')
+def i_create_a_linear_regression_with_objective_and_params(step,
+                                                           objective,
+                                                           params):
+    """Creates a linear regression from the dataset stored in `world`
+
+    objective: value for the "objective_field" creation argument
+               (the argument is not added when None)
+    params: JSON string with the creation arguments
+
+    Checks that the response code is HTTP_CREATED and stores the new
+    resource info in `world`.
+    """
+    creation_args = json.loads(params)
+    if objective is not None:
+        creation_args.update(objective_field=objective)
+    dataset_id = world.dataset.get('resource')
+    response = world.api.create_linear_regression(dataset_id, creation_args)
+    world.status = response['code']
+    eq_(world.status, HTTP_CREATED)
+    world.location = response['location']
+    world.linear_regression = response['object']
+    world.linear_regressions.append(response['resource'])
+
+def i_create_a_linear_regression(step):
+    """Creates a linear regression by delegating to
+    i_create_a_linear_regression_from_dataset
+
+    """
+    i_create_a_linear_regression_from_dataset(step)
+
+
+#@step(r'I update the linear regression name to "(.*)"$')
+def i_update_linear_regression_name(step, name):
+    """Updates the name of the linear regression stored in `world`, checks
+    that the response code is HTTP_ACCEPTED and stores the updated info
+
+    """
+    changes = {'name': name}
+    response = world.api.update_linear_regression( \
+        world.linear_regression['resource'], changes)
+    world.status = response['code']
+    eq_(world.status, HTTP_ACCEPTED)
+    world.location = response['location']
+    world.linear_regression = response['object']
+
+
+#@step(r'I wait until the linear regression status code is either (\d) or (-\d) less than (\d+)')
+def wait_until_linear_regression_status_code_is(step, code1, code2, secs):
+    """Polls the linear regression stored in `world` until its status code
+    is either `code1` or `code2`
+
+    The resource is retrieved again after every 3 seconds sleep. An
+    assertion fails if the elapsed time reaches `secs` times `world.delta`
+    seconds, and the final status code is checked to be `code1`.
+    """
+    def still_waiting(code):
+        """Tells whether the code is none of the two awaited ones"""
+        return code != int(code1) and code != int(code2)
+
+    started = datetime.utcnow()
+    max_wait = int(secs) * world.delta
+    resource_id = world.linear_regression['resource']
+    while True:
+        i_get_the_linear_regression(step, resource_id)
+        status = get_status(world.linear_regression)
+        if not still_waiting(status['code']):
+            break
+        time.sleep(3)
+        assert_less(datetime.utcnow() - started,
+                    timedelta(seconds=max_wait))
+    eq_(status['code'], int(code1))
+
+
+#@step(r'I wait until the linear is ready less than (\d+)')
+def the_linear_regression_is_finished_in_less_than(step, secs):
+    """Waits until the linear regression status is FINISHED or FAULTY and
+    checks that it is FINISHED
+
+    """
+    wait_until_linear_regression_status_code_is( \
+        step, code1=FINISHED, code2=FAULTY, secs=secs)
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/env python
+#
+# Copyright 2018-2019 BigML
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import time
+import json
+import os
+from datetime import datetime, timedelta
+from world import world
+from nose.tools import eq_, assert_less
+
+from bigml.api import HTTP_CREATED
+from bigml.api import HTTP_ACCEPTED
+from bigml.api import FINISHED
+from bigml.api import FAULTY
+from bigml.api import get_status
+
+from read_pca_steps import i_get_the_pca
+
+
+#@step(r'the pca name is "(.*)"')
+def i_check_pca_name(step, name):
+    """Checks that the PCA stored in `world` has the given name"""
+    eq_(name, world.pca['name'])
+
+#@step(r'I create a PCA from a dataset$')
+def i_create_a_pca_from_dataset(step):
+    """Creates a PCA named 'new PCA' from the dataset stored in `world`,
+    checks that the response code is HTTP_CREATED and stores the new
+    resource info in `world`
+
+    """
+    dataset_id = world.dataset.get('resource')
+    creation_args = {'name': 'new PCA'}
+    response = world.api.create_pca(dataset_id, creation_args)
+    world.status = response['code']
+    eq_(world.status, HTTP_CREATED)
+    world.location = response['location']
+    world.pca = response['object']
+    world.pcas.append(response['resource'])
+
+
+#@step(r'I create a PCA with params "(.*)"$')
+def i_create_a_pca_with_params(step, params):
+    """Creates a PCA from the dataset stored in `world`
+
+    params: JSON string with the creation arguments
+
+    Checks that the response code is HTTP_CREATED and stores the new
+    resource info in `world`.
+    """
+    creation_args = json.loads(params)
+    dataset_id = world.dataset.get('resource')
+    response = world.api.create_pca(dataset_id, creation_args)
+    world.status = response['code']
+    eq_(world.status, HTTP_CREATED)
+    world.location = response['location']
+    world.pca = response['object']
+    world.pcas.append(response['resource'])
+
+def i_create_a_pca(step):
+    """Creates a PCA by delegating to i_create_a_pca_from_dataset"""
+    i_create_a_pca_from_dataset(step)
+
+
+#@step(r'I update the PCA name to "(.*)"$')
+def i_update_pca_name(step, name):
+    """Updates the name of the PCA stored in `world`, checks that the
+    response code is HTTP_ACCEPTED and stores the updated info
+
+    """
+    changes = {'name': name}
+    response = world.api.update_pca(world.pca['resource'], changes)
+    world.status = response['code']
+    eq_(world.status, HTTP_ACCEPTED)
+    world.location = response['location']
+    world.pca = response['object']
+
+
+#@step(r'I wait until the PCA status code is either (\d) or (-\d) less than (\d+)')
+def wait_until_pca_status_code_is(step, code1, code2, secs):
+    """Polls the PCA stored in `world` until its status code is either
+    `code1` or `code2`
+
+    The resource is retrieved again after every 3 seconds sleep. An
+    assertion fails if the elapsed time reaches `secs` times `world.delta`
+    seconds, and the final status code is checked to be `code1`.
+    """
+    def still_waiting(code):
+        """Tells whether the code is none of the two awaited ones"""
+        return code != int(code1) and code != int(code2)
+
+    started = datetime.utcnow()
+    max_wait = int(secs) * world.delta
+    resource_id = world.pca['resource']
+    while True:
+        i_get_the_pca(step, resource_id)
+        status = get_status(world.pca)
+        if not still_waiting(status['code']):
+            break
+        time.sleep(3)
+        assert_less(datetime.utcnow() - started,
+                    timedelta(seconds=max_wait))
+    eq_(status['code'], int(code1))
+
+
+#@step(r'I wait until the PCA is ready less than (\d+)')
+def the_pca_is_finished_in_less_than(step, secs):
+    """Waits until the PCA status is FINISHED or FAULTY and checks that
+    it is FINISHED
+
+    """
+    wait_until_pca_status_code_is( \
+        step, code1=FINISHED, code2=FAULTY, secs=secs)