4545import copy
4646import json
4747
48+ try :
49+ import numpy as np
50+ from scipy .stats import t as student_t
51+ except ImportError :
52+ raise ImportError ("Failed to import the numpy and scipy modules needed"
53+ " for this class." )
54+
4855from functools import cmp_to_key
4956
5057from bigml .api import FINISHED
6774 "items" : "items" }
6875
6976CATEGORICAL = "categorical"
77+ CONFIDENCE = 0.95
7078
7179DUMMY = "dummy"
7280CONTRAST = "contrast"
@@ -109,9 +117,12 @@ def __init__(self, linear_regression, api=None):
109117 self .coefficients = []
110118 self .data_field_types = {}
111119 self .field_codings = {}
112- self .numeric_fields = {}
113120 self .bias = None
114-
121+ self .xtx = []
122+ self .inv_xtx = None
123+ self .mean_squared_error = None
124+ self .number_of_parameters = None
125+ self .number_of_samples = None
115126
116127 self .resource_id , linear_regression = get_resource_dict ( \
117128 linear_regression , "linearregression" , api = api )
@@ -143,6 +154,7 @@ def __init__(self, linear_regression, api=None):
143154 field_id for field_id , _ in
144155 sorted (self .fields .items (),
145156 key = lambda x : x [1 ].get ("column_number" ))]
157+ self .coeff_ids = self .input_fields [:]
146158 self .coefficients = linear_regression_info .get ( \
147159 'coefficients' , [])
148160 self .bias = linear_regression_info .get ('bias' , True )
@@ -155,16 +167,26 @@ def __init__(self, linear_regression, api=None):
155167 numerics = True )
156168 self .field_codings = linear_regression_info .get ( \
157169 'field_codings' , {})
158- print "**before" , self .field_codings
159170 self .format_field_codings ()
160- print "**after" , self .field_codings
161171 for field_id in self .field_codings :
162172 if field_id not in fields and \
163173 field_id in self .inverted_fields :
164174 self .field_codings .update ( \
165175 {self .inverted_fields [field_id ]: \
166176 self .field_codings [field_id ]})
167177 del self .field_codings [field_id ]
178+ stats = linear_regression_info ["stats" ]
179+ if stats is not None and "xtx" in stats :
180+ self .xtx = stats ["xtx" ][:]
181+ self .mean_squared_error = stats ["mean_squared_error" ]
182+ self .number_of_parameters = stats ["number_of_parameters" ]
183+ self .number_of_samples = stats ["number_of_samples" ]
184+ # to be used in predictions
185+ self .t_crit = student_t .interval ( \
186+ CONFIDENCE ,
187+ self .number_of_samples - self .number_of_parameters )[1 ]
188+ self .inv_xtx = list (np .linalg .inv (np .array (self .xtx )))
189+
168190 else :
169191 raise Exception ("The linear regression isn't finished yet" )
170192 else :
@@ -173,7 +195,7 @@ def __init__(self, linear_regression, api=None):
173195 " in the resource:\n \n %s" %
174196 linear_regression )
175197
176- def expand_input (self , input_data , unique_terms ):
198+ def expand_input (self , input_data , unique_terms , compact = False ):
177199 """ Creates an input array with the values in input_data and
178200 unique_terms and the following rules:
179201 - fields are ordered as input_fields
@@ -187,7 +209,7 @@ def expand_input(self, input_data, unique_terms):
187209 as numerics.
188210 """
189211 input_array = []
190- for index , field_id in enumerate (self .input_fields ):
212+ for index , field_id in enumerate (self .coeff_ids ):
191213 field = self .fields [field_id ]
192214 optype = field ["optype" ]
193215 missing = False
@@ -216,7 +238,7 @@ def expand_input(self, input_data, unique_terms):
216238
217239 if optype == CATEGORICAL :
218240 new_inputs = self .categorical_encoding ( \
219- new_inputs , field_id )
241+ new_inputs , field_id , compact )
220242
221243 input_array .extend (new_inputs )
222244
@@ -225,7 +247,7 @@ def expand_input(self, input_data, unique_terms):
225247
226248 return input_array
227249
228- def categorical_encoding (self , inputs , field_id ):
250+ def categorical_encoding (self , inputs , field_id , compact ):
229251 """Returns the prediction and the confidence intervals
230252
231253 input_data: Input data to be predicted
@@ -235,10 +257,17 @@ def categorical_encoding(self, inputs, field_id):
235257
236258 projections = self .field_codings [field_id ].get ( \
237259 CONTRAST , self .field_codings [field_id ].get (OTHER ))
238- print "***" , projections , new_inputs
239260 if projections is not None :
240261 new_inputs = flatten (dot (projections , [new_inputs ]))
241262
263+ if compact and self .field_codings [field_id ].get (DUMMY ) is not None :
264+ dummy_class = self .field_codings [field_id ][DUMMY ]
265+ index = self .categories [field_id ].index (dummy_class )
266+ cat_new_inputs = new_inputs [0 : index ]
267+ if len (new_inputs ) > (index + 1 ):
268+ cat_new_inputs .extend (new_inputs [index + 1 :])
269+ new_inputs = cat_new_inputs
270+
242271 return new_inputs
243272
244273 def predict (self , input_data , full = False ):
@@ -275,19 +304,46 @@ def predict(self, input_data, full=False):
275304
276305 # Creates an input vector with the values for all expanded fields.
277306 input_array = self .expand_input (new_data , unique_terms )
307+ compact_input_array = self .expand_input (new_data , unique_terms , True )
278308
279309 prediction = dot ([flatten (self .coefficients )], [input_array ])[0 ][0 ]
280310
281311 result = {
282312 "prediction" : prediction }
313+ if self .inv_xtx is not None :
314+ result .update ({"confidence_bounds" : self .confidence_bounds ( \
315+ compact_input_array )})
283316
284317 if full :
285- result .update ({' unused_fields' : unused_fields })
318+ result .update ({" unused_fields" : unused_fields })
286319 else :
287320 result = result ["prediction" ]
288321
289322 return result
290323
324+
def confidence_bounds(self, input_array):
    """Computes the confidence and prediction intervals for a prediction

    input_array: list of expanded input values. It is combined with
                 `self.inv_xtx` on both sides through the `dot` helper
                 to obtain a single scalar `product`.
                 (presumably the compact expansion built by
                 `expand_input(..., compact=True)` -- confirm with caller)

    Returns a dict with two keys:
        "confidence_interval": t_crit * sqrt(mse * product)
        "prediction_interval": t_crit * sqrt(mse * (product + 1))
    where `mse` is `self.mean_squared_error` and `t_crit` is
    `self.t_crit`. Both values are 0 when the mean squared error is 0
    or when the bounds cannot be computed numerically.
    """
    # A zero mean squared error always yields zero-width bounds, so the
    # matrix products are skipped altogether in that case.
    if self.mean_squared_error == 0:
        return {"confidence_interval": 0,
                "prediction_interval": 0}

    product = dot(dot([input_array], self.inv_xtx),
                  [input_array])[0][0]
    try:
        confidence_interval = self.t_crit * math.sqrt(
            self.mean_squared_error * product)
        prediction_interval = self.t_crit * math.sqrt(
            self.mean_squared_error * (product + 1))
    except (TypeError, ValueError, ArithmeticError):
        # Only numeric failures fall back to zero bounds: math.sqrt
        # raises ValueError for a negative argument and TypeError for a
        # non-numeric one (e.g. statistics left as None). Any other
        # error is a programming problem and is allowed to propagate
        # instead of being reported as a zero-width interval.
        confidence_interval, prediction_interval = (0, 0)

    return {"confidence_interval": confidence_interval,
            "prediction_interval": prediction_interval}
345+
346+
291347 def format_field_codings (self ):
292348 """ Changes the field codings format to the dict notation
293349
0 commit comments