forked from briandalessandro/DataScienceCourse
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheval_plots.py
More file actions
94 lines (70 loc) · 2.67 KB
/
eval_plots.py
File metadata and controls
94 lines (70 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def getMAE(pred, truth):
    """Return the mean absolute error between predictions and outcomes.

    Parameters
    ----------
    pred : array-like of numbers
        Predicted values.
    truth : array-like of numbers
        Observed values, same length as ``pred``.

    Returns
    -------
    float
        Mean of ``|truth - pred|`` taken element by element (by position).
    """
    # Coerce to float arrays so that plain lists/tuples and boolean labels
    # work: ``list - list`` raises TypeError and numpy refuses to subtract
    # two boolean arrays.
    pred = np.asarray(pred, dtype=float)
    truth = np.asarray(truth, dtype=float)
    return np.abs(truth - pred).mean()
def getLL(pred, truth):
    """Return the mean Bernoulli log-likelihood of ``truth`` given ``pred``.

    Parameters
    ----------
    pred : array-like of float
        Predicted probabilities of the positive class.
    truth : array-like of {0, 1}
        Observed binary outcomes, same length as ``pred``.

    Returns
    -------
    float
        Average over all observations of
        ``y * log(p) + (1 - y) * log(1 - p)``.
    """
    # Work on positional float arrays. Indexing a pandas Series with
    # ``series[i]`` is label-based, so a shuffled or non-default index would
    # raise KeyError or silently pair the wrong prediction with the wrong
    # outcome.
    p = np.asarray(pred, dtype=float)
    y = np.asarray(truth, dtype=float)

    # Predictions of exactly 0 or 1 are nudged off the boundary so that
    # log(0) is never evaluated; every other value is used unchanged.
    p = np.where(p == 0, 0.0001, np.where(p == 1, 0.9999, p))

    return (y * np.log(p) + (1 - y) * np.log(1 - p)).mean()
def plotCalib(truth, pred, bins=100, f=0, l='', w=8, h=8, fig_i=1, fig_j=1, fig_k=1):
    """Draw a calibration plot of binned predictions against observed rates.

    Predictions are grouped into bins of width ``1 / bins``; for each bin the
    mean prediction is plotted against the mean outcome with a 1.96-standard-
    error bar, next to the diagonal of perfect calibration. The title reports
    the MAE and mean log-likelihood of the raw predictions.

    Parameters
    ----------
    truth : array-like
        Observed binary outcomes.
    pred : array-like
        Predicted probabilities.
    bins : int
        Number of equal-width bins over [0, 1].
    f : int
        Only bins holding more than ``f`` observations are plotted.
    l : str
        Label placed at the start of the plot title.
    w, h : float
        Figure width and height; used only when a new figure is created.
    fig_i, fig_j, fig_k : int
        Subplot grid rows, columns and position passed to ``plt.subplot``.
        A new figure is created only when ``fig_k == 1``.

    Returns
    -------
    None
        The plot is drawn on the current matplotlib figure.
    """
    mae = np.round(getMAE(pred, truth), 3)
    ll = np.round(getLL(pred, truth), 3)

    d = pd.DataFrame({'p': pred, 'y': truth})
    # Map every prediction to the lower edge of its bin.
    d['p_bin'] = np.floor(d['p'] * bins) / bins

    # Use the named group-by reductions rather than passing np.mean / len as
    # callables to agg(), which newer pandas releases deprecate.
    grouped = d.groupby('p_bin')
    counts = grouped.size()
    keep = counts > f

    # Only the first subplot opens a figure; later calls add to it.
    if fig_k == 1:
        plt.figure(facecolor='w', figsize=(w, h))

    x = grouped['p'].mean()[keep].values
    y = grouped['y'].mean()[keep].values
    n = counts[keep].values
    # Standard error of a binomial proportion within each bin.
    stderr = np.sqrt(y * (1 - y) / n)

    plt.subplot(fig_i, fig_j, fig_k)
    plt.errorbar(x, y, yerr=1.96 * stderr, fmt='o')
    plt.plot([0.0, 1.0], [0.0, 1.0], 'k-')
    plt.title(l + ':' + ' MAE = {}, LL = {}'.format(mae, ll))
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('prediction P(Y|X)')
    plt.ylabel('actual P(Y|X)')
def liftTable(pred, truth, b, seed=None):
    """Build a lift/recall table over ``b`` quantile bins of the predictions.

    Parameters
    ----------
    pred : array-like of float
        Predicted scores; higher means more likely positive.
    truth : array-like of {0, 1}
        Observed binary outcomes.
    b : int
        Number of quantile bins.
    seed : int or None, optional
        Seed for the tie-breaking jitter. With the default ``None`` the
        global ``np.random`` state is used, so results for tied scores can
        differ between calls; pass an integer for reproducible bins.

    Returns
    -------
    pandas.DataFrame
        One row per bin, indexed by ``b`` where bin 1 holds the highest
        scores. Columns: ``p`` and ``y`` (per-bin sums of the jittered
        scores and of the outcomes), ``n`` (bin size), ``n_cum`` (cumulative
        fraction of observations), ``y_cum`` (cumulative positives),
        ``p_y_b`` (positive rate in the bin), ``lift_b`` (bin rate over the
        overall rate), ``cum_lift_b`` (cumulative rate over the overall
        rate) and ``recall`` (cumulative share of all positives).
    """
    n_pred = len(pred)
    if seed is None:
        noise = np.random.rand(n_pred)
    else:
        # A private generator keeps the global random state untouched.
        noise = np.random.RandomState(seed).rand(n_pred)

    # A tiny jitter breaks ties so that qcut gets unique quantile edges.
    df = pd.DataFrame({'p': pred + noise * 0.000001, 'y': truth})
    # qcut labels run 0..b-1 from lowest to highest score; flip them so that
    # bin 1 is the top-scoring bin.
    df['b'] = b - pd.qcut(df['p'], b, labels=False)
    df['n'] = np.ones(df.shape[0])

    n_rows = float(df.shape[0])
    df_grp = df.groupby(['b']).sum()
    tot_y = float(np.sum(df_grp['y']))
    base = tot_y / n_rows  # overall positive rate

    df_grp['n_cum'] = np.cumsum(df_grp['n']) / n_rows
    df_grp['y_cum'] = np.cumsum(df_grp['y'])
    df_grp['p_y_b'] = df_grp['y'] / df_grp['n']
    df_grp['lift_b'] = df_grp['p_y_b'] / base
    # n_rows * n_cum is the cumulative number of observations.
    df_grp['cum_lift_b'] = (df_grp['y_cum'] / (n_rows * df_grp['n_cum'])) / base
    df_grp['recall'] = df_grp['y_cum'] / tot_y
    return df_grp
def liftRecallCurve(pred, truth, b, h = 6, w = 12, title = ''):
    """Plot cumulative lift and recall against the cumulative quantile.

    The lift table for ``pred``/``truth`` over ``b`` bins is computed with
    ``liftTable``. Cumulative lift is drawn as a blue line on the left
    y-axis and recall as red dots on a twin right y-axis, both against the
    cumulative fraction of observations. ``w`` and ``h`` set the figure
    size and ``title`` the plot title. The figure is shown immediately.
    """
    table = liftTable(pred, truth, b)
    quantile = table['n_cum']

    # Left axis: cumulative lift, with the axis text coloured like the line.
    _, lift_axis = plt.subplots(figsize = (w, h))
    lift_axis.plot(quantile, table['cum_lift_b'], 'b-')
    lift_axis.set_xlabel('Quantile')
    lift_axis.set_ylabel('Lift', color='b')
    for tick_label in lift_axis.get_yticklabels():
        tick_label.set_color('b')

    # Right axis shares the x-axis and carries recall.
    recall_axis = lift_axis.twinx()
    recall_axis.plot(quantile, table['recall'], 'r.')
    recall_axis.set_ylabel('Recall', color='r')
    for tick_label in recall_axis.get_yticklabels():
        tick_label.set_color('r')

    plt.title(title)
    plt.show()