Skip to content

Commit a4cbc34

Browse files
committed
GLM cut_column done
1 parent 095c185 commit a4cbc34

1 file changed

Lines changed: 344 additions & 0 deletions

File tree

Lines changed: 344 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,344 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 31,
6+
"metadata": {
7+
"collapsed": false
8+
},
9+
"outputs": [],
10+
"source": [
11+
"%matplotlib inline\n",
12+
"import h2o\n",
13+
"\n",
14+
"from h2o.estimators.gbm import H2OGradientBoostingEstimator\n",
15+
"from h2o.estimators.random_forest import H2ORandomForestEstimator\n",
16+
"from h2o.estimators.glm import H2OGeneralizedLinearEstimator\n",
17+
"import pandas as pd\n",
18+
"import numpy as np\n",
19+
"import matplotlib.pyplot as plt"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": 3,
25+
"metadata": {
26+
"collapsed": false
27+
},
28+
"outputs": [
29+
{
30+
"data": {
31+
"text/html": [
32+
"<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>H2O cluster uptime: </td>\n",
33+
"<td>28 minutes 30 seconds 62 milliseconds </td></tr>\n",
34+
"<tr><td>H2O cluster version: </td>\n",
35+
"<td>3.7.0.3248</td></tr>\n",
36+
"<tr><td>H2O cluster name: </td>\n",
37+
"<td>H2O_started_from_python</td></tr>\n",
38+
"<tr><td>H2O cluster total nodes: </td>\n",
39+
"<td>1</td></tr>\n",
40+
"<tr><td>H2O cluster total memory: </td>\n",
41+
"<td>1.78 GB</td></tr>\n",
42+
"<tr><td>H2O cluster total cores: </td>\n",
43+
"<td>8</td></tr>\n",
44+
"<tr><td>H2O cluster allowed cores: </td>\n",
45+
"<td>8</td></tr>\n",
46+
"<tr><td>H2O cluster healthy: </td>\n",
47+
"<td>True</td></tr>\n",
48+
"<tr><td>H2O Connection ip: </td>\n",
49+
"<td>127.0.0.1</td></tr>\n",
50+
"<tr><td>H2O Connection port: </td>\n",
51+
"<td>54321</td></tr></table></div>"
52+
],
53+
"text/plain": [
54+
"-------------------------- -------------------------------------\n",
55+
"H2O cluster uptime: 28 minutes 30 seconds 62 milliseconds\n",
56+
"H2O cluster version: 3.7.0.3248\n",
57+
"H2O cluster name: H2O_started_from_python\n",
58+
"H2O cluster total nodes: 1\n",
59+
"H2O cluster total memory: 1.78 GB\n",
60+
"H2O cluster total cores: 8\n",
61+
"H2O cluster allowed cores: 8\n",
62+
"H2O cluster healthy: True\n",
63+
"H2O Connection ip: 127.0.0.1\n",
64+
"H2O Connection port: 54321\n",
65+
"-------------------------- -------------------------------------"
66+
]
67+
},
68+
"metadata": {},
69+
"output_type": "display_data"
70+
}
71+
],
72+
"source": [
73+
"h2o.init()  #connect to an H2O cluster; the saved output of this cell shows a single local node at 127.0.0.1:54321"
74+
]
75+
},
76+
{
77+
"cell_type": "code",
78+
"execution_count": 99,
79+
"metadata": {
80+
"collapsed": false
81+
},
82+
"outputs": [
83+
{
84+
"name": "stdout",
85+
"output_type": "stream",
86+
"text": [
87+
"\n",
88+
"Parse Progress: [##################################################] 100%\n"
89+
]
90+
}
91+
],
92+
"source": [
93+
"h2o.remove_all()                                   #drop every object currently held by the H2O cluster before re-importing\n",
94+
"covtype_df = h2o.import_file(\"../data/covtype.full.csv\")\n",
95+
"#split into three frames: ratios 0.6 and 0.2, the remaining rows form the third; seed fixed so the split is repeatable\n",
96+
"train, valid, test = covtype_df.split_frame([0.6, 0.2], seed=1234)\n",
97+
"\n",
98+
"#Prepare predictor and response column names\n",
99+
"covtype_X = covtype_df.col_names[:-1]     #every column except the last is a predictor\n",
100+
"covtype_y = covtype_df.col_names[-1]    #last column is Cover_Type, our desired response variable\n",
101+
"#Local copies of each split for the pure-Python bucketing cells (True is presumably use_pandas -- confirm for this h2o version)\n",
102+
"train_df = train.as_data_frame(True)\n",
103+
"valid_df = valid.as_data_frame(True)\n",
104+
"test_df = test.as_data_frame(True)"
105+
]
106+
},
107+
{
108+
"cell_type": "code",
109+
"execution_count": 95,
110+
"metadata": {
111+
"collapsed": true
112+
},
113+
"outputs": [],
114+
"source": [
115+
"col = 'Elevation'  #name of the column the histogram/bucketing scratch cell operates on\n"
116+
]
117+
},
118+
{
119+
"cell_type": "code",
120+
"execution_count": 88,
121+
"metadata": {
122+
"collapsed": true
123+
},
124+
"outputs": [],
125+
"source": [
126+
"def place(value, breaks, len_breaks, range_cache):  #-> index of the bucket holding value, or len_breaks if none does\n",
127+
"    for k in range_cache:                           #callers pass a prebuilt range(len_breaks) so it is not rebuilt per value\n",
128+
"        if breaks[k] <= value < breaks[k+1]:        #bucket k is the half-open interval [breaks[k], breaks[k+1])\n",
129+
"            return k\n",
130+
"    return len_breaks                               #below breaks[0], at/above the last edge, or NaN -> overflow slot"
131+
]
132+
},
133+
{
134+
"cell_type": "code",
135+
"execution_count": 96,
136+
"metadata": {
137+
"collapsed": false
138+
},
139+
"outputs": [
140+
{
141+
"name": "stdout",
142+
"output_type": "stream",
143+
"text": [
144+
"Wall time: 5.33 s\n"
145+
]
146+
}
147+
],
148+
"source": [
149+
"%%time\n",
150+
"col_col= train.as_data_frame(True)[col]                 #training-split values of `col`, pulled from H2O into local memory\n",
151+
"c, breaks = np.histogram(col_col, bins=20)              #20 equal-width bins: c = per-bin counts, breaks = the 21 bin edges\n",
152+
"min_val = min(col_col)-1                                #outer edges are padded by 1 on each side of the observed range\n",
153+
"max_val = max(col_col)+1\n",
154+
"new_b = [min_val]                                       #new_b collects the edges that survive, starting at the padded minimum\n",
155+
"for i in xrange(19):                                    #19 interior edges sit between the 20 bins\n",
156+
"    if c[i] > 1000 and c[i+1] > 1000:                   #keep an edge only when the bins on both sides hold more than 1000 rows\n",
157+
"        new_b.append(breaks[i+1])\n",
158+
"new_b.append(max_val)\n",
159+
"nbl = len(new_b)-1                                      #number of buckets between consecutive kept edges\n",
160+
"xr_nbl = range(nbl)                                     #built once and handed to place() on every call\n",
161+
"names = [col + '_' + str(x) for x in xrange(nbl)]       #bucket labels: <col>_0 ... <col>_(nbl-1)\n",
162+
"names.append(\"other\")                                   #label selected when place() returns nbl\n",
163+
"\n",
164+
"new_col=[]\n",
165+
"#Label every training value with the name of its bucket (one place() call per row)\n",
166+
"\n",
167+
"for val in col_col:\n",
168+
"    new_col.append(names[place(val, new_b, nbl, xr_nbl)])"
169+
]
170+
},
171+
{
172+
"cell_type": "code",
173+
"execution_count": 100,
174+
"metadata": {
175+
"collapsed": false
176+
},
177+
"outputs": [],
178+
"source": [
179+
"'''\n",
180+
"Convenience function to cut a numeric column into intervals, creating a new categorical.\n",
181+
"Uses h2o.hist to generate a histogram, with the buckets forming the categories of our new categorical.\n",
182+
"Uses h2o.cut to do the split\n",
183+
"Picks buckets based on training data, then applies the same classification to the test and validation sets\n",
184+
"\n",
185+
"Assumes that train, test, valid will have the same histogram behavior.\n",
186+
"'''\n",
187+
"def cut_column(train_df, test_df, valid_df, col):\n",
188+
" only_col= train_df[col] #Isolate the column in question from the training frame\n",
189+
" counts, breaks = np.histogram(only_col, bins=20) #Generate counts and breaks for our histogram\n",
190+
" min_val = min(col_col)-1 #Establish min and max values\n",
191+
" max_val = max(col_col)+1\n",
192+
" \n",
193+
" new_b = [min_val] #Redefine breaks such that each bucket has enough support\n",
194+
" for i in xrange(19):\n",
195+
" if c[i] > 1000 and c[i+1] > 1000:\n",
196+
" new_b.append(breaks[i+1])\n",
197+
" new_b.append(max_val)\n",
198+
" \n",
199+
" nbl = len(new_b)-1 #Cache bucket count and range(count) for performance reasons\n",
200+
" xr_nbl = range(nbl)\n",
201+
" \n",
202+
" \n",
203+
" names = [col + '_' + str(x) for x in xrange(nbl)] #Generate names for buckets, these will be categorical names\n",
204+
" names.append(\"other\") #Add 'other' bucket for everything not within min/max\n",
205+
"\n",
206+
" train_col=[] #initialize new columns for categoricals\n",
207+
" test_col=[]\n",
208+
" valid_col=[]\n",
209+
" \n",
210+
" for val in only_col:\n",
211+
" train_col.append(names[place(val, new_b, #populate categorical column for train\n",
212+
" nbl, xr_nbl)])\n",
213+
" \n",
214+
" for val in test_df[col]:\n",
215+
" test_col.append(names[place(val, new_b, #populate categorical column for train\n",
216+
" nbl, xr_nbl)])\n",
217+
" for val in valid_df[col]:\n",
218+
" valid_col.append(names[place(val, new_b, #populate categorical column for train\n",
219+
" nbl, xr_nbl)])\n",
220+
" \n",
221+
" train_df[col] = train_col\n",
222+
" test_df[col] = test_col\n",
223+
" valid_df[col] = valid_col"
224+
]
225+
},
226+
{
227+
"cell_type": "code",
228+
"execution_count": 101,
229+
"metadata": {
230+
"collapsed": false
231+
},
232+
"outputs": [
233+
{
234+
"name": "stdout",
235+
"output_type": "stream",
236+
"text": [
237+
"Wall time: 8.71 s\n"
238+
]
239+
}
240+
],
241+
"source": [
242+
"%%time\n",
243+
"cut_column(train_df, test_df, valid_df, 'Elevation')  #overwrites the Elevation column of all three frames in place"
244+
]
245+
},
246+
{
247+
"cell_type": "code",
248+
"execution_count": 98,
249+
"metadata": {
250+
"collapsed": false
251+
},
252+
"outputs": [
253+
{
254+
"data": {
255+
"text/plain": [
256+
"2623"
257+
]
258+
},
259+
"execution_count": 98,
260+
"metadata": {},
261+
"output_type": "execute_result"
262+
}
263+
],
264+
"source": [
265+
"1792+483+193+155  #scratch sum; NOTE(review): the source of these four numbers is not shown in this notebook"
266+
]
267+
},
268+
{
269+
"cell_type": "code",
270+
"execution_count": null,
271+
"metadata": {
272+
"collapsed": true
273+
},
274+
"outputs": [],
275+
"source": []
276+
},
277+
{
278+
"cell_type": "code",
279+
"execution_count": null,
280+
"metadata": {
281+
"collapsed": true
282+
},
283+
"outputs": [],
284+
"source": []
285+
},
286+
{
287+
"cell_type": "code",
288+
"execution_count": null,
289+
"metadata": {
290+
"collapsed": true
291+
},
292+
"outputs": [],
293+
"source": []
294+
},
295+
{
296+
"cell_type": "code",
297+
"execution_count": null,
298+
"metadata": {
299+
"collapsed": true
300+
},
301+
"outputs": [],
302+
"source": []
303+
},
304+
{
305+
"cell_type": "code",
306+
"execution_count": null,
307+
"metadata": {
308+
"collapsed": true
309+
},
310+
"outputs": [],
311+
"source": []
312+
},
313+
{
314+
"cell_type": "code",
315+
"execution_count": null,
316+
"metadata": {
317+
"collapsed": true
318+
},
319+
"outputs": [],
320+
"source": []
321+
}
322+
],
323+
"metadata": {
324+
"kernelspec": {
325+
"display_name": "Python 2",
326+
"language": "python",
327+
"name": "python2"
328+
},
329+
"language_info": {
330+
"codemirror_mode": {
331+
"name": "ipython",
332+
"version": 2
333+
},
334+
"file_extension": ".py",
335+
"mimetype": "text/x-python",
336+
"name": "python",
337+
"nbconvert_exporter": "python",
338+
"pygments_lexer": "ipython2",
339+
"version": "2.7.10"
340+
}
341+
},
342+
"nbformat": 4,
343+
"nbformat_minor": 0
344+
}

0 commit comments

Comments
 (0)