GLM cut_column done

kyanga2 · kyanga2 · commit a4cbc340aba6 · 2015-11-07T03:50:03.000-08:00
diff --git a/tutorials/glm/glm_h2oworld_demo.ipynb b/tutorials/glm/glm_h2oworld_demo.ipynb
@@ -0,0 +1,344 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "import h2o\n",
+    "\n",
+    "from h2o.estimators.gbm import H2OGradientBoostingEstimator\n",
+    "from h2o.estimators.random_forest import H2ORandomForestEstimator\n",
+    "from h2o.estimators.glm import H2OGeneralizedLinearEstimator\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>H2O cluster uptime: </td>\n",
+       "<td>28 minutes 30 seconds 62 milliseconds </td></tr>\n",
+       "<tr><td>H2O cluster version: </td>\n",
+       "<td>3.7.0.3248</td></tr>\n",
+       "<tr><td>H2O cluster name: </td>\n",
+       "<td>H2O_started_from_python</td></tr>\n",
+       "<tr><td>H2O cluster total nodes: </td>\n",
+       "<td>1</td></tr>\n",
+       "<tr><td>H2O cluster total memory: </td>\n",
+       "<td>1.78 GB</td></tr>\n",
+       "<tr><td>H2O cluster total cores: </td>\n",
+       "<td>8</td></tr>\n",
+       "<tr><td>H2O cluster allowed cores: </td>\n",
+       "<td>8</td></tr>\n",
+       "<tr><td>H2O cluster healthy: </td>\n",
+       "<td>True</td></tr>\n",
+       "<tr><td>H2O Connection ip: </td>\n",
+       "<td>127.0.0.1</td></tr>\n",
+       "<tr><td>H2O Connection port: </td>\n",
+       "<td>54321</td></tr></table></div>"
+      ],
+      "text/plain": [
+       "--------------------------  -------------------------------------\n",
+       "H2O cluster uptime:         28 minutes 30 seconds 62 milliseconds\n",
+       "H2O cluster version:        3.7.0.3248\n",
+       "H2O cluster name:           H2O_started_from_python\n",
+       "H2O cluster total nodes:    1\n",
+       "H2O cluster total memory:   1.78 GB\n",
+       "H2O cluster total cores:    8\n",
+       "H2O cluster allowed cores:  8\n",
+       "H2O cluster healthy:        True\n",
+       "H2O Connection ip:          127.0.0.1\n",
+       "H2O Connection port:        54321\n",
+       "--------------------------  -------------------------------------"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "h2o.init()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Parse Progress: [##################################################] 100%\n"
+     ]
+    }
+   ],
+   "source": [
+    "h2o.remove_all()\n",
+    "covtype_df = h2o.import_file(\"../data/covtype.full.csv\")\n",
+    "#split the data as described above\n",
+    "train, valid, test = covtype_df.split_frame([0.6, 0.2], seed=1234)\n",
+    "\n",
+    "#Prepare predictors and response columns\n",
+    "covtype_X = covtype_df.col_names[:-1]     #last column is Cover_Type, our desired response variable \n",
+    "covtype_y = covtype_df.col_names[-1]    \n",
+    "\n",
+    "train_df = train.as_data_frame(True)\n",
+    "valid_df = valid.as_data_frame(True)\n",
+    "test_df = test.as_data_frame(True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "col = 'Elevation'\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def place(value, breaks, len_breaks, range_cache):\n",
+    "    for k in range_cache:\n",
+    "        if value <breaks[k+1]:\n",
+    "            return k\n",
+    "    return len_breaks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 96,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Wall time: 5.33 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "col_col= train.as_data_frame(True)[col]\n",
+    "c, breaks = np.histogram(col_col, bins=20)\n",
+    "min_val = min(col_col)-1\n",
+    "max_val = max(col_col)+1\n",
+    "new_b = [min_val]\n",
+    "for i in xrange(19):\n",
+    "    if c[i] > 1000 and c[i+1] > 1000:\n",
+    "        new_b.append(breaks[i+1])\n",
+    "new_b.append(max_val)\n",
+    "nbl = len(new_b)-1\n",
+    "xr_nbl = range(nbl)\n",
+    "names = [col + '_' + str(x) for x in xrange(nbl)]\n",
+    "names.append(\"other\")\n",
+    "\n",
+    "new_col=[]\n",
+    "\n",
+    "\n",
+    "for val in col_col:\n",
+    "    new_col.append(names[place(val, new_b, nbl, xr_nbl)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "'''\n",
+    "Convenience function to cut a numeric column into intervals, creating a new categorical.\n",
+    "Uses h2o.hist to generate a histogram, with the buckets forming the categories of our new categorical.\n",
+    "Uses h2o.cut to do the split\n",
+    "Picks buckets based on training data, then applies the same classification to the test and validation sets\n",
+    "\n",
+    "Assumes that train, test, valid will have the same histogram behavior.\n",
+    "'''\n",
+    "def cut_column(train_df, test_df, valid_df, col):\n",
+    "    only_col= train_df[col]                            #Isolate the column in question from the training frame\n",
+    "    counts, breaks = np.histogram(only_col, bins=20)   #Generate counts and breaks for our histogram\n",
+    "    min_val = min(col_col)-1                           #Establish min and max values\n",
+    "    max_val = max(col_col)+1\n",
+    "    \n",
+    "    new_b = [min_val]                                  #Redefine breaks such that each bucket has enough support\n",
+    "    for i in xrange(19):\n",
+    "        if c[i] > 1000 and c[i+1] > 1000:\n",
+    "            new_b.append(breaks[i+1])\n",
+    "    new_b.append(max_val)\n",
+    "    \n",
+    "    nbl = len(new_b)-1                                 #Cache bucket count and range(count) for performance reasons\n",
+    "    xr_nbl = range(nbl)\n",
+    "    \n",
+    "    \n",
+    "    names = [col + '_' + str(x) for x in xrange(nbl)]  #Generate names for buckets, these will be categorical names\n",
+    "    names.append(\"other\")                              #Add 'other' bucket for everything not within min/max\n",
+    "\n",
+    "    train_col=[]                                       #initialize new columns for categoricals\n",
+    "    test_col=[]\n",
+    "    valid_col=[]\n",
+    "    \n",
+    "    for val in only_col:\n",
+    "        train_col.append(names[place(val, new_b,       #populate categorical column for train\n",
+    "                                   nbl, xr_nbl)])\n",
+    "        \n",
+    "    for val in test_df[col]:\n",
+    "        test_col.append(names[place(val, new_b,        #populate categorical column for train\n",
+    "                                   nbl, xr_nbl)])\n",
+    "    for val in valid_df[col]:\n",
+    "        valid_col.append(names[place(val, new_b,       #populate categorical column for train\n",
+    "                                   nbl, xr_nbl)])\n",
+    "        \n",
+    "    train_df[col] = train_col\n",
+    "    test_df[col] = test_col\n",
+    "    valid_df[col] = valid_col"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 101,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Wall time: 8.71 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "cut_column(train_df, test_df, valid_df, 'Elevation')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2623"
+      ]
+     },
+     "execution_count": 98,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "1792+483+193+155"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}