Modifications to use the latest versions of Python 3 and PyTorch

6331cc39 · Hyo-kyun Park · hyokyun-park · ebc1d9ca · 6331cc39 · 6331cc39
5 changed files
--- a/demo1_prepare_data.ipynb
+++ b/demo1_prepare_data.ipynb
@@ -13,27 +13,25 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": 2,
+   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
-       "<style>\n",
-       "    .dataframe thead tr:only-child th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: left;\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
@@ -187,7 +185,7 @@
       "4             0  "
      ]
     },
-     "execution_count": 1,
+     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -220,10 +218,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 3,
+   "metadata": {},
   "outputs": [],
   "source": [
    "wide_cols = ['age','hours_per_week','education', 'relationship','workclass',\n",
@@ -247,10 +243,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 4,
+   "metadata": {},
   "outputs": [],
   "source": [
    "# If embeddings_cols does not include the embeddings dimensions it will be set as\n",
@@ -276,10 +270,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": 5,
+   "metadata": {},
   "outputs": [],
   "source": [
    "Y = np.array(DF[target])\n",
@@ -306,10 +298,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": 6,
+   "metadata": {},
   "outputs": [
    {
     "data": {
@@ -322,7 +312,7 @@
       "Name: education_occupation, dtype: object"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -349,10 +339,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 7,
+   "metadata": {},
   "outputs": [],
   "source": [
    "def label_encode(df, cols=None):\n",
@@ -379,10 +367,10 @@
    "        val_types[c] = df[c].unique()\n",
    "\n",
    "    val_to_idx = dict()\n",
-    "    for k, v in val_types.iteritems():\n",
+    "    for k, v in val_types.items():\n",
    "        val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}\n",
    "\n",
-    "    for k, v in val_to_idx.iteritems():\n",
+    "    for k, v in val_to_idx.items():\n",
    "        df[k] = df[k].apply(lambda x: v[x])\n",
    "\n",
    "    return val_to_idx, df\n",
@@ -392,7 +380,7 @@
    "encoding_dict,df_tmp = label_encode(df_tmp)\n",
    "encoding_dict = {k:encoding_dict[k] for k in encoding_dict if k in deep_cols}\n",
    "embeddings_input = []\n",
-    "for k,v in encoding_dict.iteritems():\n",
+    "for k,v in encoding_dict.items():\n",
    "    embeddings_input.append((k, len(v), emb_dim[k]))"
   ]
  },
@@ -405,29 +393,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:444: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.\n",
-      "  warnings.warn(msg, DataConversionWarning)\n",
-      "/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: \n",
-      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
-      "Try using .loc[row_indexer,col_indexer] = value instead\n",
-      "\n",
-      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
-     ]
-    }
-   ],
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "# select the deep_cols and get the column index that will be use later\n",
    "# to slice the tensors\n",
-    "df_deep = df_tmp[deep_cols]\n",
+    "df_deep = df_tmp[deep_cols].copy()\n",
    "deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}\n",
    "\n",
    "# The continous columns will be concatenated with the embeddings, so you\n",
@@ -435,13 +407,11 @@
    "from sklearn.preprocessing import StandardScaler\n",
    "scaler = StandardScaler()\n",
    "for cc in continuous_cols:\n",
-    "    df_deep[cc]  = scaler.fit_transform(df_deep[cc].values.reshape(-1,1))\n",
-    "\n",
-    "df_wide = df_tmp[wide_cols+crossed_columns]\n",
+    "    df_deep[cc] = scaler.fit_transform(df_deep[cc].values.reshape(-1,1).astype(float))\n",
+    "df_wide = df_tmp[wide_cols+crossed_columns]#.copy()\n",
    "del(df_tmp)\n",
-    "\n",
    "dummy_cols = [c for c in wide_cols+crossed_columns if c in categorical_columns]\n",
-    "df_wide = pd.get_dummies(df_wide, columns=dummy_cols)"
+    "df_wide = pd.get_dummies(df_wide, columns=dummy_cols)#.copy()\n"
   ]
  },
  {
@@ -455,10 +425,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": 9,
+   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
@@ -482,10 +450,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": 10,
+   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
@@ -494,7 +460,7 @@
      "train_dataset(wide=array([[46, 50,  0, ...,  0,  0,  0],\n",
      "       [32, 45,  1, ...,  0,  0,  0],\n",
      "       [30, 30,  0, ...,  0,  0,  0],\n",
-      "       ..., \n",
+      "       ...,\n",
      "       [40, 40,  0, ...,  0,  0,  0],\n",
      "       [45, 37,  1, ...,  0,  0,  0],\n",
      "       [40, 45,  1, ...,  0,  0,  0]]), deep=array([[ 3.        ,  1.        ,  6.        , ...,  0.        ,\n",
@@ -503,7 +469,7 @@
      "        -0.48456647,  0.36942139],\n",
      "       [ 1.        ,  4.        ,  2.        , ...,  0.        ,\n",
      "        -0.63044146, -0.84110367],\n",
-      "       ..., \n",
+      "       ...,\n",
      "       [ 1.        ,  0.        ,  2.        , ...,  0.        ,\n",
      "         0.09893348, -0.03408696],\n",
      "       [ 0.        ,  1.        ,  2.        , ...,  0.        ,\n",
@@ -519,16 +485,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": 11,
+   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[('workclass', 9, 10), ('education', 16, 10), ('native_country', 42, 12), ('relationship', 6, 8), ('occupation', 15, 10)]\n"
+      "[('education', 16, 10), ('relationship', 6, 8), ('native_country', 42, 12), ('workclass', 9, 10), ('occupation', 15, 10)]\n"
     ]
    }
   ],
@@ -538,16 +502,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": 12,
+   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "{'hours_per_week': 6, 'native_country': 4, 'relationship': 1, 'age': 5, 'workclass': 2, 'education': 0, 'occupation': 3}\n"
+      "{'education': 0, 'relationship': 1, 'workclass': 2, 'occupation': 3, 'native_country': 4, 'age': 5, 'hours_per_week': 6}\n"
     ]
    }
   ],
@@ -557,105 +519,103 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": 13,
+   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "{'education': {'10th': 12,\n",
+       "{'education': {'Bachelors': 0,\n",
+       "  'HS-grad': 1,\n",
       "  '11th': 2,\n",
-       "  '12th': 15,\n",
-       "  '1st-4th': 13,\n",
-       "  '5th-6th': 11,\n",
-       "  '7th-8th': 8,\n",
+       "  'Masters': 3,\n",
       "  '9th': 4,\n",
+       "  'Some-college': 5,\n",
       "  'Assoc-acdm': 6,\n",
       "  'Assoc-voc': 7,\n",
-       "  'Bachelors': 0,\n",
+       "  '7th-8th': 8,\n",
       "  'Doctorate': 9,\n",
-       "  'HS-grad': 1,\n",
-       "  'Masters': 3,\n",
-       "  'Preschool': 14,\n",
       "  'Prof-school': 10,\n",
-       "  'Some-college': 5},\n",
-       " 'native_country': {'?': 4,\n",
-       "  'Cambodia': 17,\n",
+       "  '5th-6th': 11,\n",
+       "  '10th': 12,\n",
+       "  '1st-4th': 13,\n",
+       "  'Preschool': 14,\n",
+       "  '12th': 15},\n",
+       " 'relationship': {'Not-in-family': 0,\n",
+       "  'Husband': 1,\n",
+       "  'Wife': 2,\n",
+       "  'Own-child': 3,\n",
+       "  'Unmarried': 4,\n",
+       "  'Other-relative': 5},\n",
+       " 'native_country': {'United-States': 0,\n",
+       "  'Cuba': 1,\n",
+       "  'Jamaica': 2,\n",
+       "  'India': 3,\n",
+       "  '?': 4,\n",
+       "  'Mexico': 5,\n",
+       "  'South': 6,\n",
+       "  'Puerto-Rico': 7,\n",
+       "  'Honduras': 8,\n",
+       "  'England': 9,\n",
       "  'Canada': 10,\n",
-       "  'China': 28,\n",
+       "  'Germany': 11,\n",
+       "  'Iran': 12,\n",
+       "  'Philippines': 13,\n",
+       "  'Italy': 14,\n",
+       "  'Poland': 15,\n",
       "  'Columbia': 16,\n",
-       "  'Cuba': 1,\n",
-       "  'Dominican-Republic': 24,\n",
+       "  'Cambodia': 17,\n",
+       "  'Thailand': 18,\n",
       "  'Ecuador': 19,\n",
+       "  'Laos': 20,\n",
+       "  'Taiwan': 21,\n",
+       "  'Haiti': 22,\n",
+       "  'Portugal': 23,\n",
+       "  'Dominican-Republic': 24,\n",
       "  'El-Salvador': 25,\n",
-       "  'England': 9,\n",
       "  'France': 26,\n",
-       "  'Germany': 11,\n",
-       "  'Greece': 35,\n",
       "  'Guatemala': 27,\n",
-       "  'Haiti': 22,\n",
-       "  'Holand-Netherlands': 41,\n",
-       "  'Honduras': 8,\n",
-       "  'Hong': 38,\n",
-       "  'Hungary': 40,\n",
-       "  'India': 3,\n",
-       "  'Iran': 12,\n",
-       "  'Ireland': 39,\n",
-       "  'Italy': 14,\n",
-       "  'Jamaica': 2,\n",
+       "  'China': 28,\n",
       "  'Japan': 29,\n",
-       "  'Laos': 20,\n",
-       "  'Mexico': 5,\n",
-       "  'Nicaragua': 36,\n",
-       "  'Outlying-US(Guam-USVI-etc)': 32,\n",
+       "  'Yugoslavia': 30,\n",
       "  'Peru': 31,\n",
-       "  'Philippines': 13,\n",
-       "  'Poland': 15,\n",
-       "  'Portugal': 23,\n",
-       "  'Puerto-Rico': 7,\n",
+       "  'Outlying-US(Guam-USVI-etc)': 32,\n",
       "  'Scotland': 33,\n",
-       "  'South': 6,\n",
-       "  'Taiwan': 21,\n",
-       "  'Thailand': 18,\n",
       "  'Trinadad&Tobago': 34,\n",
-       "  'United-States': 0,\n",
+       "  'Greece': 35,\n",
+       "  'Nicaragua': 36,\n",
       "  'Vietnam': 37,\n",
-       "  'Yugoslavia': 30},\n",
-       " 'occupation': {'?': 11,\n",
-       "  'Adm-clerical': 0,\n",
-       "  'Armed-Forces': 13,\n",
-       "  'Craft-repair': 6,\n",
+       "  'Hong': 38,\n",
+       "  'Ireland': 39,\n",
+       "  'Hungary': 40,\n",
+       "  'Holand-Netherlands': 41},\n",
+       " 'workclass': {'State-gov': 0,\n",
+       "  'Self-emp-not-inc': 1,\n",
+       "  'Private': 2,\n",
+       "  'Federal-gov': 3,\n",
+       "  'Local-gov': 4,\n",
+       "  '?': 5,\n",
+       "  'Self-emp-inc': 6,\n",
+       "  'Without-pay': 7,\n",
+       "  'Never-worked': 8},\n",
+       " 'occupation': {'Adm-clerical': 0,\n",
       "  'Exec-managerial': 1,\n",
-       "  'Farming-fishing': 8,\n",
       "  'Handlers-cleaners': 2,\n",
-       "  'Machine-op-inspct': 9,\n",
-       "  'Other-service': 4,\n",
-       "  'Priv-house-serv': 14,\n",
       "  'Prof-specialty': 3,\n",
-       "  'Protective-serv': 12,\n",
+       "  'Other-service': 4,\n",
       "  'Sales': 5,\n",
+       "  'Craft-repair': 6,\n",
+       "  'Transport-moving': 7,\n",
+       "  'Farming-fishing': 8,\n",
+       "  'Machine-op-inspct': 9,\n",
       "  'Tech-support': 10,\n",
-       "  'Transport-moving': 7},\n",
-       " 'relationship': {'Husband': 1,\n",
-       "  'Not-in-family': 0,\n",
-       "  'Other-relative': 5,\n",
-       "  'Own-child': 3,\n",
-       "  'Unmarried': 4,\n",
-       "  'Wife': 2},\n",
-       " 'workclass': {'?': 5,\n",
-       "  'Federal-gov': 3,\n",
-       "  'Local-gov': 4,\n",
-       "  'Never-worked': 8,\n",
-       "  'Private': 2,\n",
-       "  'Self-emp-inc': 6,\n",
-       "  'Self-emp-not-inc': 1,\n",
-       "  'State-gov': 0,\n",
-       "  'Without-pay': 7}}"
+       "  '?': 11,\n",
+       "  'Protective-serv': 12,\n",
+       "  'Armed-Forces': 13,\n",
+       "  'Priv-house-serv': 14}}"
      ]
     },
-     "execution_count": 14,
+     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -673,10 +633,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 14,
+   "metadata": {},
   "outputs": [],
   "source": [
    "from wide_deep.data_utils import prepare_data"
@@ -692,23 +650,23 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
   "language": "python",
-   "name": "python2"
+   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
-    "version": 2
+    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.13"
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }
--- a/demo2_building_blocks.ipynb
+++ b/demo2_building_blocks.ipynb
--- a/demo3_using_it.ipynb
+++ b/demo3_using_it.ipynb
--- a/wide_deep/data_utils.py
+++ b/wide_deep/data_utils.py
@@ -34,10 +34,10 @@ def label_encode(df, cols=None):
        val_types[c] = df[c].unique()

    val_to_idx = dict()
-    for k, v in val_types.iteritems():
+    for k, v in val_types.items():
        val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}

-    for k, v in val_to_idx.iteritems():
+    for k, v in val_to_idx.items():
        df[k] = df[k].apply(lambda x: v[x])

    return val_to_idx, df
@@ -106,12 +106,12 @@ def prepare_data(df, wide_cols, crossed_cols, embeddings_cols, continuous_cols,
    encoding_dict,df_tmp = label_encode(df_tmp)
    encoding_dict = {k:encoding_dict[k] for k in encoding_dict if k in deep_cols}
    embeddings_input = []
-    for k,v in encoding_dict.iteritems():
+    for k,v in encoding_dict.items():
        embeddings_input.append((k, len(v), emb_dim[k]))

    # select the deep_cols and get the column index that will be use later
    # to slice the tensors
-    df_deep = df_tmp[deep_cols]
+    df_deep = df_tmp[deep_cols].copy()
    deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}

    # The continous columns will be concatenated with the embeddings, so you
@@ -119,7 +119,7 @@ def prepare_data(df, wide_cols, crossed_cols, embeddings_cols, continuous_cols,
    if scale:
        scaler = StandardScaler()
        for cc in continuous_cols:
-            df_deep[cc]  = scaler.fit_transform(df_deep[cc].values.reshape(-1,1))
+            df_deep[cc]  = scaler.fit_transform(df_deep[cc].values.reshape(-1,1).astype(float))

    # select the wide_cols and one-hot encode those that are categorical
    df_wide = df_tmp[wide_cols+crossed_columns]

--- a/wide_deep/torch_model.py
+++ b/wide_deep/torch_model.py
@@ -8,7 +8,6 @@ import torch.optim as optim
 from torch.autograd import Variable
 from torch.utils.data import Dataset, DataLoader

-
 use_cuda = torch.cuda.is_available()


@@ -107,7 +106,7 @@ class WideDeep(nn.Module):
        if method == 'regression':
            self.activation, self.criterion = None, F.mse_loss
        if method == 'logistic':
-            self.activation, self.criterion = F.sigmoid, F.binary_cross_entropy
+            self.activation, self.criterion = torch.sigmoid, F.binary_cross_entropy
        if method == 'multiclass':
            self.activation, self.criterion = F.softmax, F.cross_entropy

@@ -153,11 +152,13 @@ class WideDeep(nn.Module):

        # Deep + Wide sides
        wide_deep_input = torch.cat([x_deep, X_w.float()], 1)
-
        if not self.activation:
            out = self.output(wide_deep_input)
        else:
-            out = self.activation(self.output(wide_deep_input))
+            if (self.activation==F.softmax):
+                out = self.activation(self.output(wide_deep_input), dim=1)
+            else:
+                out = self.activation(self.output(wide_deep_input))

        return out

@@ -191,8 +192,12 @@ class WideDeep(nn.Module):
                    X_w, X_d, y = X_w.cuda(), X_d.cuda(), y.cuda()

                self.optimizer.zero_grad()
-                y_pred =  net(X_w, X_d)
-                loss = self.criterion(y_pred, y)
+                y_pred =  net(X_w, X_d) # [batch_size, 1]
+                loss = None
+                if(self.criterion == F.cross_entropy):
+                    loss = self.criterion(y_pred, y) #[batch_size, 1]
+                else:
+                    loss = self.criterion(y_pred, y.view(-1, 1)) #[batch_size, 1]
                loss.backward()
                self.optimizer.step()

@@ -202,14 +207,14 @@ class WideDeep(nn.Module):
                        y_pred_cat = (y_pred > 0.5).squeeze(1).float()
                    if self.method == "multiclass":
                        _, y_pred_cat = torch.max(y_pred, 1)
-                    correct+= float((y_pred_cat == y).sum().data[0])
+                    correct+= float((y_pred_cat == y).sum().item())

            if self.method != "regression":
                print ('Epoch {} of {}, Loss: {}, accuracy: {}'.format(epoch+1,
-                    n_epochs, round(loss.data[0],3), round(correct/total,4)))
+                    n_epochs, round(loss.item(),3), round(correct/total,4)))
            else:
                print ('Epoch {} of {}, Loss: {}'.format(epoch+1, n_epochs,
-                    round(loss.data[0],3)))
+                    round(loss.item(),3)))


    def predict(self, dataset):
@@ -293,9 +298,9 @@ class WideDeep(nn.Module):
        emb_layer  = [layer for layer in emb_layers if col_name in layer[0]][0]
        embeddings = emb_layer[1].cpu().data.numpy()
        col_label_encoding = self.encoding_dict[col_name]
-        inv_dict = {v:k for k,v in col_label_encoding.iteritems()}
+        inv_dict = {v:k for k,v in col_label_encoding.items()}
        embeddings_dict = {}
-        for idx,value in inv_dict.iteritems():
+        for idx,value in inv_dict.items():
            embeddings_dict[value] = embeddings[idx]

        return embeddings_dict