提交 835e55d4 编写于 作者: J Javier

fixing bugs

上级 39964f87
......@@ -13,7 +13,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {
"collapsed": false
},
......@@ -187,7 +187,7 @@
"4 0 "
]
},
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
......@@ -220,7 +220,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {
"collapsed": true
},
......@@ -247,7 +247,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {
"collapsed": true
},
......@@ -276,7 +276,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {
"collapsed": false
},
......@@ -306,7 +306,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 5,
"metadata": {
"collapsed": false
},
......@@ -322,7 +322,7 @@
"Name: education_occupation, dtype: object"
]
},
"execution_count": 7,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
......@@ -349,7 +349,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 6,
"metadata": {
"collapsed": true
},
......@@ -405,17 +405,38 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 7,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:444: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.\n",
" warnings.warn(msg, DataConversionWarning)\n",
"/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
]
}
],
"source": [
"# select the deep_cols and get the column index that will be used later\n",
"# to slice the tensors\n",
"df_deep = df_tmp[deep_cols]\n",
"deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}\n",
"\n",
"# The continuous columns will be concatenated with the embeddings, so you\n",
"# might want to normalize them first\n",
"from sklearn.preprocessing import StandardScaler\n",
"scaler = StandardScaler()\n",
"for cc in continuous_cols:\n",
" df_deep[cc] = scaler.fit_transform(df_deep[cc].values.reshape(-1,1))\n",
"\n",
"df_wide = df_tmp[wide_cols+crossed_columns]\n",
"del(df_tmp)\n",
"\n",
......@@ -476,13 +497,19 @@
" ..., \n",
" [40, 40, 0, ..., 0, 0, 0],\n",
" [45, 37, 1, ..., 0, 0, 0],\n",
" [40, 45, 1, ..., 0, 0, 0]]), deep=array([[ 3, 1, 6, ..., 0, 46, 50],\n",
" [ 0, 0, 2, ..., 0, 32, 45],\n",
" [ 1, 4, 2, ..., 0, 30, 30],\n",
" [40, 45, 1, ..., 0, 0, 0]]), deep=array([[ 3. , 1. , 6. , ..., 0. ,\n",
" 0.53655844, 0.77292975],\n",
" [ 0. , 0. , 2. , ..., 0. ,\n",
" -0.48456647, 0.36942139],\n",
" [ 1. , 4. , 2. , ..., 0. ,\n",
" -0.63044146, -0.84110367],\n",
" ..., \n",
" [ 1, 0, 2, ..., 0, 40, 40],\n",
" [ 0, 1, 2, ..., 0, 45, 37],\n",
" [ 0, 1, 2, ..., 0, 40, 45]]), labels=array([1, 0, 0, ..., 0, 0, 0]))\n"
" [ 1. , 0. , 2. , ..., 0. ,\n",
" 0.09893348, -0.03408696],\n",
" [ 0. , 1. , 2. , ..., 0. ,\n",
" 0.46362095, -0.27619198],\n",
" [ 0. , 1. , 2. , ..., 0. ,\n",
" 0.09893348, 0.36942139]]), labels=array([1, 0, 0, ..., 0, 0, 0]))\n"
]
}
],
......@@ -501,7 +528,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[('workclass', 9, 10), ('education', 16, 10), ('native_country', 42, 10), ('relationship', 6, 8), ('occupation', 15, 10)]\n"
"[('workclass', 9, 10), ('education', 16, 10), ('native_country', 42, 12), ('relationship', 6, 8), ('occupation', 15, 10)]\n"
]
}
],
......
......@@ -20,9 +20,9 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 26,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
......@@ -43,7 +43,7 @@
"target = 'income_label'\n",
"method = 'logistic'\n",
"\n",
"wd_dataset = prepare_data(DF, wide_cols,crossed_cols,embeddings_cols,continuous_cols,target)"
"wd_dataset = prepare_data(DF, wide_cols,crossed_cols,embeddings_cols,continuous_cols,target,scale=True)"
]
},
{
......@@ -55,7 +55,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 27,
"metadata": {
"collapsed": false
},
......@@ -69,16 +69,22 @@
" ..., \n",
" [40, 40, 0, ..., 0, 0, 0],\n",
" [45, 37, 1, ..., 0, 0, 0],\n",
" [40, 45, 1, ..., 0, 0, 0]]), deep=array([[ 3, 1, 6, ..., 0, 46, 50],\n",
" [ 0, 0, 2, ..., 0, 32, 45],\n",
" [ 1, 4, 2, ..., 0, 30, 30],\n",
" [40, 45, 1, ..., 0, 0, 0]]), deep=array([[ 3. , 1. , 6. , ..., 0. ,\n",
" 0.53655844, 0.77292975],\n",
" [ 0. , 0. , 2. , ..., 0. ,\n",
" -0.48456647, 0.36942139],\n",
" [ 1. , 4. , 2. , ..., 0. ,\n",
" -0.63044146, -0.84110367],\n",
" ..., \n",
" [ 1, 0, 2, ..., 0, 40, 40],\n",
" [ 0, 1, 2, ..., 0, 45, 37],\n",
" [ 0, 1, 2, ..., 0, 40, 45]]), labels=array([1, 0, 0, ..., 0, 0, 0]))"
" [ 1. , 0. , 2. , ..., 0. ,\n",
" 0.09893348, -0.03408696],\n",
" [ 0. , 1. , 2. , ..., 0. ,\n",
" 0.46362095, -0.27619198],\n",
" [ 0. , 1. , 2. , ..., 0. ,\n",
" 0.09893348, 0.36942139]]), labels=array([1, 0, 0, ..., 0, 0, 0]))"
]
},
"execution_count": 2,
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
......@@ -98,7 +104,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 28,
"metadata": {
"collapsed": false
},
......@@ -110,7 +116,7 @@
"<IPython.core.display.Image object>"
]
},
"execution_count": 7,
"execution_count": 28,
"metadata": {
"image/png": {
"height": 500,
......@@ -154,7 +160,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 29,
"metadata": {
"collapsed": false
},
......@@ -187,7 +193,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 30,
"metadata": {
"collapsed": true
},
......@@ -228,7 +234,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 31,
"metadata": {
"collapsed": false
},
......@@ -241,7 +247,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 32,
"metadata": {
"collapsed": false
},
......@@ -271,7 +277,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 33,
"metadata": {
"collapsed": false
},
......@@ -282,7 +288,7 @@
"(34189, 1)"
]
},
"execution_count": 19,
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
......@@ -293,7 +299,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 34,
"metadata": {
"collapsed": false
},
......@@ -310,7 +316,7 @@
" [ 0, 40, 45, ..., 0, 0, 0]])"
]
},
"execution_count": 20,
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
......@@ -331,35 +337,25 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python2.7/site-packages/torch/nn/functional.py:767: UserWarning: Using a target size (torch.Size([64])) that is different to the input size (torch.Size([64, 1])) is deprecated. Please ensure they have the same size.\n",
" \"Please ensure they have the same size.\".format(target.size(), input.size()))\n",
"/usr/local/lib/python2.7/site-packages/torch/nn/functional.py:767: UserWarning: Using a target size (torch.Size([13])) that is different to the input size (torch.Size([13, 1])) is deprecated. Please ensure they have the same size.\n",
" \"Please ensure they have the same size.\".format(target.size(), input.size()))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1 of 10, Loss: 0.312, accuracy: 0.7794\n",
"Epoch 2 of 10, Loss: 0.405, accuracy: 0.8197\n",
"Epoch 3 of 10, Loss: 0.507, accuracy: 0.8274\n",
"Epoch 4 of 10, Loss: 0.267, accuracy: 0.8307\n",
"Epoch 5 of 10, Loss: 0.458, accuracy: 0.8328\n",
"Epoch 6 of 10, Loss: 0.402, accuracy: 0.8327\n",
"Epoch 7 of 10, Loss: 0.401, accuracy: 0.835\n",
"Epoch 8 of 10, Loss: 0.614, accuracy: 0.8358\n",
"Epoch 9 of 10, Loss: 0.425, accuracy: 0.8363\n",
"Epoch 10 of 10, Loss: 0.137, accuracy: 0.8364\n"
"Epoch 1 of 10, Loss: 0.378, accuracy: 0.7799\n",
"Epoch 2 of 10, Loss: 0.622, accuracy: 0.819\n",
"Epoch 3 of 10, Loss: 0.483, accuracy: 0.8264\n",
"Epoch 4 of 10, Loss: 0.46, accuracy: 0.83\n",
"Epoch 5 of 10, Loss: 0.378, accuracy: 0.8329\n",
"Epoch 6 of 10, Loss: 0.126, accuracy: 0.8337\n",
"Epoch 7 of 10, Loss: 0.318, accuracy: 0.8354\n",
"Epoch 8 of 10, Loss: 0.311, accuracy: 0.8346\n",
"Epoch 9 of 10, Loss: 0.262, accuracy: 0.8354\n",
"Epoch 10 of 10, Loss: 0.38, accuracy: 0.8361\n"
]
}
],
......@@ -408,7 +404,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 36,
"metadata": {
"collapsed": false
},
......@@ -438,7 +434,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 37,
"metadata": {
"collapsed": false
},
......@@ -466,7 +462,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 38,
"metadata": {
"collapsed": false
},
......@@ -511,7 +507,7 @@
"\n",
" def forward(self, X):\n",
"\n",
" emb = [getattr(self, 'emb_layer_'+col)(X[:,self.deep_column_idx[col]])\n",
" emb = [getattr(self, 'emb_layer_'+col)(X[:,self.deep_column_idx[col]].long())\n",
" for col,_,_ in self.embeddings_input]\n",
"\n",
" cont_idx = [self.deep_column_idx[col] for col in self.continuous_cols]\n",
......@@ -537,7 +533,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 39,
"metadata": {
"collapsed": false
},
......@@ -551,7 +547,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 40,
"metadata": {
"collapsed": false
},
......@@ -593,7 +589,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 41,
"metadata": {
"collapsed": false
},
......@@ -601,16 +597,22 @@
{
"data": {
"text/plain": [
"array([[ 1, 3, 1, ..., 0, 46, 50],\n",
" [ 0, 0, 0, ..., 0, 32, 45],\n",
" [ 0, 1, 4, ..., 0, 30, 30],\n",
"array([[ 1. , 3. , 1. , ..., 0. ,\n",
" 0.53655844, 0.77292975],\n",
" [ 0. , 0. , 0. , ..., 0. ,\n",
" -0.48456647, 0.36942139],\n",
" [ 0. , 1. , 4. , ..., 0. ,\n",
" -0.63044146, -0.84110367],\n",
" ..., \n",
" [ 0, 1, 0, ..., 0, 40, 40],\n",
" [ 0, 0, 1, ..., 0, 45, 37],\n",
" [ 0, 0, 1, ..., 0, 40, 45]])"
" [ 0. , 1. , 0. , ..., 0. ,\n",
" 0.09893348, -0.03408696],\n",
" [ 0. , 0. , 1. , ..., 0. ,\n",
" 0.46362095, -0.27619198],\n",
" [ 0. , 0. , 1. , ..., 0. ,\n",
" 0.09893348, 0.36942139]])"
]
},
"execution_count": 30,
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
......@@ -622,7 +624,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 42,
"metadata": {
"collapsed": false
},
......@@ -631,16 +633,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1 of 10, Loss: 0.513, accuracy: 0.8063\n",
"Epoch 2 of 10, Loss: 0.274, accuracy: 0.8308\n",
"Epoch 3 of 10, Loss: 0.202, accuracy: 0.8359\n",
"Epoch 4 of 10, Loss: 0.367, accuracy: 0.8378\n",
"Epoch 5 of 10, Loss: 0.425, accuracy: 0.8386\n",
"Epoch 6 of 10, Loss: 0.321, accuracy: 0.839\n",
"Epoch 7 of 10, Loss: 0.398, accuracy: 0.8401\n",
"Epoch 8 of 10, Loss: 0.587, accuracy: 0.8396\n",
"Epoch 9 of 10, Loss: 0.59, accuracy: 0.8419\n",
"Epoch 10 of 10, Loss: 0.378, accuracy: 0.8421\n"
"Epoch 1 of 10, Loss: 0.314, accuracy: 0.8225\n",
"Epoch 2 of 10, Loss: 0.444, accuracy: 0.8373\n",
"Epoch 3 of 10, Loss: 0.448, accuracy: 0.8417\n",
"Epoch 4 of 10, Loss: 0.264, accuracy: 0.8418\n",
"Epoch 5 of 10, Loss: 0.146, accuracy: 0.8443\n",
"Epoch 6 of 10, Loss: 0.4, accuracy: 0.8449\n",
"Epoch 7 of 10, Loss: 0.52, accuracy: 0.8459\n",
"Epoch 8 of 10, Loss: 0.428, accuracy: 0.8472\n",
"Epoch 9 of 10, Loss: 0.565, accuracy: 0.8453\n",
"Epoch 10 of 10, Loss: 0.27, accuracy: 0.8474\n"
]
}
],
......@@ -674,25 +676,6 @@
" n_epochs, round(loss.data[0],3), round(correct/total,4)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**IMPORTANT NOTE**: embedding layers require `LongTensors` as inputs (so you can do the embedding look-up). In this case, we combine the embedding inputs (workclass, education, etc...) with `age` and `hours per week`. These are integers and can be passed as `LongTensors` and transformed later to `float` (so they can be concatenated with the dense embeddings). If you wanted to combine float features with the embedding inputs, there are 2 simple changes to the code you would have to make: \n",
"\n",
"1. Adapt the `prepare_data` function so that it returns a dictionary where the datasets include `(X_wide, X_embedding, X_continuous, y_train)` as opposed to `(X_wide, X_embedding, y_train)` as they do now.\n",
"\n",
"2. The `forward` method should receive two inputs and be adapted like this:\n",
"\n",
" def forward(self, X_emb, X_cont):\n",
"\n",
" emb = [getattr(self, 'emb_layer_'+col)(X_emb[:,self.deep_column_idx[col]])\n",
" for col,_,_ in self.embeddings_input]\n",
" deep_inp = torch.cat(emb+[X_cont], 1)\n",
"\n",
"*Alternatively, a workaround would be to round the `float` features in your data frame, cast them as `int`, and proceed normally.*"
]
},
{
"cell_type": "markdown",
"metadata": {},
......@@ -715,7 +698,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 43,
"metadata": {
"collapsed": true
},
......@@ -745,7 +728,7 @@
"\n",
" def forward(self, X_w, X_d):\n",
"\n",
" emb = [getattr(self, 'emb_layer_'+col)(X_d[:,self.deep_column_idx[col]])\n",
" emb = [getattr(self, 'emb_layer_'+col)(X_d[:,self.deep_column_idx[col]].long())\n",
" for col,_,_ in self.embeddings_input]\n",
"\n",
" cont_idx = [self.deep_column_idx[col] for col in self.continuous_cols]\n",
......@@ -757,7 +740,7 @@
" for i in range(1,len(self.hidden_layers)):\n",
" x_deep = F.relu( getattr(self, 'linear_'+str(i+1))(x_deep) )\n",
"\n",
" wide_deep_input = torch.cat([x_deep, X_w], 1)\n",
" wide_deep_input = torch.cat([x_deep, X_w.float()], 1)\n",
"\n",
" out = F.sigmoid(self.output(wide_deep_input))\n",
"\n",
......@@ -773,7 +756,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 44,
"metadata": {
"collapsed": false
},
......@@ -784,7 +767,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 45,
"metadata": {
"collapsed": false
},
......@@ -804,7 +787,7 @@
")"
]
},
"execution_count": 34,
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
......@@ -824,7 +807,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 46,
"metadata": {
"collapsed": false
},
......@@ -871,7 +854,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 47,
"metadata": {
"collapsed": false
},
......@@ -880,16 +863,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1 of 10, Loss: 0.26, accuracy: 0.8148\n",
"Epoch 2 of 10, Loss: 0.483, accuracy: 0.8337\n",
"Epoch 3 of 10, Loss: 0.205, accuracy: 0.8377\n",
"Epoch 4 of 10, Loss: 0.412, accuracy: 0.8391\n",
"Epoch 5 of 10, Loss: 0.334, accuracy: 0.84\n",
"Epoch 6 of 10, Loss: 0.302, accuracy: 0.8419\n",
"Epoch 7 of 10, Loss: 0.255, accuracy: 0.8411\n",
"Epoch 8 of 10, Loss: 0.483, accuracy: 0.8419\n",
"Epoch 9 of 10, Loss: 0.115, accuracy: 0.8425\n",
"Epoch 10 of 10, Loss: 0.147, accuracy: 0.8439\n"
"Epoch 1 of 10, Loss: 0.502, accuracy: 0.824\n",
"Epoch 2 of 10, Loss: 0.472, accuracy: 0.8377\n",
"Epoch 3 of 10, Loss: 0.463, accuracy: 0.8405\n",
"Epoch 4 of 10, Loss: 0.096, accuracy: 0.8418\n",
"Epoch 5 of 10, Loss: 0.454, accuracy: 0.8426\n",
"Epoch 6 of 10, Loss: 0.235, accuracy: 0.8441\n",
"Epoch 7 of 10, Loss: 0.302, accuracy: 0.8452\n",
"Epoch 8 of 10, Loss: 0.342, accuracy: 0.845\n",
"Epoch 9 of 10, Loss: 0.405, accuracy: 0.8453\n",
"Epoch 10 of 10, Loss: 0.23, accuracy: 0.8477\n"
]
}
],
......@@ -903,7 +886,7 @@
" correct=0\n",
" for i, (X_wide, X_deep, target) in enumerate(train_loader):\n",
" X_d = Variable(X_deep)\n",
" X_w = Variable(X_wide).float()\n",
" X_w = Variable(X_wide)\n",
" y = Variable(target).float()\n",
"\n",
" optimizer.zero_grad()\n",
......
......@@ -6,7 +6,7 @@
"source": [
"# How to use the model\n",
"\n",
"To understand the model it would be convenient if you have gone through demo1 and 2, however can learn how to use the model simply reading this notebook. \n",
"To understand the model it would be convenient if you have gone through demo1 and 2, however you can learn how to use the model simply reading this notebook. \n",
"\n",
"I will use 3 examples to illustrate the different set-ups that can be used with this pytorch implementation of wide and deep."
]
......@@ -22,7 +22,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 8,
"metadata": {
"collapsed": false
},
......@@ -196,7 +196,7 @@
"4 0 "
]
},
"execution_count": 37,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
......@@ -215,7 +215,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Logistic regression with varying embedding dimensions and no dropout"
"## 1. Logistic regression with varying embedding dimensions, no dropout and Adam optimizer."
]
},
{
......@@ -227,7 +227,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 9,
"metadata": {
"collapsed": true
},
......@@ -256,16 +256,16 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 10,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"from wide_deep.data_utils import prepare_data\n",
"\n",
"# just call prepare_data\n",
"wd_dataset = prepare_data(DF, wide_cols,crossed_cols,embeddings_cols,continuous_cols,target)"
"wd_dataset = prepare_data(DF, wide_cols,crossed_cols,embeddings_cols,continuous_cols,target,scale=True)"
]
},
{
......@@ -277,7 +277,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 11,
"metadata": {
"collapsed": false
},
......@@ -309,7 +309,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 12,
"metadata": {
"collapsed": false
},
......@@ -344,7 +344,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 13,
"metadata": {
"collapsed": false
},
......@@ -363,17 +363,17 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1 of 10, Loss: 0.425, accuracy: 0.8086\n",
"Epoch 2 of 10, Loss: 0.173, accuracy: 0.8361\n",
"Epoch 3 of 10, Loss: 0.251, accuracy: 0.8384\n",
"Epoch 4 of 10, Loss: 0.237, accuracy: 0.8405\n",
"Epoch 5 of 10, Loss: 0.112, accuracy: 0.8404\n",
"Epoch 6 of 10, Loss: 0.188, accuracy: 0.8413\n",
"Epoch 7 of 10, Loss: 0.074, accuracy: 0.8423\n",
"Epoch 8 of 10, Loss: 0.17, accuracy: 0.8432\n",
"Epoch 9 of 10, Loss: 0.228, accuracy: 0.8428\n",
"Epoch 10 of 10, Loss: 0.376, accuracy: 0.8439\n",
"0.837029959735\n"
"Epoch 1 of 10, Loss: 0.215, accuracy: 0.8175\n",
"Epoch 2 of 10, Loss: 0.356, accuracy: 0.8396\n",
"Epoch 3 of 10, Loss: 0.229, accuracy: 0.842\n",
"Epoch 4 of 10, Loss: 0.531, accuracy: 0.8425\n",
"Epoch 5 of 10, Loss: 0.197, accuracy: 0.8438\n",
"Epoch 6 of 10, Loss: 0.134, accuracy: 0.844\n",
"Epoch 7 of 10, Loss: 0.454, accuracy: 0.8463\n",
"Epoch 8 of 10, Loss: 0.156, accuracy: 0.8464\n",
"Epoch 9 of 10, Loss: 0.217, accuracy: 0.8452\n",
"Epoch 10 of 10, Loss: 0.445, accuracy: 0.8472\n",
"0.838258377124\n"
]
}
],
......@@ -398,7 +398,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 14,
"metadata": {
"collapsed": false
},
......@@ -406,41 +406,41 @@
{
"data": {
"text/plain": [
"{'10th': array([ 1.0558697 , -0.10497121, 1.2519902 , -1.20969331, -0.37003803,\n",
" 0.26222366, 1.39537013, 0.66922128, -1.14872277, -1.66497922], dtype=float32),\n",
" '11th': array([ 0.76582593, -0.15720901, -0.79173702, 0.17092067, -1.01140571,\n",
" -0.15254961, 1.59629261, -1.03472006, -0.1246258 , 0.87272727], dtype=float32),\n",
" '12th': array([ 2.80748963, -0.40501541, -1.66380119, 1.119385 , 0.11228444,\n",
" 0.46560571, -0.2575815 , -0.78553766, 0.40721282, 2.17365384], dtype=float32),\n",
" '1st-4th': array([-0.59988064, -0.91489893, 0.77964532, 1.34235549, -2.21585774,\n",
" -1.20931304, 1.87390292, 0.40189996, -1.43448257, 0.0121912 ], dtype=float32),\n",
" '5th-6th': array([ 0.13168913, 0.50879979, 0.44774669, 0.75261694, -2.11371017,\n",
" -0.86445326, -0.59014183, -1.84488511, -0.8879115 , -0.68353879], dtype=float32),\n",
" '7th-8th': array([ 1.42483819, 0.34507382, -0.05195802, 1.38898981, 0.17512439,\n",
" -0.58219528, 0.94600356, -0.67991239, -1.80070949, -0.68990695], dtype=float32),\n",
" '9th': array([ 0.84937495, 0.18928385, -0.8980329 , 1.20929003, 0.22811069,\n",
" 0.35240394, 0.84941047, -0.69901848, 0.07588249, -0.27054811], dtype=float32),\n",
" 'Assoc-acdm': array([ 0.05810629, -1.25012755, -1.05227268, -0.00666486, 0.88830411,\n",
" 0.50737596, 0.67054886, 0.26397765, -0.09015059, 0.44837326], dtype=float32),\n",
" 'Assoc-voc': array([ 1.01582098, 0.40545571, 0.96072149, 0.17280895, -0.12402227,\n",
" 0.0368996 , 0.57116669, 1.57069802, -0.2876817 , 0.8799817 ], dtype=float32),\n",
" 'Bachelors': array([ 0.35576788, 0.18159543, 0.07858612, 1.12478256, 0.12776014,\n",
" 0.41710249, -1.09058726, 1.38790727, 0.34605154, -0.70506179], dtype=float32),\n",
" 'Doctorate': array([-2.7622664 , 1.56626964, 0.48016456, -0.16346474, 1.68042314,\n",
" 0.9269141 , -0.79821414, -1.53146839, 1.99243569, 0.9012208 ], dtype=float32),\n",
" 'HS-grad': array([ 0.06493477, -1.69434929, -0.108916 , -0.44833779, 1.59829664,\n",
" 0.19638543, 0.98757291, -0.75816447, -2.88351798, -0.03027572], dtype=float32),\n",
" 'Masters': array([-1.17790508, 2.30469227, 2.46537971, -0.02742275, 0.41417554,\n",
" 0.50062221, 0.6047889 , 0.65633202, 1.04308689, -0.82801151], dtype=float32),\n",
" 'Preschool': array([ 0.46218312, -1.3821547 , -1.45895326, 1.00207102, -2.59209466,\n",
" -2.07303119, 0.87239748, -0.24926367, -1.40725338, 0.19515684], dtype=float32),\n",
" 'Prof-school': array([-1.64041114, -0.00505874, 0.10719328, -0.18879749, 1.84211004,\n",
" -1.14806819, -1.57541323, 0.30641121, -0.08031298, 0.96996194], dtype=float32),\n",
" 'Some-college': array([-0.55294096, -0.76894253, 1.35381234, 0.36266825, -1.00209892,\n",
" 0.81457275, -0.71004349, -0.8900364 , 0.10882758, -1.56127894], dtype=float32)}"
"{'10th': array([-0.18979575, 1.4436841 , -0.50139612, -0.85227281, 1.36461151,\n",
" 0.3559041 , -0.58077377, 0.57836998, 0.09822965, 0.45356399], dtype=float32),\n",
" '11th': array([ 0.45051831, -1.17895794, -0.70969492, -0.41443011, -0.54592711,\n",
" 2.06732845, 0.97312623, -1.66578746, 0.15288909, -0.13219695], dtype=float32),\n",
" '12th': array([-0.55539042, 1.34430635, -0.14818592, -1.01501787, -1.85061646,\n",
" -1.42545903, 0.30155715, 1.02573991, -0.42215505, 1.02378154], dtype=float32),\n",
" '1st-4th': array([ 1.84887922, 1.20987594, 0.2984882 , -1.79686284, 0.59199595,\n",
" -0.09441201, -0.26749009, 0.20149775, -0.73544145, -0.51700133], dtype=float32),\n",
" '5th-6th': array([-0.05392418, 0.36236417, 0.47461176, 0.41363204, -0.2278301 ,\n",
" -0.5376063 , 2.63320708, 2.04696202, -0.49895033, -0.29155737], dtype=float32),\n",
" '7th-8th': array([ 0.12547047, 0.05075515, -1.44649279, -1.56195939, -1.32460868,\n",
" -0.34222227, 0.88958579, 0.47252822, -0.09495597, -0.02843619], dtype=float32),\n",
" '9th': array([-0.14014393, 1.28053474, 0.35706842, -1.89409554, -0.80370718,\n",
" -0.53732723, 0.39302668, 0.40100414, 0.96709979, -0.54595846], dtype=float32),\n",
" 'Assoc-acdm': array([ 2.23356485, 1.00816226, -2.27655983, 0.9915536 , -0.55686516,\n",
" 1.46899855, -1.43701446, -0.46746022, -0.05142261, 0.58451122], dtype=float32),\n",
" 'Assoc-voc': array([ 1.11377907, 0.41131377, -1.40442908, 0.05879473, -0.13471135,\n",
" -0.37147653, -0.39430454, -0.38298509, 0.09182382, -0.18972228], dtype=float32),\n",
" 'Bachelors': array([-2.05170012, -1.35262275, 1.57654059, 0.70553464, 0.80315828,\n",
" -2.61309099, 0.46207047, 0.25938991, 1.27118778, -2.02918983], dtype=float32),\n",
" 'Doctorate': array([-1.51427305, -0.57435513, 0.27152076, 3.68988252, -0.92434132,\n",
" 0.86953694, -0.62879491, -0.6649121 , 0.37984002, 0.42188498], dtype=float32),\n",
" 'HS-grad': array([ 0.5675661 , -0.16484176, -0.25774002, 0.12548433, 0.43557805,\n",
" 0.30828851, -0.78993368, 0.84360808, 0.93881094, 0.51104885], dtype=float32),\n",
" 'Masters': array([-0.83188468, -1.20755589, 1.29778767, -0.21757448, -0.9422332 ,\n",
" 0.28221262, -0.00386627, -0.95983386, -1.81947958, -0.21439691], dtype=float32),\n",
" 'Preschool': array([ 0.88657302, 0.83354014, 0.39644518, -1.62499583, 0.67740631,\n",
" 0.49476627, 2.9205153 , 1.38263381, 0.0637609 , -0.62043005], dtype=float32),\n",
" 'Prof-school': array([ 0.59643167, -2.98818922, 0.60524243, 0.73828989, 0.18138508,\n",
" -0.22839846, -1.02733207, -1.25480282, 1.0317198 , 0.27936444], dtype=float32),\n",
" 'Some-college': array([ 0.37755287, 0.64273852, 1.17291927, -0.04218093, 0.87608969,\n",
" 1.29980707, -0.35390925, -1.08699441, 0.38861114, -0.59682709], dtype=float32)}"
]
},
"execution_count": 29,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
......@@ -453,7 +453,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Multiclass classification with fixed embedding dimensions (10) and varying dropout"
"## 2. Multiclass classification with fixed embedding dimensions (10), varying dropout and RMSProp. "
]
},
{
......@@ -465,7 +465,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 15,
"metadata": {
"collapsed": false
},
......@@ -478,9 +478,9 @@
" (emb_layer_workclass): Embedding(9, 10)\n",
" (emb_layer_education): Embedding(16, 10)\n",
" (emb_layer_native_country): Embedding(42, 10)\n",
" (emb_layer_relationship): Embedding(6, 8)\n",
" (emb_layer_relationship): Embedding(6, 10)\n",
" (emb_layer_occupation): Embedding(15, 10)\n",
" (linear_1): Linear (49 -> 100)\n",
" (linear_1): Linear (51 -> 100)\n",
" (linear_1_drop): Dropout (p = 0.5)\n",
" (linear_2): Linear (100 -> 50)\n",
" (linear_2_drop): Dropout (p = 0.2)\n",
......@@ -498,13 +498,13 @@
"# Set the experiment\n",
"wide_cols = ['hours_per_week','education', 'relationship','workclass',\n",
" 'occupation','native_country','gender']\n",
"crossed_cols = (['education', 'occupation'], ['native_country', 'occupation'])\n",
"embeddings_cols = ['education', 'relationship','workclass','occupation','native_country']\n",
"crossed_cols = (['education', 'occupation'], ['native_country', 'occupation'])\n",
"embeddings_cols = ['education', 'relationship','workclass','occupation','native_country']\n",
"continuous_cols = [\"hours_per_week\"]\n",
"target = 'age_group'\n",
"method = 'multiclass'\n",
"\n",
"wd_dataset = prepare_data(DF,wide_cols,crossed_cols,embeddings_cols,continuous_cols,target,def_dim=10)\n",
"wd_dataset = prepare_data(DF,wide_cols,crossed_cols,embeddings_cols,continuous_cols,target,scale=True,def_dim=10)\n",
"\n",
"wide_dim = wd_dataset['train_dataset'].wide.shape[1]\n",
"n_class=3\n",
......@@ -515,7 +515,7 @@
"dropout = [0.5, 0.2]\n",
"\n",
"model = WideDeep(wide_dim,embeddings_input,continuous_cols,deep_column_idx,hidden_layers,dropout,encoding_dict,n_class)\n",
"model.compile(method=method)\n",
"model.compile(method=method, optimizer=\"RMSprop\")\n",
"\n",
"# Let's have a look to the model\n",
"print(model)"
......@@ -523,7 +523,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 16,
"metadata": {
"collapsed": false
},
......@@ -532,24 +532,24 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1 of 10, Loss: 0.964, accuracy: 0.6522\n",
"Epoch 2 of 10, Loss: 1.013, accuracy: 0.6829\n",
"Epoch 3 of 10, Loss: 0.992, accuracy: 0.6873\n",
"Epoch 4 of 10, Loss: 0.991, accuracy: 0.69\n",
"Epoch 5 of 10, Loss: 1.024, accuracy: 0.693\n",
"Epoch 6 of 10, Loss: 0.706, accuracy: 0.6933\n",
"Epoch 7 of 10, Loss: 0.833, accuracy: 0.6959\n",
"Epoch 8 of 10, Loss: 0.76, accuracy: 0.6958\n",
"Epoch 9 of 10, Loss: 0.783, accuracy: 0.6971\n",
"Epoch 10 of 10, Loss: 0.898, accuracy: 0.698\n",
"Epoch 1 of 10, Loss: 0.699, accuracy: 0.6737\n",
"Epoch 2 of 10, Loss: 0.822, accuracy: 0.6855\n",
"Epoch 3 of 10, Loss: 0.717, accuracy: 0.6879\n",
"Epoch 4 of 10, Loss: 1.016, accuracy: 0.6931\n",
"Epoch 5 of 10, Loss: 0.842, accuracy: 0.6944\n",
"Epoch 6 of 10, Loss: 0.805, accuracy: 0.6942\n",
"Epoch 7 of 10, Loss: 0.783, accuracy: 0.6966\n",
"Epoch 8 of 10, Loss: 0.859, accuracy: 0.6975\n",
"Epoch 9 of 10, Loss: 0.929, accuracy: 0.6992\n",
"Epoch 10 of 10, Loss: 0.826, accuracy: 0.7006\n",
"\n",
" [[ 9.97471273e-01 2.52866116e-03 4.56306566e-08]\n",
" [ 9.44395465e-11 1.00000000e+00 5.53709922e-09]\n",
" [ 1.76757031e-09 9.99999881e-01 9.40417166e-08]\n",
" [[ 9.99074221e-01 9.25758795e-04 3.93159311e-10]\n",
" [ 2.88534306e-13 1.00000000e+00 1.56172240e-15]\n",
" [ 1.73595769e-08 1.00000000e+00 4.79524920e-10]\n",
" ..., \n",
" [ 3.58941092e-04 9.91232693e-01 8.40835553e-03]\n",
" [ 3.10289147e-06 9.99976993e-01 1.99342994e-05]\n",
" [ 5.78610539e-01 4.08240706e-01 1.31487865e-02]]\n"
" [ 8.90251540e-04 9.71086264e-01 2.80234683e-02]\n",
" [ 2.58150152e-07 9.99999106e-01 6.09748270e-07]\n",
" [ 8.45011652e-01 1.54977426e-01 1.09334724e-05]]\n"
]
}
],
......@@ -565,7 +565,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 17,
"metadata": {
"collapsed": false
},
......@@ -575,9 +575,9 @@
"output_type": "stream",
"text": [
"\n",
" 0.735653027645\n",
" 0.733689593553\n",
"\n",
" 0.703610182215\n"
" 0.700402647922\n"
]
}
],
......@@ -593,14 +593,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Linear regression with varying embedding dimensions and varying dropout\n",
"## 3. Linear regression with varying embedding dimensions and varying dropout.\n",
"\n",
"Again, bear in mind that here we use `age` as target just **for illustration purposes**"
]
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 18,
"metadata": {
"collapsed": false
},
......@@ -652,7 +652,7 @@
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 19,
"metadata": {
"collapsed": false
},
......@@ -661,18 +661,18 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1 of 10, Loss: 293.391\n",
"Epoch 2 of 10, Loss: 190.727\n",
"Epoch 3 of 10, Loss: 229.331\n",
"Epoch 4 of 10, Loss: 186.07\n",
"Epoch 5 of 10, Loss: 192.077\n",
"Epoch 6 of 10, Loss: 59.602\n",
"Epoch 7 of 10, Loss: 178.112\n",
"Epoch 8 of 10, Loss: 137.38\n",
"Epoch 9 of 10, Loss: 135.515\n",
"Epoch 10 of 10, Loss: 66.123\n",
"Epoch 1 of 10, Loss: 151.295\n",
"Epoch 2 of 10, Loss: 108.425\n",
"Epoch 3 of 10, Loss: 82.35\n",
"Epoch 4 of 10, Loss: 36.353\n",
"Epoch 5 of 10, Loss: 50.06\n",
"Epoch 6 of 10, Loss: 147.494\n",
"Epoch 7 of 10, Loss: 176.602\n",
"Epoch 8 of 10, Loss: 167.916\n",
"Epoch 9 of 10, Loss: 40.365\n",
"Epoch 10 of 10, Loss: 107.579\n",
"\n",
" RMSE: 11.2608167188\n"
" RMSE: 11.2378476775\n"
]
}
],
......
......@@ -29,7 +29,8 @@ if __name__ == '__main__':
crossed_cols,
embeddings_cols,
continuous_cols,
target)
target,
scale=True)
# Network set up
wide_dim = wd_dataset['train_dataset'].wide.shape[1]
......
......@@ -2,9 +2,13 @@
import numpy as np
import pandas as pd
from collections import namedtuple
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None
def label_encode(df, cols=None):
"""
Helper function to label-encode some features of a given dataset.
......@@ -39,7 +43,7 @@ def label_encode(df, cols=None):
def prepare_data(df, wide_cols, crossed_cols, embeddings_cols, continuous_cols, target,
def_dim=8, seed=1981):
scale=False, def_dim=8, seed=1981):
"""Prepares a pandas dataframe for the WideDeep model.
......@@ -52,6 +56,7 @@ def prepare_data(df, wide_cols, crossed_cols, embeddings_cols, continuous_cols,
2 elements: (col_name, embedding dimension for this column)
continuous_cols : list with the continuous column names
target (str) : the target to be fitted
scale (bool) : boolean indicating if the continuous columns must be scaled
def_dim (int) : Default dimension of the embeddings. If no embedding dimension is
included in the "embeddings_cols" input all embedding columns will use this value (8)
seed (int) : Random State for the train/test split
......@@ -72,11 +77,11 @@ def prepare_data(df, wide_cols, crossed_cols, embeddings_cols, continuous_cols,
# If embeddings_cols does not include the embeddings dimensions it will be set as
# def_dim
if len(embeddings_cols[0]) == 1:
emb_dim = {e:def_dim for e in embeddings_cols}
else:
if type(embeddings_cols[0]) is tuple:
emb_dim = dict(embeddings_cols)
embeddings_cols = [emb[0] for emb in embeddings_cols]
else:
emb_dim = {e:def_dim for e in embeddings_cols}
deep_cols = embeddings_cols+continuous_cols
# Extract the target and copy the dataframe so we don't mutate it
......@@ -107,6 +112,13 @@ def prepare_data(df, wide_cols, crossed_cols, embeddings_cols, continuous_cols,
df_deep = df_tmp[deep_cols]
deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}
# The continuous columns will be concatenated with the embeddings, so you
# probably want to normalize them first
if scale:
scaler = StandardScaler()
for cc in continuous_cols:
df_deep[cc] = scaler.fit_transform(df_deep[cc].values.reshape(-1,1))
# select the wide_cols and one-hot encode those that are categorical
df_wide = df_tmp[wide_cols+crossed_columns]
del(df_tmp)
......
......@@ -96,7 +96,7 @@ class WideDeep(nn.Module):
self.output = nn.Linear(self.hidden_layers[-1]+self.wide_dim, self.n_class)
def compile(self, method="logistic", optimizer="Adam"):
def compile(self, method="logistic", optimizer="Adam", learning_rate=0.001, momentum=0.0):
"""Wrapper to set the activation, loss and the optimizer.
Parameters:
......@@ -112,11 +112,11 @@ class WideDeep(nn.Module):
self.activation, self.criterion = F.softmax, F.cross_entropy
if optimizer == "Adam":
self.optimizer = torch.optim.Adam(self.parameters())
self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
if optimizer == "RMSprop":
self.optimizer = torch.optim.RMSprop(self.parameters())
self.optimizer = torch.optim.RMSprop(self.parameters(), lr=learning_rate)
if optimizer == "SGD":
self.optimizer = torch.optim.SGD(self.parameters())
self.optimizer = torch.optim.SGD(self.parameters(), lr=learning_rate, momentum=momentum)
self.method = method
......@@ -134,7 +134,7 @@ class WideDeep(nn.Module):
out (torch.tensor) : result of the output neuron(s)
"""
# Deep Side
emb = [getattr(self, 'emb_layer_'+col)(X_d[:,self.deep_column_idx[col]])
emb = [getattr(self, 'emb_layer_'+col)(X_d[:,self.deep_column_idx[col]].long())
for col,_,_ in self.embeddings_input]
cont_idx = [self.deep_column_idx[col] for col in self.continuous_cols]
......@@ -150,7 +150,7 @@ class WideDeep(nn.Module):
x_deep = getattr(self, 'linear_'+str(i+1)+'_drop')(x_deep)
# Deep + Wide sides
wide_deep_input = torch.cat([x_deep, X_w], 1)
wide_deep_input = torch.cat([x_deep, X_w.float()], 1)
if not self.activation:
out = self.output(wide_deep_input)
......@@ -181,7 +181,7 @@ class WideDeep(nn.Module):
total=0
correct=0
for i, (X_wide, X_deep, target) in enumerate(train_loader):
X_w = Variable(X_wide).float()
X_w = Variable(X_wide)
X_d = Variable(X_deep)
y = (Variable(target).float() if self.method != 'multiclass' else Variable(target))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册