提交 6331cc39 编写于 作者: H Hyo-kyun Park 提交者: hyokyun-park

Modifications to use the latest versions of Python 3 and PyTorch

上级 ebc1d9ca
......@@ -13,27 +13,25 @@
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
......@@ -187,7 +185,7 @@
"4 0 "
]
},
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
......@@ -220,10 +218,8 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"wide_cols = ['age','hours_per_week','education', 'relationship','workclass',\n",
......@@ -247,10 +243,8 @@
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# If embeddings_cols does not include the embeddings dimensions it will be set as\n",
......@@ -276,10 +270,8 @@
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"Y = np.array(DF[target])\n",
......@@ -306,10 +298,8 @@
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
......@@ -322,7 +312,7 @@
"Name: education_occupation, dtype: object"
]
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
......@@ -349,10 +339,8 @@
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def label_encode(df, cols=None):\n",
......@@ -379,10 +367,10 @@
" val_types[c] = df[c].unique()\n",
"\n",
" val_to_idx = dict()\n",
" for k, v in val_types.iteritems():\n",
" for k, v in val_types.items():\n",
" val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}\n",
"\n",
" for k, v in val_to_idx.iteritems():\n",
" for k, v in val_to_idx.items():\n",
" df[k] = df[k].apply(lambda x: v[x])\n",
"\n",
" return val_to_idx, df\n",
......@@ -392,7 +380,7 @@
"encoding_dict,df_tmp = label_encode(df_tmp)\n",
"encoding_dict = {k:encoding_dict[k] for k in encoding_dict if k in deep_cols}\n",
"embeddings_input = []\n",
"for k,v in encoding_dict.iteritems():\n",
"for k,v in encoding_dict.items():\n",
" embeddings_input.append((k, len(v), emb_dim[k]))"
]
},
......@@ -405,29 +393,13 @@
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:444: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.\n",
" warnings.warn(msg, DataConversionWarning)\n",
"/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
]
}
],
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# select the deep_cols and get the column index that will be used later\n",
"# to slice the tensors\n",
"df_deep = df_tmp[deep_cols]\n",
"df_deep = df_tmp[deep_cols].copy()\n",
"deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}\n",
"\n",
"# The continuous columns will be concatenated with the embeddings, so you\n",
......@@ -435,13 +407,11 @@
"from sklearn.preprocessing import StandardScaler\n",
"scaler = StandardScaler()\n",
"for cc in continuous_cols:\n",
" df_deep[cc] = scaler.fit_transform(df_deep[cc].values.reshape(-1,1))\n",
"\n",
"df_wide = df_tmp[wide_cols+crossed_columns]\n",
" df_deep[cc] = scaler.fit_transform(df_deep[cc].values.reshape(-1,1).astype(float))\n",
"df_wide = df_tmp[wide_cols+crossed_columns]#.copy()\n",
"del(df_tmp)\n",
"\n",
"dummy_cols = [c for c in wide_cols+crossed_columns if c in categorical_columns]\n",
"df_wide = pd.get_dummies(df_wide, columns=dummy_cols)"
"df_wide = pd.get_dummies(df_wide, columns=dummy_cols)#.copy()\n"
]
},
{
......@@ -455,10 +425,8 @@
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
......@@ -482,10 +450,8 @@
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
......@@ -494,7 +460,7 @@
"train_dataset(wide=array([[46, 50, 0, ..., 0, 0, 0],\n",
" [32, 45, 1, ..., 0, 0, 0],\n",
" [30, 30, 0, ..., 0, 0, 0],\n",
" ..., \n",
" ...,\n",
" [40, 40, 0, ..., 0, 0, 0],\n",
" [45, 37, 1, ..., 0, 0, 0],\n",
" [40, 45, 1, ..., 0, 0, 0]]), deep=array([[ 3. , 1. , 6. , ..., 0. ,\n",
......@@ -503,7 +469,7 @@
" -0.48456647, 0.36942139],\n",
" [ 1. , 4. , 2. , ..., 0. ,\n",
" -0.63044146, -0.84110367],\n",
" ..., \n",
" ...,\n",
" [ 1. , 0. , 2. , ..., 0. ,\n",
" 0.09893348, -0.03408696],\n",
" [ 0. , 1. , 2. , ..., 0. ,\n",
......@@ -519,16 +485,14 @@
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('workclass', 9, 10), ('education', 16, 10), ('native_country', 42, 12), ('relationship', 6, 8), ('occupation', 15, 10)]\n"
"[('education', 16, 10), ('relationship', 6, 8), ('native_country', 42, 12), ('workclass', 9, 10), ('occupation', 15, 10)]\n"
]
}
],
......@@ -538,16 +502,14 @@
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'hours_per_week': 6, 'native_country': 4, 'relationship': 1, 'age': 5, 'workclass': 2, 'education': 0, 'occupation': 3}\n"
"{'education': 0, 'relationship': 1, 'workclass': 2, 'occupation': 3, 'native_country': 4, 'age': 5, 'hours_per_week': 6}\n"
]
}
],
......@@ -557,105 +519,103 @@
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'education': {'10th': 12,\n",
"{'education': {'Bachelors': 0,\n",
" 'HS-grad': 1,\n",
" '11th': 2,\n",
" '12th': 15,\n",
" '1st-4th': 13,\n",
" '5th-6th': 11,\n",
" '7th-8th': 8,\n",
" 'Masters': 3,\n",
" '9th': 4,\n",
" 'Some-college': 5,\n",
" 'Assoc-acdm': 6,\n",
" 'Assoc-voc': 7,\n",
" 'Bachelors': 0,\n",
" '7th-8th': 8,\n",
" 'Doctorate': 9,\n",
" 'HS-grad': 1,\n",
" 'Masters': 3,\n",
" 'Preschool': 14,\n",
" 'Prof-school': 10,\n",
" 'Some-college': 5},\n",
" 'native_country': {'?': 4,\n",
" 'Cambodia': 17,\n",
" '5th-6th': 11,\n",
" '10th': 12,\n",
" '1st-4th': 13,\n",
" 'Preschool': 14,\n",
" '12th': 15},\n",
" 'relationship': {'Not-in-family': 0,\n",
" 'Husband': 1,\n",
" 'Wife': 2,\n",
" 'Own-child': 3,\n",
" 'Unmarried': 4,\n",
" 'Other-relative': 5},\n",
" 'native_country': {'United-States': 0,\n",
" 'Cuba': 1,\n",
" 'Jamaica': 2,\n",
" 'India': 3,\n",
" '?': 4,\n",
" 'Mexico': 5,\n",
" 'South': 6,\n",
" 'Puerto-Rico': 7,\n",
" 'Honduras': 8,\n",
" 'England': 9,\n",
" 'Canada': 10,\n",
" 'China': 28,\n",
" 'Germany': 11,\n",
" 'Iran': 12,\n",
" 'Philippines': 13,\n",
" 'Italy': 14,\n",
" 'Poland': 15,\n",
" 'Columbia': 16,\n",
" 'Cuba': 1,\n",
" 'Dominican-Republic': 24,\n",
" 'Cambodia': 17,\n",
" 'Thailand': 18,\n",
" 'Ecuador': 19,\n",
" 'Laos': 20,\n",
" 'Taiwan': 21,\n",
" 'Haiti': 22,\n",
" 'Portugal': 23,\n",
" 'Dominican-Republic': 24,\n",
" 'El-Salvador': 25,\n",
" 'England': 9,\n",
" 'France': 26,\n",
" 'Germany': 11,\n",
" 'Greece': 35,\n",
" 'Guatemala': 27,\n",
" 'Haiti': 22,\n",
" 'Holand-Netherlands': 41,\n",
" 'Honduras': 8,\n",
" 'Hong': 38,\n",
" 'Hungary': 40,\n",
" 'India': 3,\n",
" 'Iran': 12,\n",
" 'Ireland': 39,\n",
" 'Italy': 14,\n",
" 'Jamaica': 2,\n",
" 'China': 28,\n",
" 'Japan': 29,\n",
" 'Laos': 20,\n",
" 'Mexico': 5,\n",
" 'Nicaragua': 36,\n",
" 'Outlying-US(Guam-USVI-etc)': 32,\n",
" 'Yugoslavia': 30,\n",
" 'Peru': 31,\n",
" 'Philippines': 13,\n",
" 'Poland': 15,\n",
" 'Portugal': 23,\n",
" 'Puerto-Rico': 7,\n",
" 'Outlying-US(Guam-USVI-etc)': 32,\n",
" 'Scotland': 33,\n",
" 'South': 6,\n",
" 'Taiwan': 21,\n",
" 'Thailand': 18,\n",
" 'Trinadad&Tobago': 34,\n",
" 'United-States': 0,\n",
" 'Greece': 35,\n",
" 'Nicaragua': 36,\n",
" 'Vietnam': 37,\n",
" 'Yugoslavia': 30},\n",
" 'occupation': {'?': 11,\n",
" 'Adm-clerical': 0,\n",
" 'Armed-Forces': 13,\n",
" 'Craft-repair': 6,\n",
" 'Hong': 38,\n",
" 'Ireland': 39,\n",
" 'Hungary': 40,\n",
" 'Holand-Netherlands': 41},\n",
" 'workclass': {'State-gov': 0,\n",
" 'Self-emp-not-inc': 1,\n",
" 'Private': 2,\n",
" 'Federal-gov': 3,\n",
" 'Local-gov': 4,\n",
" '?': 5,\n",
" 'Self-emp-inc': 6,\n",
" 'Without-pay': 7,\n",
" 'Never-worked': 8},\n",
" 'occupation': {'Adm-clerical': 0,\n",
" 'Exec-managerial': 1,\n",
" 'Farming-fishing': 8,\n",
" 'Handlers-cleaners': 2,\n",
" 'Machine-op-inspct': 9,\n",
" 'Other-service': 4,\n",
" 'Priv-house-serv': 14,\n",
" 'Prof-specialty': 3,\n",
" 'Protective-serv': 12,\n",
" 'Other-service': 4,\n",
" 'Sales': 5,\n",
" 'Craft-repair': 6,\n",
" 'Transport-moving': 7,\n",
" 'Farming-fishing': 8,\n",
" 'Machine-op-inspct': 9,\n",
" 'Tech-support': 10,\n",
" 'Transport-moving': 7},\n",
" 'relationship': {'Husband': 1,\n",
" 'Not-in-family': 0,\n",
" 'Other-relative': 5,\n",
" 'Own-child': 3,\n",
" 'Unmarried': 4,\n",
" 'Wife': 2},\n",
" 'workclass': {'?': 5,\n",
" 'Federal-gov': 3,\n",
" 'Local-gov': 4,\n",
" 'Never-worked': 8,\n",
" 'Private': 2,\n",
" 'Self-emp-inc': 6,\n",
" 'Self-emp-not-inc': 1,\n",
" 'State-gov': 0,\n",
" 'Without-pay': 7}}"
" '?': 11,\n",
" 'Protective-serv': 12,\n",
" 'Armed-Forces': 13,\n",
" 'Priv-house-serv': 14}}"
]
},
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
......@@ -673,10 +633,8 @@
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from wide_deep.data_utils import prepare_data"
......@@ -692,23 +650,23 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"display_name": "Python 3",
"language": "python",
"name": "python2"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 0
"nbformat_minor": 1
}
因为 它太大了无法显示 source diff 。你可以改为 查看blob
此差异已折叠。
......@@ -34,10 +34,10 @@ def label_encode(df, cols=None):
val_types[c] = df[c].unique()
val_to_idx = dict()
for k, v in val_types.iteritems():
for k, v in val_types.items():
val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}
for k, v in val_to_idx.iteritems():
for k, v in val_to_idx.items():
df[k] = df[k].apply(lambda x: v[x])
return val_to_idx, df
......@@ -106,12 +106,12 @@ def prepare_data(df, wide_cols, crossed_cols, embeddings_cols, continuous_cols,
encoding_dict,df_tmp = label_encode(df_tmp)
encoding_dict = {k:encoding_dict[k] for k in encoding_dict if k in deep_cols}
embeddings_input = []
for k,v in encoding_dict.iteritems():
for k,v in encoding_dict.items():
embeddings_input.append((k, len(v), emb_dim[k]))
# select the deep_cols and get the column index that will be used later
# to slice the tensors
df_deep = df_tmp[deep_cols]
df_deep = df_tmp[deep_cols].copy()
deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}
# The continuous columns will be concatenated with the embeddings, so you
......@@ -119,7 +119,7 @@ def prepare_data(df, wide_cols, crossed_cols, embeddings_cols, continuous_cols,
if scale:
scaler = StandardScaler()
for cc in continuous_cols:
df_deep[cc] = scaler.fit_transform(df_deep[cc].values.reshape(-1,1))
df_deep[cc] = scaler.fit_transform(df_deep[cc].values.reshape(-1,1).astype(float))
# select the wide_cols and one-hot encode those that are categorical
df_wide = df_tmp[wide_cols+crossed_columns]
......
......@@ -8,7 +8,6 @@ import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
use_cuda = torch.cuda.is_available()
......@@ -107,7 +106,7 @@ class WideDeep(nn.Module):
if method == 'regression':
self.activation, self.criterion = None, F.mse_loss
if method == 'logistic':
self.activation, self.criterion = F.sigmoid, F.binary_cross_entropy
self.activation, self.criterion = torch.sigmoid, F.binary_cross_entropy
if method == 'multiclass':
self.activation, self.criterion = F.softmax, F.cross_entropy
......@@ -153,11 +152,13 @@ class WideDeep(nn.Module):
# Deep + Wide sides
wide_deep_input = torch.cat([x_deep, X_w.float()], 1)
if not self.activation:
out = self.output(wide_deep_input)
else:
out = self.activation(self.output(wide_deep_input))
if (self.activation==F.softmax):
out = self.activation(self.output(wide_deep_input), dim=1)
else:
out = self.activation(self.output(wide_deep_input))
return out
......@@ -191,8 +192,12 @@ class WideDeep(nn.Module):
X_w, X_d, y = X_w.cuda(), X_d.cuda(), y.cuda()
self.optimizer.zero_grad()
y_pred = net(X_w, X_d)
loss = self.criterion(y_pred, y)
y_pred = net(X_w, X_d) # [batch_size, 1]
loss = None
if(self.criterion == F.cross_entropy):
loss = self.criterion(y_pred, y) #[batch_size, 1]
else:
loss = self.criterion(y_pred, y.view(-1, 1)) #[batch_size, 1]
loss.backward()
self.optimizer.step()
......@@ -202,14 +207,14 @@ class WideDeep(nn.Module):
y_pred_cat = (y_pred > 0.5).squeeze(1).float()
if self.method == "multiclass":
_, y_pred_cat = torch.max(y_pred, 1)
correct+= float((y_pred_cat == y).sum().data[0])
correct+= float((y_pred_cat == y).sum().item())
if self.method != "regression":
print ('Epoch {} of {}, Loss: {}, accuracy: {}'.format(epoch+1,
n_epochs, round(loss.data[0],3), round(correct/total,4)))
n_epochs, round(loss.item(),3), round(correct/total,4)))
else:
print ('Epoch {} of {}, Loss: {}'.format(epoch+1, n_epochs,
round(loss.data[0],3)))
round(loss.item(),3)))
def predict(self, dataset):
......@@ -293,9 +298,9 @@ class WideDeep(nn.Module):
emb_layer = [layer for layer in emb_layers if col_name in layer[0]][0]
embeddings = emb_layer[1].cpu().data.numpy()
col_label_encoding = self.encoding_dict[col_name]
inv_dict = {v:k for k,v in col_label_encoding.iteritems()}
inv_dict = {v:k for k,v in col_label_encoding.items()}
embeddings_dict = {}
for idx,value in inv_dict.iteritems():
for idx,value in inv_dict.items():
embeddings_dict[value] = embeddings[idx]
return embeddings_dict
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册