Commit 7cd639c1 authored by O2Dyokii

DCN update

Parent a4b21f81
import numpy as np
import pandas as pd
import keras.backend as K
from keras import layers
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Input, Embedding, Reshape, Add
from keras.layers import Flatten, merge, Lambda
from keras.models import Model
from keras.utils.vis_utils import plot_model
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import random
def feature_generate(data):
    data, label, cate_columns, cont_columns = preprocessing(data)
    embeddings_tensors = []
    continuous_tensors = []
    for ec in cate_columns:
        layer_name = ec + '_inp'
        # For categorical features, we embed the features in dense vectors of dimension 6 × (category cardinality)**(1/4)
        embed_dim = data[ec].nunique() if int(6 * np.power(data[ec].nunique(), 1/4)) > data[ec].nunique() \
            else int(6 * np.power(data[ec].nunique(), 1/4))
        t_inp, t_build = embedding_input(layer_name, data[ec].nunique(), embed_dim)
        embeddings_tensors.append((t_inp, t_build))
        del(t_inp, t_build)
    for cc in cont_columns:
        layer_name = cc + '_in'
        t_inp, t_build = continous_input(layer_name)
        continuous_tensors.append((t_inp, t_build))
        del(t_inp, t_build)
    inp_layer = [et[0] for et in embeddings_tensors]
    inp_layer += [ct[0] for ct in continuous_tensors]
    inp_embed = [et[1] for et in embeddings_tensors]
    inp_embed += [ct[1] for ct in continuous_tensors]
    return data, label, inp_layer, inp_embed


def embedding_input(name, n_in, n_out):
    inp = Input(shape = (1, ), dtype = 'int64', name = name)
    return inp, Embedding(n_in, n_out, input_length = 1)(inp)


def continous_input(name):
    inp = Input(shape = (1, ), dtype = 'float32', name = name)
    return inp, Reshape((1, 1))(inp)
# The optimal hyperparameter settings were 8 cross layers of size 54 and 6 deep layers of size 292 for DCN
# Embed "Soil_Type" column (embedding dim == 15), we have 8 cross layers of size 29
def fit(inp_layer, inp_embed, X, y):
    #inp_layer, inp_embed = feature_generate(X, cate_columns, cont_columns)
    input = merge(inp_embed, mode = 'concat')
    # deep layer
    for i in range(6):
        if i == 0:
            deep = Dense(272, activation='relu')(Flatten()(input))
        else:
            deep = Dense(272, activation='relu')(deep)
    # cross layer
    cross = CrossLayer(output_dim = input.shape[2].value, num_layer = 8, name = "cross_layer")(input)
    # concat both layers
    output = merge([deep, cross], mode = 'concat')
    output = Dense(y.shape[1], activation = 'softmax')(output)
    model = Model(inp_layer, output)
    print(model.summary())
    plot_model(model, to_file = 'model.png', show_shapes = True)
    model.compile(Adam(0.01), loss = 'categorical_crossentropy', metrics = ["accuracy"])
    model.fit([X[c] for c in X.columns], y, batch_size = 256, epochs = 10)
    return model


def evaluate(X, y, model):
    y_pred = model.predict([X[c] for c in X.columns])
    acc = np.sum(np.argmax(y_pred, 1) == np.argmax(y, 1)) / y.shape[0]
    print("Accuracy: ", acc)
# https://keras.io/layers/writing-your-own-keras-layers/
class CrossLayer(layers.Layer):
    def __init__(self, output_dim, num_layer, **kwargs):
        self.output_dim = output_dim
        self.num_layer = num_layer
        super(CrossLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.input_dim = input_shape[2]
        self.W = []
        self.bias = []
        for i in range(self.num_layer):
            self.W.append(self.add_weight(shape = [1, self.input_dim], initializer = 'glorot_uniform', name = 'w_' + str(i), trainable = True))
            self.bias.append(self.add_weight(shape = [1, self.input_dim], initializer = 'zeros', name = 'b_' + str(i), trainable = True))
        self.built = True

    def call(self, input):
        for i in range(self.num_layer):
            if i == 0:
                cross = Lambda(lambda x: Add()([K.sum(self.W[i] * K.batch_dot(K.reshape(x, (-1, self.input_dim, 1)), x), 1, keepdims = True), self.bias[i], x]))(input)
            else:
                cross = Lambda(lambda x: Add()([K.sum(self.W[i] * K.batch_dot(K.reshape(x, (-1, self.input_dim, 1)), input), 1, keepdims = True), self.bias[i], input]))(cross)
        return Flatten()(cross)

    def compute_output_shape(self, input_shape):
        return (None, self.output_dim)
# modify the embedding columns here
def preprocessing(data):
    # inverse transform the one-hot Soil_Type columns into a single categorical column
    df_onehot = data[[col for col in data.columns.tolist() if "Soil_Type" in col]]
    #for i in df_onehot.columns.tolist():
    #    if df_onehot[i].sum() == 0:
    #        del df_onehot[i]
    data["Soil"] = df_onehot.dot(np.array(range(df_onehot.columns.size))).astype(int)
    data.drop([col for col in data.columns.tolist() if "Soil_Type" in col], axis = 1, inplace = True)
    label = np.array(OneHotEncoder().fit_transform(data["Cover_Type"].values.reshape(-1, 1)).todense())
    del data["Cover_Type"]
    cate_columns = ["Soil"]
    cont_columns = [col for col in data.columns if col != "Soil"]
    # Feature normalization
    scaler = StandardScaler()
    data_cont = pd.DataFrame(scaler.fit_transform(data[cont_columns]), columns = cont_columns)
    data_cate = data[cate_columns]
    data = pd.concat([data_cate, data_cont], axis = 1)
    return data, label, cate_columns, cont_columns
if __name__ == "__main__":
    data = pd.read_csv("data/covtype.csv")
    X, y, inp_layer, inp_embed = feature_generate(data)
    # random split train and test by 9:1
    train_index = random.sample(range(X.shape[0]), int(X.shape[0] * 0.9))
    test_index = list(set(range(X.shape[0])) - set(train_index))
    model = fit(inp_layer, inp_embed, X.iloc[train_index], y[train_index, :])
    evaluate(X.iloc[test_index], y[test_index, :], model)
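For reference, `CrossLayer` above implements the DCN cross-layer recursion x_{l+1} = x_0 (x_l^T w_l) + b_l + x_l using Keras backend ops. Below is a minimal NumPy sketch of that recursion (toy shapes and illustrative names only, not part of the repository), which also makes it clear why the cross network preserves the input dimension:

```python
import numpy as np

def cross_step(x0, x_l, w, b):
    # One cross-layer step: x_{l+1} = x0 * (x_l . w) + b + x_l
    # x0, x_l have shape (batch, dim); w, b have shape (dim,)
    xlw = x_l @ w.reshape(-1, 1)      # (batch, 1): the scalar x_l^T w per sample
    return x0 * xlw + b + x_l         # broadcasts back to (batch, dim)

rng = np.random.RandomState(0)
x0 = rng.rand(4, 3)                   # batch of 4 samples, stacked feature dim 3
x = x0
for _ in range(2):                    # two cross layers
    w, b = rng.rand(3), np.zeros(3)
    x = cross_step(x0, x, w, b)
print(x.shape)                        # (4, 3): the output dimension equals the input dimension
```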
## DCN (Deep and Cross Network) demo implementation
Note: https://kaiyuanyokii2n.com/DCN.html
TRAIN_FILE = "data/train.csv"
TEST_FILE = "data/test.csv"
SUB_DIR = "output"
NUM_SPLITS = 3
RANDOM_SEED = 2018
# types of columns of the dataset dataframe
CATEGORICAL_COLS = [
'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat',
'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
'ps_car_10_cat', 'ps_car_11_cat',
]
NUMERIC_COLS = [
# # binary
# "ps_ind_06_bin", "ps_ind_07_bin", "ps_ind_08_bin",
# "ps_ind_09_bin", "ps_ind_10_bin", "ps_ind_11_bin",
# "ps_ind_12_bin", "ps_ind_13_bin", "ps_ind_16_bin",
# "ps_ind_17_bin", "ps_ind_18_bin",
# "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
# "ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin",
# numeric
"ps_reg_01", "ps_reg_02", "ps_reg_03",
"ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
# feature engineering
"missing_feat", "ps_car_13_x_ps_reg_03",
]
IGNORE_COLS = [
"id", "target",
"ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
"ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08",
"ps_calc_09", "ps_calc_10", "ps_calc_11", "ps_calc_12",
"ps_calc_13", "ps_calc_14",
"ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
"ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin"
]
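These column lists are consumed in the notebook's load_data step to drop ignored columns before training. A small sketch of that filtering, assuming this file is importable as config and using a toy dataframe with the same column-naming scheme (the frame below is illustrative only):

```python
import pandas as pd
import config

# hypothetical raw frame following the ps_* naming scheme
df = pd.DataFrame({"id": [1, 2], "target": [0, 1],
                   "ps_ind_02_cat": [1, 2], "ps_reg_01": [0.5, 0.7],
                   "ps_calc_01": [0.1, 0.2]})

# keep everything except id/target and the ignored ps_calc_* block
cols = [c for c in df.columns if c not in ["id", "target"]]
cols = [c for c in cols if c not in config.IGNORE_COLS]

X = df[cols].values          # model features
y = df["target"].values      # labels
print(cols)                  # ['ps_ind_02_cat', 'ps_reg_01']
```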
import pandas as pd
import numpy as np
class FeatureDict(object):
    def __init__(self, trainfile=None, testfile=None, numeric_cols=[], ignore_cols=[], cate_cols=[]):
        self.trainfile = trainfile
        self.testfile = testfile
        self.cate_cols = cate_cols
        self.numeric_cols = numeric_cols
        self.ignore_cols = ignore_cols
        self.gen_feat_dict()

    '''
    Generate the categorical feature dict.
    ex: df[col1] = [3,4,1,0,2]; df[col2] = [-1,2,7]
    generated feat_dict = {'col1':{3:0,4:1,1:2,0:3,2:4},'col2':{-1:5,2:6,7:7}}
    '''
    def gen_feat_dict(self):
        df = pd.concat([self.trainfile, self.testfile])
        self.feat_dict = {}
        tc = 0
        for col in df.columns:
            if col in self.ignore_cols or col in self.numeric_cols:
                continue
            else:
                us = df[col].unique()
                self.feat_dict[col] = dict(zip(us, range(tc, len(us) + tc)))
                tc += len(us)
        self.feat_dim = tc


class DataPaser(object):
    def __init__(self, feat_dict):
        self.feat_dict = feat_dict

    def parse(self, df=None, has_label=False):
        dfi = df.copy()  # feature index
        if has_label:
            y = dfi['target'].values.tolist()
            dfi.drop(['id', 'target'], axis=1, inplace=True)
        else:
            ids = dfi['id'].values.tolist()
            dfi.drop(['id'], axis=1, inplace=True)
        numeric_values = dfi[self.feat_dict.numeric_cols].values.tolist()
        dfi.drop(self.feat_dict.numeric_cols, axis=1, inplace=True)
        dfv = dfi.copy()  # dfv holds the feature values (binary or float)
        for col in dfi.columns:
            if col in self.feat_dict.ignore_cols:
                dfi.drop(col, axis=1, inplace=True)
                dfv.drop(col, axis=1, inplace=True)
                continue
            # categorical feature
            else:
                dfi[col] = dfi[col].map(self.feat_dict.feat_dict[col])
                dfv[col] = 1.
        cate_idx = dfi.values.tolist()
        cate_values = dfv.values.tolist()
        if has_label:
            return cate_idx, cate_values, numeric_values, y
        else:
            return cate_idx, cate_values, numeric_values, ids
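The docstring above describes the index mapping that `gen_feat_dict` builds. The following is a hedged toy example (frames and column names are illustrative, not from the dataset) showing the resulting `feat_dict` and how `DataPaser.parse` turns rows into index/value lists:

```python
import pandas as pd
from load_data import FeatureDict, DataPaser  # class names as defined in this file

train = pd.DataFrame({"id": [0, 1, 2], "target": [1, 0, 1],
                      "col1": [3, 4, 1], "num1": [0.2, 0.5, 0.9]})
test = pd.DataFrame({"id": [3, 4], "col1": [0, 2], "num1": [0.1, 0.3]})

fd = FeatureDict(trainfile=train, testfile=test,
                 numeric_cols=["num1"], ignore_cols=["id", "target"],
                 cate_cols=["col1"])
print(fd.feat_dict)   # e.g. {'col1': {3: 0, 4: 1, 1: 2, 0: 3, 2: 4}}
print(fd.feat_dim)    # 5 distinct categorical values in total

parser = DataPaser(feat_dict=fd)
cate_idx, cate_vals, num_vals, y = parser.parse(df=train, has_label=True)
print(cate_idx)       # [[0], [1], [2]] -- per-row category indices
print(num_vals)       # [[0.2], [0.5], [0.9]] -- raw numeric values
```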
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mllab/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n"
]
}
],
"source": [
"import tensorflow as tf\n",
"import pandas as pd\n",
"import numpy as np\n",
"import config\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from load_data import FeatureDict, DataPaser\n",
"from model import DCN"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def load_data():\n",
" dfTrain = pd.read_csv('data/train.csv')\n",
" dfTest = pd.read_csv('data/test.csv')\n",
" \n",
" def preprocess(df):\n",
" cols = [c for c in df.columns if c not in ['id','target']]\n",
" df['missing_feat'] = np.sum((df[cols] == -1).values, axis=1)\n",
" df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']\n",
" return df\n",
" \n",
" dfTrain = preprocess(dfTrain)\n",
" dfTest = preprocess(dfTest)\n",
" \n",
" cols = [c for c in dfTrain.columns if c not in ['id','target']]\n",
" cols = [c for c in cols if (not c in config.IGNORE_COLS)]\n",
" \n",
" X_train = dfTrain[cols].values\n",
" y_train = dfTrain['target'].values\n",
" X_test = dfTest[cols].values\n",
" ids_test = dfTest['id'].values\n",
" \n",
" return dfTrain, dfTest, X_train, y_train,X_test,ids_test"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def run_dcn(dfTrain, dfTest, folds, params):\n",
" fd = FeatureDict(dfTrain, dfTest, numeric_cols=config.NUMERIC_COLS, \n",
" ignore_cols=config.IGNORE_COLS, cate_cols=config.CATEGORICAL_COLS)\n",
" # print(fd.feat_dim)\n",
" # print(fd.feat_dict)\n",
" \n",
" data_parser = DataPaser(feat_dict=fd)\n",
" cate_Xi_train,cate_Xv_train,numeric_Xv_train,y_train = data_parser.parse(df=dfTrain,has_label=True)\n",
" cate_Xi_test,cate_Xv_test,numeric_Xv_test,ids_test = data_parser.parse(df=dfTest)\n",
" \n",
" params['cate_feature_size'] = fd.feat_dim\n",
" params['field_size'] = len(cate_Xi_train[0])\n",
" params['numeric_feature_size'] = len(config.NUMERIC_COLS)\n",
" \n",
" _get = lambda x, l: [x[i] for i in l]\n",
" for i,(trn_idx,val_idx) in enumerate(folds):\n",
" cate_Xi_train_, cate_Xv_train_,numeric_Xv_train_,y_train_ = _get(cate_Xi_train, trn_idx),_get(cate_Xv_train,trn_idx),_get(numeric_Xv_train,trn_idx),_get(y_train,trn_idx)\n",
" cate_Xi_valid_, cate_Xv_valid_,numeric_Xv_valid_,y_valid_ = _get(cate_Xi_train,val_idx),_get(cate_Xi_train,val_idx),_get(numeric_Xv_train,val_idx),_get(y_train,val_idx)\n",
" \n",
" dcn = DCN(**params)\n",
" dcn.fit(cate_Xi_train_,cate_Xv_train_,numeric_Xv_train_,y_train_,cate_Xi_valid_,cate_Xv_valid_,numeric_Xv_valid_,y_valid_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Main"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"dfTrain,dfTest,X_train,y_train,X_test,ids_test = load_data()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS,shuffle=True, random_state=config.RANDOM_SEED).split(X_train,y_train))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"params = {'embedding_size': 4,\n",
" 'deep_layers': [8,8],\n",
" 'dropout_deep': [0.5,0.5,0.5],\n",
" 'deep_layers_activation': tf.nn.relu,\n",
" 'epoch': 30,\n",
" 'batch_size': 128,\n",
" 'learning_rate': 0.001,\n",
" 'optimizer_type': 'adam',\n",
" 'verbose': True,\n",
" 'random_seed': config.RANDOM_SEED,\n",
" 'cross_layer_num': 3}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mllab/jupyter/Workspace/Kyuan/RS/DCN/load_data.py:20: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"\n",
"To accept the future behavior, pass 'sort=False'.\n",
"\n",
"To retain the current behavior and silence the warning, pass 'sort=True'.\n",
"\n",
" df = pd.concat([self.trainfile,self.testfile])\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parames: 3259\n",
"6666\n",
"6666\n",
"6666\n",
"6666\n",
"epoch: 0 loss: [12.024529]\n",
"epoch: 1 loss: [7.773973]\n",
"epoch: 2 loss: [2.8597765]\n",
"epoch: 3 loss: [1.4841796]\n",
"epoch: 4 loss: [1.1699396]\n",
"epoch: 5 loss: [1.6370908]\n",
"epoch: 6 loss: [1.8515997]\n",
"epoch: 7 loss: [2.2488427]\n",
"epoch: 8 loss: [2.4133043]\n",
"epoch: 9 loss: [3.5436616]\n",
"epoch: 10 loss: [3.5054557]\n",
"epoch: 11 loss: [2.6502101]\n",
"epoch: 12 loss: [2.3495483]\n",
"epoch: 13 loss: [2.6154437]\n",
"epoch: 14 loss: [1.1022573]\n",
"epoch: 15 loss: [2.9973662]\n",
"epoch: 16 loss: [3.8579004]\n",
"epoch: 17 loss: [6.178442]\n",
"epoch: 18 loss: [8.025207]\n",
"epoch: 19 loss: [7.136653]\n",
"epoch: 20 loss: [7.3870573]\n",
"epoch: 21 loss: [8.595674]\n",
"epoch: 22 loss: [7.3435473]\n",
"epoch: 23 loss: [6.7827497]\n",
"epoch: 24 loss: [4.636249]\n",
"epoch: 25 loss: [3.5871704]\n",
"epoch: 26 loss: [2.8329947]\n",
"epoch: 27 loss: [4.4573736]\n",
"epoch: 28 loss: [4.7281036]\n",
"epoch: 29 loss: [4.7764482]\n",
"Parames: 3259\n",
"6667\n",
"6667\n",
"6667\n",
"6667\n",
"epoch: 0 loss: [0.6093248]\n",
"epoch: 1 loss: [1.5039687]\n",
"epoch: 2 loss: [6.712246]\n",
"epoch: 3 loss: [7.0024014]\n",
"epoch: 4 loss: [6.4801226]\n",
"epoch: 5 loss: [3.2884202]\n",
"epoch: 6 loss: [3.6269343]\n",
"epoch: 7 loss: [1.3347118]\n",
"epoch: 8 loss: [1.0590647]\n",
"epoch: 9 loss: [1.1074238]\n",
"epoch: 10 loss: [1.8569903]\n",
"epoch: 11 loss: [1.6635537]\n",
"epoch: 12 loss: [2.031083]\n",
"epoch: 13 loss: [2.0649345]\n",
"epoch: 14 loss: [3.854222]\n",
"epoch: 15 loss: [2.8241727]\n",
"epoch: 16 loss: [4.5554295]\n",
"epoch: 17 loss: [3.8784018]\n",
"epoch: 18 loss: [3.9847918]\n",
"epoch: 19 loss: [6.2238193]\n",
"epoch: 20 loss: [5.2663083]\n",
"epoch: 21 loss: [7.23936]\n",
"epoch: 22 loss: [5.730556]\n",
"epoch: 23 loss: [6.973385]\n",
"epoch: 24 loss: [6.073906]\n",
"epoch: 25 loss: [6.378568]\n",
"epoch: 26 loss: [5.058364]\n",
"epoch: 27 loss: [3.7043087]\n",
"epoch: 28 loss: [5.9491067]\n",
"epoch: 29 loss: [5.677361]\n",
"Parames: 3259\n",
"6667\n",
"6667\n",
"6667\n",
"6667\n",
"epoch: 0 loss: [0.6093248]\n",
"epoch: 1 loss: [0.6189967]\n",
"epoch: 2 loss: [0.6625199]\n",
"epoch: 3 loss: [0.9236592]\n",
"epoch: 4 loss: [0.8559564]\n",
"epoch: 5 loss: [0.88980776]\n",
"epoch: 6 loss: [1.2686524]\n",
"epoch: 7 loss: [1.5201157]\n",
"epoch: 8 loss: [1.6151947]\n",
"epoch: 9 loss: [2.6549156]\n",
"epoch: 10 loss: [4.126936]\n",
"epoch: 11 loss: [3.530216]\n",
"epoch: 12 loss: [3.5785751]\n",
"epoch: 13 loss: [6.5671687]\n",
"epoch: 14 loss: [3.3609593]\n",
"epoch: 15 loss: [4.690835]\n",
"epoch: 16 loss: [5.375682]\n",
"epoch: 17 loss: [4.978845]\n",
"epoch: 18 loss: [8.299833]\n",
"epoch: 19 loss: [7.519844]\n",
"epoch: 20 loss: [9.057664]\n",
"epoch: 21 loss: [11.291857]\n",
"epoch: 22 loss: [11.814135]\n",
"epoch: 23 loss: [8.627268]\n",
"epoch: 24 loss: [10.576701]\n",
"epoch: 25 loss: [9.666995]\n",
"epoch: 26 loss: [6.6638875]\n",
"epoch: 27 loss: [8.0759735]\n",
"epoch: 28 loss: [6.2821198]\n",
"epoch: 29 loss: [9.70084]\n"
]
}
],
"source": [
"run_dcn(dfTrain,dfTest,folds,params)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
import tensorflow as tf
import numpy as np
from time import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score
class DCN(BaseEstimator, TransformerMixin):
    def __init__(self, cate_feature_size, field_size, numeric_feature_size, embedding_size=8,
                 deep_layers=[32,32], dropout_deep=[0.5,0.5,0.5],
                 deep_layers_activation=tf.nn.relu, epoch=10, batch_size=256,
                 learning_rate=0.01, optimizer_type='adam',
                 verbose=False, random_seed=2018, loss_type='logloss',
                 eval_metric=roc_auc_score, l2_reg=0.0, cross_layer_num=3):
        assert loss_type in ["logloss", "rmse"], \
            "'logloss' for classification or 'rmse' for regression"
        self.cate_feature_size = cate_feature_size
        self.numeric_feature_size = numeric_feature_size
        self.field_size = field_size
        self.embedding_size = embedding_size
        self.total_size = self.field_size * self.embedding_size + self.numeric_feature_size
        self.deep_layers = deep_layers
        self.cross_layer_num = cross_layer_num
        self.dropout_deep = dropout_deep
        self.deep_layers_activation = deep_layers_activation
        self.l2_reg = l2_reg
        self.epoch = epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.optimizer_type = optimizer_type
        self.verbose = verbose
        self.random_seed = random_seed
        self.loss_type = loss_type
        self.eval_metric = eval_metric
        self.train_result, self.valid_result = [], []
        self._init_graph()
    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            tf.set_random_seed(self.random_seed)
            self.feat_index = tf.placeholder(tf.int32, shape=[None, None], name='feat_index')
            self.feat_value = tf.placeholder(tf.float32, shape=[None, None], name='feat_value')
            self.numeric_value = tf.placeholder(tf.float32, shape=[None, None], name='num_value')
            self.label = tf.placeholder(tf.float32, shape=[None, 1], name='label')
            self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name='dropout_keep_deep')
            self.train_phase = tf.placeholder(tf.bool, name='train_phase')
            self.weights = self._initialize_weights()
            # model
            self.embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'], self.feat_index)  # N * F * K
            feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])
            self.embeddings = tf.multiply(self.embeddings, feat_value)
            self.x0 = tf.concat([self.numeric_value,
                                 tf.reshape(self.embeddings, shape=[-1, self.field_size * self.embedding_size])], axis=1)
            # deep network
            self.y_deep = tf.nn.dropout(self.x0, self.dropout_keep_deep[0])
            for i in range(len(self.deep_layers)):
                self.y_deep = tf.add(tf.matmul(self.y_deep, self.weights['deep_layer_%d' % i]), self.weights['deep_bias_%d' % i])
                self.y_deep = self.deep_layers_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[i + 1])
            # cross network
            self._x0 = tf.reshape(self.x0, (-1, self.total_size, 1))
            x_l = self._x0
            for l in range(self.cross_layer_num):
                x_l = tf.tensordot(tf.matmul(self._x0, x_l, transpose_b=True),
                                   self.weights['cross_layer_%d' % l], 1) + self.weights['cross_bias_%d' % l] + x_l
            self.cross_network_out = tf.reshape(x_l, (-1, self.total_size))
            # concat layer
            concat_input = tf.concat([self.cross_network_out, self.y_deep], axis=1)
            self.out = tf.add(tf.matmul(concat_input, self.weights['concat_projection']), self.weights['concat_bias'])
            # loss
            if self.loss_type == 'logloss':
                self.out = tf.nn.sigmoid(self.out)
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == 'rmse':
                self.loss = tf.sqrt(tf.losses.mean_squared_error(self.label, self.out))
            # l2_reg
            if self.l2_reg > 0:
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(self.weights['concat_projection'])
                for i in range(len(self.deep_layers)):
                    self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(self.weights['deep_layer_%d' % i])
                for i in range(self.cross_layer_num):
                    self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(self.weights['cross_layer_%d' % i])
            # optimization
            if self.optimizer_type == 'adam':
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss)
            elif self.optimizer_type == 'adagrad':
                self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, initial_accumulator_value=1e-8).minimize(self.loss)
            elif self.optimizer_type == 'gd':
                self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
            elif self.optimizer_type == 'momentum':
                self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(self.loss)
            # init
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = tf.Session()
            self.sess.run(init)
            # number of params
            total_parameters = 0
            for v in self.weights.values():
                shape = v.get_shape()
                value_params = 1
                for dim in shape:
                    value_params *= dim.value
                total_parameters += value_params
            if self.verbose > 0:
                print('Parames: %d' % total_parameters)
    def _initialize_weights(self):
        weights = dict()
        # embedding
        weights['feature_embeddings'] = tf.Variable(tf.random_normal([self.cate_feature_size, self.embedding_size], 0.0, 0.01), name='feature_embeddings')
        weights['feature_bias'] = tf.Variable(tf.random_normal([self.cate_feature_size, 1], 0.0, 1.0), name='feature_bias')
        # deep network
        num_layer = len(self.deep_layers)
        glorot = np.sqrt(2.0 / (self.total_size + self.deep_layers[0]))
        weights['deep_layer_0'] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(self.total_size, self.deep_layers[0])), dtype=np.float32)
        weights['deep_bias_0'] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[0])), dtype=np.float32)
        for i in range(1, num_layer):
            glorot = np.sqrt(2.0 / (self.total_size + self.deep_layers[i]))
            # size = layers[i-1] * layers[i]
            weights['deep_layer_%d' % i] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i-1], self.deep_layers[i])), dtype=np.float32)
            # size = 1 * layers[i]
            weights['deep_bias_%d' % i] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])), dtype=np.float32)
        # cross network
        for i in range(self.cross_layer_num):
            weights['cross_layer_%d' % i] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(self.total_size, 1)), dtype=np.float32)
            weights['cross_bias_%d' % i] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(self.total_size, 1)), dtype=np.float32)
        # concat layer
        input_size = self.total_size + self.deep_layers[-1]
        glorot = np.sqrt(2.0 / (input_size + 1))
        weights['concat_projection'] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(input_size, 1)), dtype=np.float32)
        weights['concat_bias'] = tf.Variable(tf.constant(0.01), dtype=np.float32)
        return weights
    def get_batch(self, Xi, Xv, Xv2, y, batch_size, index):
        start = index * batch_size
        end = (index + 1) * batch_size
        end = end if end < len(y) else len(y)
        return Xi[start:end], Xv[start:end], Xv2[start:end], [[y_] for y_ in y[start:end]]

    # shuffle four lists simultaneously
    def shuffle_in_unison_scary(self, a, b, c, d):
        rng_state = np.random.get_state()
        np.random.shuffle(a)
        np.random.set_state(rng_state)
        np.random.shuffle(b)
        np.random.set_state(rng_state)
        np.random.shuffle(c)
        np.random.set_state(rng_state)
        np.random.shuffle(d)

    def predict(self, Xi, Xv, Xv2, y):
        feed_dict = {self.feat_index: Xi,
                     self.feat_value: Xv,
                     self.numeric_value: Xv2,
                     self.label: y,
                     self.dropout_keep_deep: [1.0] * len(self.dropout_deep),
                     self.train_phase: True}
        loss = self.sess.run([self.loss], feed_dict=feed_dict)
        return loss

    def fit_on_batch(self, Xi, Xv, Xv2, y):
        feed_dict = {self.feat_index: Xi,
                     self.feat_value: Xv,
                     self.numeric_value: Xv2,
                     self.label: y,
                     self.dropout_keep_deep: [1.0] * len(self.dropout_deep),
                     self.train_phase: True}
        loss, opt = self.sess.run([self.loss, self.optimizer], feed_dict=feed_dict)
        return loss
    def fit(self, cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train,
            cate_Xi_valid=None, cate_Xv_valid=None, numeric_Xv_valid=None, y_valid=None,
            early_stopping=False, refit=False):
        """
        :Xi_train: feature index of each feature field of each sample in the training set
        :Xv_train: feature value of each feature field of each sample in the training set; can be either binary or float
        :y_train: label of each sample in the training set
        :Xi_valid: feature indices of each sample in the validation set
        :Xv_valid: feature values of each sample in the validation set
        :y_valid: label of each sample in the validation set
        :early_stopping: early stopping or not
        :refit: refit the model on the train+valid dataset or not
        """
        print(len(cate_Xi_train))
        print(len(cate_Xv_train))
        print(len(numeric_Xv_train))
        print(len(y_train))
        has_valid = cate_Xv_valid is not None
        for epoch in range(self.epoch):
            t1 = time()
            self.shuffle_in_unison_scary(cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train)
            total_batch = int(len(y_train) / self.batch_size)
            for i in range(total_batch):
                # fetch and train on the i-th mini-batch
                cate_Xi_batch, cate_Xv_batch, numeric_Xv_batch, y_batch = self.get_batch(cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train, self.batch_size, i)
                self.fit_on_batch(cate_Xi_batch, cate_Xv_batch, numeric_Xv_batch, y_batch)
            if has_valid:
                y_valid = np.array(y_valid).reshape((-1, 1))
                loss = self.predict(cate_Xi_valid, cate_Xv_valid, numeric_Xv_valid, y_valid)
                print('epoch: ', epoch, 'loss:', loss)
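For completeness, a hedged usage sketch of this class under TensorFlow 1.x (which the placeholder/session API above requires), assuming the file is saved as model.py as the notebook's import suggests. The hyperparameter values mirror the notebook's params dict; the data lists below are toy placeholders, not real features:

```python
import tensorflow as tf
from model import DCN

# toy data: 4 samples, 2 categorical fields (5 ids in total), 3 numeric features
cate_Xi = [[0, 3], [1, 4], [2, 3], [0, 4]]   # category indices per field
cate_Xv = [[1., 1.]] * 4                     # value 1.0 for one-hot-style categories
num_Xv = [[0.1, 0.2, 0.3]] * 4               # numeric feature values
y = [1, 0, 1, 0]

dcn = DCN(cate_feature_size=5, field_size=2, numeric_feature_size=3,
          embedding_size=4, deep_layers=[8, 8], dropout_deep=[0.5, 0.5, 0.5],
          deep_layers_activation=tf.nn.relu, epoch=2, batch_size=2,
          learning_rate=0.001, optimizer_type='adam', verbose=True,
          cross_layer_num=3)

# train on the toy batch and report the validation log loss each epoch
dcn.fit(cate_Xi, cate_Xv, num_Xv, y,
        cate_Xi_valid=cate_Xi, cate_Xv_valid=cate_Xv,
        numeric_Xv_valid=num_Xv, y_valid=y)
```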