提交 c115b4de 编写于 作者: O O2Dyokii


上级 8c0a08c5
Note: https://kaiyuanyokii2n.com/DCN.html
# Paths used by the training notebook.
TRAIN_FILE = "data/train.csv"
TEST_FILE = "data/test.csv"
SUB_DIR = "output"

# NOTE(review): callers also read config.NUM_SPLITS and config.RANDOM_SEED;
# their definitions are not visible in this view -- confirm they are defined
# in this module.

# types of columns of the dataset dataframe

# Categorical columns: each distinct value gets its own embedding index.
CATEGORICAL_COLS = [
    'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
    'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat',
    'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
    'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
    'ps_car_10_cat', 'ps_car_11_cat',
]

# Columns fed to the model as raw numeric values.
NUMERIC_COLS = [
    # # binary
    # "ps_ind_06_bin", "ps_ind_07_bin", "ps_ind_08_bin",
    # "ps_ind_09_bin", "ps_ind_10_bin", "ps_ind_11_bin",
    # "ps_ind_12_bin", "ps_ind_13_bin", "ps_ind_16_bin",
    # "ps_ind_17_bin", "ps_ind_18_bin",
    # "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
    # "ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin",
    # numeric
    "ps_reg_01", "ps_reg_02", "ps_reg_03",
    "ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
    # feature engineering
    "missing_feat", "ps_car_13_x_ps_reg_03",
]

# Columns excluded from the model inputs entirely.
IGNORE_COLS = [
    "id", "target",
    "ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
    "ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08",
    "ps_calc_09", "ps_calc_10", "ps_calc_11", "ps_calc_12",
    "ps_calc_13", "ps_calc_14",
    "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
    "ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin",
]
\ No newline at end of file
因为 它太大了无法显示 source diff 。你可以改为 查看blob
import pandas as pd
import numpy as np
class FeatureDict(object):
    """Assign one global integer index to every distinct categorical value.

    Despite their names, ``trainfile`` and ``testfile`` are DataFrames: they
    are concatenated with ``pd.concat`` so that values appearing in only one
    of them still get an index.

    Attributes set by ``gen_feat_dict``:
        feat_dict: ``{column: {value: index}}`` for every column that is in
            neither ``ignore_cols`` nor ``numeric_cols``.
        feat_dim: total number of indices handed out.
    """

    def __init__(self, trainfile=None, testfile=None, numeric_cols=None,
                 ignore_cols=None, cate_cols=None):
        """Store the inputs and, when data is supplied, build the dictionary.

        Args:
            trainfile: training DataFrame, or None.
            testfile: test DataFrame, or None.
            numeric_cols: column names that are not indexed.
            ignore_cols: column names that are not indexed.
            cate_cols: categorical column names; stored but not consulted
                when building the dictionary.
        """
        self.trainfile = trainfile
        self.testfile = testfile
        # Fresh lists per instance: a shared ``[]`` default would leak
        # mutations from one instance into the next.
        self.cate_cols = [] if cate_cols is None else cate_cols
        self.numeric_cols = [] if numeric_cols is None else numeric_cols
        self.ignore_cols = [] if ignore_cols is None else ignore_cols
        # Callers read ``feat_dim`` right after construction, so build the
        # dictionary eagerly whenever there is data to build it from.
        if trainfile is not None or testfile is not None:
            self.gen_feat_dict()

    def gen_feat_dict(self):
        """Generate the categorical feature dict.

        Example: with ``df[col1] = [3, 4, 1, 0, 2]`` and
        ``df[col2] = [-1, 2, 7]`` the generated dict is
        ``{'col1': {3: 0, 4: 1, 1: 2, 0: 3, 2: 4},
        'col2': {-1: 5, 2: 6, 7: 7}}`` and ``feat_dim`` is 8.

        Raises:
            ValueError: if neither a train nor a test DataFrame was given.
        """
        frames = [f for f in (self.trainfile, self.testfile) if f is not None]
        if not frames:
            raise ValueError("FeatureDict needs a train and/or test DataFrame")
        # sort=False keeps the original column order and avoids the pandas
        # warning raised when the frames' columns differ (e.g. 'target').
        df = pd.concat(frames, sort=False)

        skipped = set(self.ignore_cols) | set(self.numeric_cols)
        self.feat_dict = {}
        tc = 0  # next free global index
        for col in df.columns:
            if col in skipped:
                continue
            us = df[col].unique()
            self.feat_dict[col] = dict(zip(us, range(tc, tc + len(us))))
            tc += len(us)
        self.feat_dim = tc
class DataPaser(object):
    """Turn a DataFrame into the index/value lists consumed by the model."""

    def __init__(self, feat_dict):
        """Keep the feature dictionary object.

        Args:
            feat_dict: object exposing ``numeric_cols``, ``ignore_cols`` and
                ``feat_dict`` (``{column: {value: index}}``).
        """
        self.feat_dict = feat_dict

    def parse(self, df=None, has_label=False):
        """Split ``df`` into categorical indices, categorical values and numerics.

        Args:
            df: DataFrame to parse; it is not modified.
            has_label: when True the frame has a ``target`` column and the
                labels are returned; otherwise the ``id`` column is returned.

        Returns:
            ``(cate_idx, cate_values, numeric_values, y_or_ids)`` where
            ``cate_idx`` holds one global feature index per categorical
            field, ``cate_values`` is 1.0 for every categorical field,
            ``numeric_values`` holds the raw numeric columns, and the last
            item is the label list (``has_label``) or the id list.

        Raises:
            ValueError: if ``df`` is None.
        """
        if df is None:
            raise ValueError("parse() requires a DataFrame via `df`")

        fd = self.feat_dict
        numeric_cols = list(fd.numeric_cols)

        # Only one of the two exists on a given path, so never touch 'id'
        # for labeled frames or 'target' for unlabeled ones.
        if has_label:
            y_or_ids = df['target'].values.tolist()
        else:
            y_or_ids = df['id'].values.tolist()

        numeric_values = df[numeric_cols].values.tolist()

        # Select the categorical columns up front instead of dropping
        # columns from the frame while iterating over its column index.
        # 'id' and 'target' are excluded on both paths so train and test
        # frames always produce the same fields.
        excluded = {'id', 'target'} | set(numeric_cols) | set(fd.ignore_cols)
        cate_cols = [col for col in df.columns if col not in excluded]

        dfi = df[cate_cols].copy()  # feature index
        for col in cate_cols:
            # Values missing from the dictionary map to NaN.
            dfi[col] = dfi[col].map(fd.feat_dict[col])

        cate_idx = dfi.values.tolist()
        # A categorical field is either present or not: its value is 1.
        cate_values = [[1.] * len(cate_cols) for _ in range(len(dfi))]

        return cate_idx, cate_values, numeric_values, y_or_ids
\ No newline at end of file
"cells": [
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mllab/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n"
"source": [
"import tensorflow as tf\n",
"import pandas as pd\n",
"import numpy as np\n",
"import config\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from load_data import FeatureDict, DataPaser\n",
"from model import DCN"
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data"
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def load_data():\n",
" dfTrain = pd.read_csv('data/train.csv')\n",
" dfTest = pd.read_csv('data/test.csv')\n",
" \n",
" def preprocess(df):\n",
" cols = [c for c in df.columns if c not in ['id','target']]\n",
" df['missing_feat'] = np.sum((df[cols] == -1).values, axis=1)\n",
" df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']\n",
" return df\n",
" \n",
" dfTrain = preprocess(dfTrain)\n",
" dfTest = preprocess(dfTest)\n",
" \n",
" cols = [c for c in dfTrain.columns if c not in ['id','target']]\n",
" cols = [c for c in cols if (not c in config.IGNORE_COLS)]\n",
" \n",
" X_train = dfTrain[cols].values\n",
" y_train = dfTrain['target'].values\n",
" X_test = dfTest[cols].values\n",
" ids_test = dfTest['id'].values\n",
" \n",
" return dfTrain, dfTest, X_train, y_train,X_test,ids_test"
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run model"
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def run_dcn(dfTrain, dfTest, folds, params):\n",
" fd = FeatureDict(dfTrain, dfTest, numeric_cols=config.NUMERIC_COLS, \n",
" ignore_cols=config.IGNORE_COLS, cate_cols=config.CATEGORICAL_COLS)\n",
" # print(fd.feat_dim)\n",
" # print(fd.feat_dict)\n",
" \n",
" data_parser = DataPaser(feat_dict=fd)\n",
" cate_Xi_train,cate_Xv_train,numeric_Xv_train,y_train = data_parser.parse(df=dfTrain,has_label=True)\n",
" cate_Xi_test,cate_Xv_test,numeric_Xv_test,ids_test = data_parser.parse(df=dfTest)\n",
" \n",
" params['cate_feature_size'] = fd.feat_dim\n",
" params['field_size'] = len(cate_Xi_train[0])\n",
" params['numeric_feature_size'] = len(config.NUMERIC_COLS)\n",
" \n",
" _get = lambda x, l: [x[i] for i in l]\n",
" for i,(trn_idx,val_idx) in enumerate(folds):\n",
" cate_Xi_train_, cate_Xv_train_,numeric_Xv_train_,y_train_ = _get(cate_Xi_train, trn_idx),_get(cate_Xv_train,trn_idx),_get(numeric_Xv_train,trn_idx),_get(y_train,trn_idx)\n",
"        cate_Xi_valid_, cate_Xv_valid_,numeric_Xv_valid_,y_valid_ = _get(cate_Xi_train,val_idx),_get(cate_Xv_train,val_idx),_get(numeric_Xv_train,val_idx),_get(y_train,val_idx)\n",
" \n",
" dcn = DCN(**params)\n",
" dcn.fit(cate_Xi_train_,cate_Xv_train_,numeric_Xv_train_,y_train_,cate_Xi_valid_,cate_Xv_valid_,numeric_Xv_valid_,y_valid_)"
"cell_type": "markdown",
"metadata": {},
"source": [
"## Main"
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"dfTrain,dfTest,X_train,y_train,X_test,ids_test = load_data()"
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS,shuffle=True, random_state=config.RANDOM_SEED).split(X_train,y_train))"
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"params = {'embedding_size': 4,\n",
" 'deep_layers': [8,8],\n",
" 'dropout_deep': [0.5,0.5,0.5],\n",
" 'deep_layers_activation': tf.nn.relu,\n",
" 'epoch': 30,\n",
" 'batch_size': 128,\n",
" 'learning_rate': 0.001,\n",
" 'optimizer_type': 'adam',\n",
" 'verbose': True,\n",
" 'random_seed': config.RANDOM_SEED,\n",
" 'cross_layer_num': 3}"
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": false
"outputs": [
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mllab/jupyter/Workspace/Kyuan/RS/DCN/load_data.py:20: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"To accept the future behavior, pass 'sort=False'.\n",
"To retain the current behavior and silence the warning, pass 'sort=True'.\n",
" df = pd.concat([self.trainfile,self.testfile])\n"
"name": "stdout",
"output_type": "stream",
"text": [
"Parames: 3259\n",
"epoch: 0 loss: [12.024529]\n",
"epoch: 1 loss: [7.773973]\n",
"epoch: 2 loss: [2.8597765]\n",
"epoch: 3 loss: [1.4841796]\n",
"epoch: 4 loss: [1.1699396]\n",
"epoch: 5 loss: [1.6370908]\n",
"epoch: 6 loss: [1.8515997]\n",
"epoch: 7 loss: [2.2488427]\n",
"epoch: 8 loss: [2.4133043]\n",
"epoch: 9 loss: [3.5436616]\n",
"epoch: 10 loss: [3.5054557]\n",
"epoch: 11 loss: [2.6502101]\n",
"epoch: 12 loss: [2.3495483]\n",
"epoch: 13 loss: [2.6154437]\n",
"epoch: 14 loss: [1.1022573]\n",
"epoch: 15 loss: [2.9973662]\n",
"epoch: 16 loss: [3.8579004]\n",
"epoch: 17 loss: [6.178442]\n",
"epoch: 18 loss: [8.025207]\n",
"epoch: 19 loss: [7.136653]\n",
"epoch: 20 loss: [7.3870573]\n",
"epoch: 21 loss: [8.595674]\n",
"epoch: 22 loss: [7.3435473]\n",
"epoch: 23 loss: [6.7827497]\n",
"epoch: 24 loss: [4.636249]\n",
"epoch: 25 loss: [3.5871704]\n",
"epoch: 26 loss: [2.8329947]\n",
"epoch: 27 loss: [4.4573736]\n",
"epoch: 28 loss: [4.7281036]\n",
"epoch: 29 loss: [4.7764482]\n",
"Parames: 3259\n",
"epoch: 0 loss: [0.6093248]\n",
"epoch: 1 loss: [1.5039687]\n",
"epoch: 2 loss: [6.712246]\n",
"epoch: 3 loss: [7.0024014]\n",
"epoch: 4 loss: [6.4801226]\n",
"epoch: 5 loss: [3.2884202]\n",
"epoch: 6 loss: [3.6269343]\n",
"epoch: 7 loss: [1.3347118]\n",
"epoch: 8 loss: [1.0590647]\n",
"epoch: 9 loss: [1.1074238]\n",
"epoch: 10 loss: [1.8569903]\n",
"epoch: 11 loss: [1.6635537]\n",
"epoch: 12 loss: [2.031083]\n",
"epoch: 13 loss: [2.0649345]\n",
"epoch: 14 loss: [3.854222]\n",
"epoch: 15 loss: [2.8241727]\n",
"epoch: 16 loss: [4.5554295]\n",
"epoch: 17 loss: [3.8784018]\n",
"epoch: 18 loss: [3.9847918]\n",
"epoch: 19 loss: [6.2238193]\n",
"epoch: 20 loss: [5.2663083]\n",
"epoch: 21 loss: [7.23936]\n",
"epoch: 22 loss: [5.730556]\n",
"epoch: 23 loss: [6.973385]\n",
"epoch: 24 loss: [6.073906]\n",
"epoch: 25 loss: [6.378568]\n",
"epoch: 26 loss: [5.058364]\n",
"epoch: 27 loss: [3.7043087]\n",
"epoch: 28 loss: [5.9491067]\n",
"epoch: 29 loss: [5.677361]\n",
"Parames: 3259\n",
"epoch: 0 loss: [0.6093248]\n",
"epoch: 1 loss: [0.6189967]\n",
"epoch: 2 loss: [0.6625199]\n",
"epoch: 3 loss: [0.9236592]\n",
"epoch: 4 loss: [0.8559564]\n",
"epoch: 5 loss: [0.88980776]\n",
"epoch: 6 loss: [1.2686524]\n",
"epoch: 7 loss: [1.5201157]\n",
"epoch: 8 loss: [1.6151947]\n",
"epoch: 9 loss: [2.6549156]\n",
"epoch: 10 loss: [4.126936]\n",
"epoch: 11 loss: [3.530216]\n",
"epoch: 12 loss: [3.5785751]\n",
"epoch: 13 loss: [6.5671687]\n",
"epoch: 14 loss: [3.3609593]\n",
"epoch: 15 loss: [4.690835]\n",
"epoch: 16 loss: [5.375682]\n",
"epoch: 17 loss: [4.978845]\n",
"epoch: 18 loss: [8.299833]\n",
"epoch: 19 loss: [7.519844]\n",
"epoch: 20 loss: [9.057664]\n",
"epoch: 21 loss: [11.291857]\n",
"epoch: 22 loss: [11.814135]\n",
"epoch: 23 loss: [8.627268]\n",
"epoch: 24 loss: [10.576701]\n",
"epoch: 25 loss: [9.666995]\n",
"epoch: 26 loss: [6.6638875]\n",
"epoch: 27 loss: [8.0759735]\n",
"epoch: 28 loss: [6.2821198]\n",
"epoch: 29 loss: [9.70084]\n"
"source": [
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"nbformat": 4,
"nbformat_minor": 2
import tensorflow as tf
import numpy as np
from time import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score
# Deep & Cross Network estimator: embedded categorical fields plus raw numeric
# values feed a deep MLP and a cross network, whose outputs are concatenated
# and projected to a single output (see _init_graph).
class DCN(BaseEstimator, TransformerMixin):
def __init__(self, cate_feature_size,field_size,numeric_feature_size,embedding_size=8,
# NOTE(review): the rest of this signature is not visible in this view. The
# body below reads deep_layers, cross_layer_num, dropout_deep,
# deep_layers_activation, l2_reg, epoch, batch_size, learning_rate,
# optimizer_type, verbose, random_seed, loss_type and eval_metric, so those
# are expected as parameters -- confirm names and defaults against the file.
# NOTE(review): this assert is stripped under `python -O`.
assert loss_type in ["logloss", "rmse"], \
"'logloss' for classification or 'rmse' for regression"
# Number of rows of the embedding table (one per categorical value index).
self.cate_feature_size = cate_feature_size
# Number of raw numeric inputs per sample.
self.numeric_feature_size = numeric_feature_size
# Number of categorical fields per sample.
self.field_size = field_size
# Width of each categorical embedding vector.
self.embedding_size = embedding_size
# Width of the stacked input x0: flattened embeddings plus numeric values.
self.total_size = self.field_size * self.embedding_size + self.numeric_feature_size
# Hidden-layer widths of the deep network.
self.deep_layers = deep_layers
# Number of cross layers.
self.cross_layer_num = cross_layer_num
# Per-layer dropout setting, sized to match the `dropout_keep_deep`
# placeholder, which _init_graph indexes at 0..len(deep_layers).
self.dropout_deep = dropout_deep
self.deep_layers_activation = deep_layers_activation
# L2 regularisation strength; only applied when > 0 (see _init_graph).
self.l2_reg = l2_reg
self.epoch = epoch
self.batch_size = batch_size
self.learning_rate = learning_rate
# One of 'adam', 'adagrad', 'gd', 'momentum' (see _init_graph).
self.optimizer_type = optimizer_type
self.verbose = verbose
self.random_seed = random_seed
self.loss_type = loss_type
self.eval_metric = eval_metric
self.train_result,self.valid_result = [],[]
# NOTE(review): no call to self._init_graph() is visible here, yet
# fit/predict rely on self.sess -- confirm the graph is built at construction.
def _init_graph(self):
    """Build the TensorFlow graph, loss, optimizer and session.

    Creates the placeholders, the deep and cross networks over the shared
    input ``x0``, the final projection, the loss (plus optional L2 terms),
    the training op, and a session with all variables initialised.

    Raises:
        ValueError: if ``loss_type`` or ``optimizer_type`` is not supported.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.feat_index = tf.placeholder(tf.int32, shape=[None, None], name='feat_index')
        self.feat_value = tf.placeholder(tf.float32, shape=[None, None], name='feat_value')
        self.numeric_value = tf.placeholder(tf.float32, shape=[None, None], name='num_value')
        self.label = tf.placeholder(tf.float32, shape=[None, 1], name='label')
        self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name='dropout_keep_deep')
        self.train_phase = tf.placeholder(tf.bool, name='train_phase')

        self.weights = self._initialize_weights()

        # model
        self.embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'], self.feat_index)  # N * F * K
        feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])
        self.embeddings = tf.multiply(self.embeddings, feat_value)

        # x0: numeric values followed by the flattened embeddings.
        self.x0 = tf.concat(
            [self.numeric_value,
             tf.reshape(self.embeddings, shape=[-1, self.field_size * self.embedding_size])],
            axis=1)

        # deep network
        self.y_deep = tf.nn.dropout(self.x0, self.dropout_keep_deep[0])
        for i in range(len(self.deep_layers)):
            self.y_deep = tf.add(tf.matmul(self.y_deep, self.weights['deep_layer_%d' % i]),
                                 self.weights['deep_bias_%d' % i])
            self.y_deep = self.deep_layers_activation(self.y_deep)
            self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[i + 1])

        # cross network: x_{l+1} = x0 * x_l^T * w_l + b_l + x_l
        self._x0 = tf.reshape(self.x0, (-1, self.total_size, 1))
        x_l = self._x0
        for l in range(self.cross_layer_num):
            x_l = tf.tensordot(tf.matmul(self._x0, x_l, transpose_b=True),
                               self.weights['cross_layer_%d' % l], 1) \
                + self.weights['cross_bias_%d' % l] + x_l
        self.cross_network_out = tf.reshape(x_l, (-1, self.total_size))

        # concat layer
        concat_input = tf.concat([self.cross_network_out, self.y_deep], axis=1)
        self.out = tf.add(tf.matmul(concat_input, self.weights['concat_projection']),
                          self.weights['concat_bias'])

        # loss
        if self.loss_type == 'logloss':
            self.out = tf.nn.sigmoid(self.out)
            self.loss = tf.losses.log_loss(self.label, self.out)
        elif self.loss_type == 'rmse':
            self.loss = tf.sqrt(tf.losses.mean_squared_error(self.label, self.out))
        else:
            raise ValueError("loss_type must be 'logloss' or 'rmse', got %r" % (self.loss_type,))

        # l2_reg
        if self.l2_reg > 0:
            regularizer = tf.contrib.layers.l2_regularizer(self.l2_reg)
            self.loss += regularizer(self.weights['concat_projection'])
            for i in range(len(self.deep_layers)):
                self.loss += regularizer(self.weights['deep_layer_%d' % i])
            # cross_layer_num is an int, so iterate over range(), not len().
            for i in range(self.cross_layer_num):
                self.loss += regularizer(self.weights['cross_layer_%d' % i])

        # optimization
        if self.optimizer_type == 'adam':
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                epsilon=1e-8).minimize(self.loss)
        elif self.optimizer_type == 'adagrad':
            self.optimizer = tf.train.AdagradOptimizer(
                learning_rate=self.learning_rate,
                initial_accumulator_value=1e-8).minimize(self.loss)
        elif self.optimizer_type == 'gd':
            self.optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=self.learning_rate).minimize(self.loss)
        elif self.optimizer_type == 'momentum':
            self.optimizer = tf.train.MomentumOptimizer(
                learning_rate=self.learning_rate, momentum=0.95).minimize(self.loss)
        else:
            raise ValueError("unsupported optimizer_type: %r" % (self.optimizer_type,))

        # init
        self.saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        self.sess = tf.Session(graph=self.graph)
        # Variables must be initialised before the first training step.
        self.sess.run(init)

        # number of params
        total_parameters = 0
        for v in self.weights.values():
            value_params = 1
            for dim in v.get_shape().as_list():
                value_params *= dim
            total_parameters += value_params
        if self.verbose > 0:
            print('Parames: %d' % total_parameters)
def _initialize_weights(self):
    """Create every trainable variable of the model.

    Returns:
        dict mapping weight names to ``tf.Variable``s: the embedding table
        and feature bias, ``deep_layer_i``/``deep_bias_i`` for each hidden
        layer, ``cross_layer_i``/``cross_bias_i`` for each cross layer, and
        ``concat_projection``/``concat_bias`` for the output layer.
    """
    weights = dict()

    weights['feature_embeddings'] = tf.Variable(
        tf.random_normal([self.cate_feature_size, self.embedding_size], 0.0, 0.01),
        name='feature_embeddings')
    weights['feature_bias'] = tf.Variable(
        tf.random_normal([self.cate_feature_size, 1], 0.0, 1.0),
        name='feature_bias')

    # deep network: the Glorot scale uses each layer's own fan-in/fan-out.
    fan_in = self.total_size
    for i, fan_out in enumerate(self.deep_layers):
        glorot = np.sqrt(2.0 / (fan_in + fan_out))
        weights['deep_layer_%d' % i] = tf.Variable(
            np.random.normal(loc=0, scale=glorot, size=(fan_in, fan_out)),
            dtype=np.float32)
        weights['deep_bias_%d' % i] = tf.Variable(
            np.random.normal(loc=0, scale=glorot, size=(1, fan_out)),
            dtype=np.float32)
        fan_in = fan_out
    # With no hidden layers the deep branch passes the input through, so
    # its output width is still total_size.
    deep_out_size = fan_in

    # cross network: each layer maps total_size -> 1, with its own scale
    # rather than whatever the last deep layer left behind.
    cross_glorot = np.sqrt(2.0 / (self.total_size + 1))
    for i in range(self.cross_layer_num):
        weights['cross_layer_%d' % i] = tf.Variable(
            np.random.normal(loc=0, scale=cross_glorot, size=(self.total_size, 1)),
            dtype=np.float32)
        weights['cross_bias_%d' % i] = tf.Variable(
            np.random.normal(loc=0, scale=cross_glorot, size=(self.total_size, 1)),
            dtype=np.float32)

    # Concat layers
    input_size = self.total_size + deep_out_size
    glorot = np.sqrt(2.0 / (input_size + 1))
    weights['concat_projection'] = tf.Variable(
        np.random.normal(loc=0, scale=glorot, size=(input_size, 1)),
        dtype=np.float32)
    weights['concat_bias'] = tf.Variable(tf.constant(0.01), dtype=np.float32)

    return weights
def get_batch(self,Xi,Xv,Xv2,y,batch_size, index):
    """Return the ``index``-th mini-batch of the four parallel sequences.

    The batch covers positions ``index * batch_size`` up to (but excluding)
    ``(index + 1) * batch_size``, clipped to ``len(y)``. Each label is
    wrapped in its own one-element list.
    """
    first = index * batch_size
    last = min(first + batch_size, len(y))
    window = slice(first, last)
    labels = [[label] for label in y[window]]
    return Xi[window], Xv[window], Xv2[window], labels
# shuffle the four lists a, b, c and d simultaneously (in unison)
def shuffle_in_unison_scary(self,a,b,c,d):
    """Shuffle four equal-length sequences in place with the same permutation.

    The NumPy global RNG state is captured once and restored before each
    later shuffle, so every sequence is reordered identically. The
    sequences must have the same length for the permutations to match.
    Returns None.
    """
    rng_state = np.random.get_state()
    np.random.shuffle(a)
    for seq in (b, c, d):
        # Rewind the generator so this shuffle replays the same permutation.
        np.random.set_state(rng_state)
        np.random.shuffle(seq)
def predict(self,Xi,Xv,Xv2,y):
    """Evaluate the loss on a labelled set without updating the weights.

    Args:
        Xi: categorical feature indices, one row per sample.
        Xv: categorical feature values, one row per sample.
        Xv2: numeric feature values, one row per sample.
        y: labels fed to the ``label`` placeholder.

    Returns:
        A one-element list holding the loss, as returned by ``sess.run``
        for a list of fetches.
    """
    feed_dict = {self.feat_index: Xi,
                 self.feat_value: Xv,
                 self.numeric_value: Xv2,
                 self.label: y,
                 # Evaluation keeps every unit: keep-probability 1.0.
                 self.dropout_keep_deep: [1.0] * len(self.dropout_deep),
                 # This is evaluation, not training.
                 self.train_phase: False}
    loss = self.sess.run([self.loss], feed_dict=feed_dict)
    return loss
def fit_on_batch(self,Xi,Xv,Xv2,y):
    """Run one optimisation step on a mini-batch and return its loss.

    Args:
        Xi: categorical feature indices, one row per sample.
        Xv: categorical feature values, one row per sample.
        Xv2: numeric feature values, one row per sample.
        y: labels fed to the ``label`` placeholder.

    Returns:
        The loss value computed for this batch.
    """
    feed_dict = {self.feat_index: Xi,
                 self.feat_value: Xv,
                 self.numeric_value: Xv2,
                 self.label: y,
                 # Training must feed the configured dropout settings;
                 # feeding all-ones here would disable dropout entirely.
                 self.dropout_keep_deep: self.dropout_deep,
                 self.train_phase: True}
    loss, _ = self.sess.run([self.loss, self.optimizer], feed_dict=feed_dict)
    return loss
def fit(self,cate_Xi_train,cate_Xv_train,numeric_Xv_train,y_train,
        cate_Xi_valid=None,cate_Xv_valid=None,numeric_Xv_valid=None,y_valid=None,
        early_stopping=False,refit=False):
    """Train for ``self.epoch`` epochs, printing the validation loss per epoch.

    NOTE(review): the tail of this signature was not visible in the source
    view; it is reconstructed from the names the body and the original
    parameter notes use -- confirm order and defaults against the file.

    :cate_Xi_train: feature index of each categorical field of each sample
        in the training set
    :cate_Xv_train: feature value of each categorical field of each sample
        in the training set; can be either binary or float
    :numeric_Xv_train: numeric feature values of each sample in the
        training set
    :y_train: label of each sample in the training set
    :cate_Xi_valid: feature indices of each sample in the validation set
    :cate_Xv_valid: feature values of each sample in the validation set
    :numeric_Xv_valid: numeric feature values of each sample in the
        validation set
    :y_valid: label of each sample in the validation set
    :early_stopping: early stopping or not (currently not used)
    :refit: refit the model on the train+valid dataset or not (currently
        not used)
    """
    has_valid = cate_Xv_valid is not None
    if has_valid:
        # Reshape once: the label placeholder expects one column.
        y_valid = np.array(y_valid).reshape((-1,1))

    # Ceiling division so the final, shorter batch is trained on as well.
    total_batch = (len(y_train) + self.batch_size - 1) // self.batch_size

    for epoch in range(self.epoch):
        for i in range(total_batch):
            # Walk through the batches by index i and train on each one.
            cate_Xi_batch, cate_Xv_batch, numeric_Xv_batch, y_batch = self.get_batch(
                cate_Xi_train,cate_Xv_train,numeric_Xv_train,y_train,self.batch_size,i)
            self.fit_on_batch(cate_Xi_batch,cate_Xv_batch,numeric_Xv_batch,y_batch)

        if has_valid:
            loss = self.predict(cate_Xi_valid,cate_Xv_valid,numeric_Xv_valid,y_valid)
            print('epoch: ',epoch, 'loss:',loss)
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
想要评论请 注册