Unverified commit 059cb010, authored by miao18, committed by GitHub

Merge pull request #1 from apachecn/master

Update
import numpy as np
import pandas as pd
import keras.backend as K
from keras import layers
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Input, Embedding, Reshape, Add
from keras.layers import Flatten, Concatenate, Lambda
from keras.models import Model
from keras.utils.vis_utils import plot_model
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import random
def feature_generate(data):
data, label, cate_columns, cont_columns = preprocessing(data)
embeddings_tensors = []
continuous_tensors = []
for ec in cate_columns:
layer_name = ec + '_inp'
        # For categorical features, embed into dense vectors of dimension 6 * (category cardinality)**(1/4), capped at the cardinality itself
        embed_dim = min(data[ec].nunique(), int(6 * np.power(data[ec].nunique(), 1/4)))
t_inp, t_build = embedding_input(layer_name, data[ec].nunique(), embed_dim)
embeddings_tensors.append((t_inp, t_build))
del(t_inp, t_build)
for cc in cont_columns:
layer_name = cc + '_in'
t_inp, t_build = continous_input(layer_name)
continuous_tensors.append((t_inp, t_build))
del(t_inp, t_build)
inp_layer = [et[0] for et in embeddings_tensors]
inp_layer += [ct[0] for ct in continuous_tensors]
inp_embed = [et[1] for et in embeddings_tensors]
inp_embed += [ct[1] for ct in continuous_tensors]
return data, label, inp_layer, inp_embed
def embedding_input(name, n_in, n_out):
inp = Input(shape = (1, ), dtype = 'int64', name = name)
return inp, Embedding(n_in, n_out, input_length = 1)(inp)
def continous_input(name):
inp = Input(shape=(1, ), dtype = 'float32', name = name)
return inp, Reshape((1, 1))(inp)
# The optimal hyperparameter settings were 8 cross layers of size 54 and 6 deep layers of size 292 for DCN
# Embed "Soil_Type" column (embedding dim == 15), we have 8 cross layers of size 29
def fit(inp_layer, inp_embed, X, y):
#inp_layer, inp_embed = feature_generate(X, cate_columns, cont_columns)
    input = Concatenate(axis=-1)(inp_embed)
# deep layer
for i in range(6):
if i == 0:
deep = Dense(272, activation='relu')(Flatten()(input))
else:
deep = Dense(272, activation='relu')(deep)
# cross layer
cross = CrossLayer(output_dim = input.shape[2].value, num_layer = 8, name = "cross_layer")(input)
#concat both layers
    output = Concatenate()([deep, cross])
output = Dense(y.shape[1], activation = 'softmax')(output)
model = Model(inp_layer, output)
print(model.summary())
plot_model(model, to_file = 'model.png', show_shapes = True)
model.compile(Adam(0.01), loss = 'categorical_crossentropy', metrics = ["accuracy"])
model.fit([X[c] for c in X.columns], y, batch_size = 256, epochs = 10)
return model
def evaluate(X, y, model):
y_pred = model.predict([X[c] for c in X.columns])
acc = np.sum(np.argmax(y_pred, 1) == np.argmax(y, 1)) / y.shape[0]
print("Accuracy: ", acc)
# https://keras.io/layers/writing-your-own-keras-layers/
class CrossLayer(layers.Layer):
def __init__(self, output_dim, num_layer, **kwargs):
self.output_dim = output_dim
self.num_layer = num_layer
super(CrossLayer, self).__init__(**kwargs)
def build(self, input_shape):
self.input_dim = input_shape[2]
self.W = []
self.bias = []
for i in range(self.num_layer):
self.W.append(self.add_weight(shape = [1, self.input_dim], initializer = 'glorot_uniform', name = 'w_' + str(i), trainable = True))
self.bias.append(self.add_weight(shape = [1, self.input_dim], initializer = 'zeros', name = 'b_' + str(i), trainable = True))
self.built = True
def call(self, input):
for i in range(self.num_layer):
if i == 0:
cross = Lambda(lambda x: Add()([K.sum(self.W[i] * K.batch_dot(K.reshape(x, (-1, self.input_dim, 1)), x), 1, keepdims = True), self.bias[i], x]))(input)
else:
cross = Lambda(lambda x: Add()([K.sum(self.W[i] * K.batch_dot(K.reshape(x, (-1, self.input_dim, 1)), input), 1, keepdims = True), self.bias[i], input]))(cross)
return Flatten()(cross)
def compute_output_shape(self, input_shape):
return (None, self.output_dim)
# modify the embedding columns here
def preprocessing(data):
# inverse transform one-hot to continuous column
df_onehot = data[[col for col in data.columns.tolist() if "Soil_Type" in col]]
#for i in df_onehot.columns.tolist():
# if df_onehot[i].sum() == 0:
# del df_onehot[i]
data["Soil"] = df_onehot.dot(np.array(range(df_onehot.columns.size))).astype(int)
data.drop([col for col in data.columns.tolist() if "Soil_Type" in col], axis = 1, inplace = True)
    label = np.array(OneHotEncoder().fit_transform(data["Cover_Type"].values.reshape(-1, 1)).todense())
del data["Cover_Type"]
cate_columns = ["Soil"]
cont_columns = [col for col in data.columns if col != "Soil"]
    # Feature normalization
scaler = StandardScaler()
data_cont = pd.DataFrame(scaler.fit_transform(data[cont_columns]), columns = cont_columns)
data_cate = data[cate_columns]
data = pd.concat([data_cate, data_cont], axis = 1)
return data, label, cate_columns, cont_columns
if __name__ == "__main__":
#data from kaggle forest cover type dataset
data = pd.read_csv("data/covtype.csv")
X, y, inp_layer, inp_embed = feature_generate(data)
#random split train and test by 9:1
train_index = random.sample(range(X.shape[0]), int(X.shape[0] * 0.9))
test_index = list(set(range(X.shape[0])) - set(train_index))
model = fit(inp_layer, inp_embed, X.iloc[train_index], y[train_index, :])
evaluate(X.iloc[test_index], y[test_index, :], model)
## DCN (Deep and Cross Network) demo implementation
Note: https://kaiyuanyokii2n.com/DCN.html
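
For reference, the cross network described in the DCN paper stacks layers of the form

$$x_{l+1} = x_0 \, (x_l^{T} w_l) + b_l + x_l$$

where $x_0$ is the concatenated embedding/continuous input and $x_l$ is the output of the l-th cross layer. The custom `CrossLayer` above is this demo's Keras take on that recurrence; its flattened output is concatenated with the deep part (a stack of ReLU `Dense` layers) before the final softmax.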
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Factorization Machines with tf\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 根据user/items的id,建立稀疏矩阵\n",
"参考:https://gist.github.com/babakx/7a3fc9739b7778f6673a458605e18963"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from itertools import count\n",
"from collections import defaultdict\n",
"from scipy.sparse import csr\n",
"import numpy as np\n",
"\n",
"def vectorize_dic(dic,ix=None,p=None,n=0,g=0):\n",
" # dic -- dictionary of feature lists. Keys are the name of features\n",
" # ix -- index generator (default None)\n",
" # p -- dimension of feature space (number of columns in the sparse matrix) (default None)\n",
" # n -- num sample\n",
" # g -- num group: eg: uese/items---> g=2\n",
" \n",
" if ix==None:\n",
" ix = dict()\n",
" \n",
" \n",
" nz = n * g # number of non-zores\n",
"\n",
" col_ix = np.empty(nz,dtype = int)\n",
"\n",
" i = 0\n",
" for k,lis in dic.items():\n",
" for t in range(len(lis)):\n",
" ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k),0) + 1\n",
" # 附加索引'l'以防止将具有相同id的不同列映射到同一个索引\n",
" col_ix[i+t*g] = ix[str(lis[t]) + str(k)]\n",
" i += 1\n",
"\n",
" row_ix = np.repeat(np.arange(0,n),g)\n",
" data = np.ones(nz)\n",
" if p == None:\n",
" p = len(ix)\n",
"\n",
" ixx = np.where(col_ix < p)\n",
" return csr.csr_matrix((data[ixx],(row_ix[ixx],col_ix[ixx])),shape=(n,p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loading data\n",
"使用MovieLens100k的数据,将数据转化成稀疏矩阵"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction import DictVectorizer\n",
"\n",
"cols = ['user','item','rating','timestamp']\n",
"train = pd.read_csv('data/ua.base',delimiter='\\t',names=cols)\n",
"test = pd.read_csv('data/ua.test', delimiter='\\t', names=cols)\n",
"\n",
"x_train = vectorize_dic({'users':train['user'].values, 'items':train['item'].values},n=len(train.index),g=2)\n",
"x_test= vectorize_dic({'users':test['user'].values,'items':test['item'].values},ix,x_train.shape[1],n=len(test.index),g=2)\n",
"\n",
"y_train = train.rating.values\n",
"y_test = test.rating.values"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Input To Dense\n",
"把输入的x_train和x_test转化成dense格式,使其能被tf使用。"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(90570, 2623) (9430, 2623)\n"
]
}
],
"source": [
"x_train = x_train.todense()\n",
"x_test = x_test.todense()\n",
"\n",
"print(x_train.shape, x_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 用tensorflow定义FM模型"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# 初始化参数\n",
"import tensorflow as tf\n",
"\n",
"n,p = x_train.shape\n",
"# number 0f latent factor\n",
"k = 10\n",
"\n",
"x = tf.placeholder('float',[None,p])\n",
"y = tf.placeholder('float',[None,1])\n",
"\n",
"# bias and weight\n",
"w0 = tf.Variable(tf.zeros([1]))\n",
"w = tf.Variable(tf.zeros([p]))\n",
"\n",
"#interaction factors\n",
"v = tf.Variable(tf.random_normal([k,p],mean=0,stddev=0.01))\n",
"\n",
"y_hat = tf.Variable(tf.zeros([n, 1]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 定义输出y的计算公式\n",
"$$ \\hat{y}(\\mathbf{x}) = w_0 + \\sum_{j=1}^{p}w_jx_j + \\frac{1}{2} \\sum_{f=1}^{k} ((\\sum_{j=1}^{p}v_{j,f}x_j)^2-\\sum_{j=1}^{p}v_{j,f}^2 x_j^2)$$"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From <ipython-input-19-174f8e1533a4>:2: calling reduce_sum (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"keep_dims is deprecated, use keepdims instead\n"
]
}
],
"source": [
"# 计算FM公式的输出\n",
"linear_terms = tf.add(w0,tf.reduce_sum(tf.multiply(w,x),1,keep_dims=True))\n",
"pair_interactions = 0.5 * tf.reduce_sum(\n",
" tf.subtract(\n",
" tf.pow(tf.matmul(x,tf.transpose(v)),2),\n",
" tf.matmul(tf.pow(x,2),tf.transpose(tf.pow(v,2)))),axis=1, keep_dims=True)\n",
"y_hat = tf.add(linear_terms, pair_interactions)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loss function\n",
"$$ L = \\sum_{i=1}^{n} (y_i - \\hat{y}_i)^2 + \\lambda_w ||W||^2 + \\lambda_v ||V||^2$$"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# L2 reg sum of squares of loss function\n",
"lambda_w = tf.constant(0.001, name='lambda_w')\n",
"lambda_v = tf.constant(0.001, name='lambda_v')\n",
"\n",
"l2_norm = tf.reduce_sum(\n",
" tf.add(\n",
" tf.multiply(lambda_w, tf.pow(w,2)),\n",
" tf.multiply(lambda_v, tf.pow(v,2))))\n",
"error = tf.reduce_mean(tf.square(tf.subtract(y,y_hat)))\n",
"loss = tf.add(error,l2_norm)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Optimization\n",
"用SGD进行优化: $\\Theta_{i+1} = \\Theta_{i} - \\eta \\frac{\\delta L}{\\delta \\Theta}$"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Mini-batcher"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def batcher(X_,y_=None,batch_size=-1):\n",
" n_samples = X_.shape[0]\n",
" if batch_size == -1:\n",
" batch_size = n_samples\n",
" if batch_size < 1:\n",
" raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))\n",
" \n",
" for i in range(0,n_samples,batch_size):\n",
" upper_bound = min(i + batch_size,n_samples)\n",
" ret_x = X_[i:upper_bound]\n",
" ret_y = None\n",
" if y_ is not None:\n",
" ret_y = y_[i:i + batch_size]\n",
" yield (ret_x,ret_y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tensorflow graph and traing"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6ccb183e948f4aa88f20186a0a31ffe8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=10), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from tqdm import tqdm_notebook as tqdm\n",
"\n",
"epochs = 10\n",
"batch_size = 1000\n",
"\n",
"# tf graph\n",
"init = tf.global_variables_initializer()\n",
"sess = tf.Session()\n",
"\n",
"sess.run(init)\n",
"\n",
"for epochs in tqdm(range(epochs),unit='epoch'):\n",
" perm = np.random.permutation(x_train.shape[0])\n",
" # iterate over batches\n",
" for bX,bY in batcher(x_train[perm],y_train[perm],batch_size):\n",
" sess.run(optimizer, feed_dict={x: bX.reshape(-1,p), y: bY.reshape(-1,1)})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 评价模型\n",
"用RMSE评价"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1.1257708]\n"
]
}
],
"source": [
"errors = []\n",
"for bX,bY in batcher(x_test,y_test):\n",
" errors.append(sess.run(error,feed_dict={x: bX.reshape(-1,p), y: bY.reshape(-1,1)}))\n",
"\n",
"RMSE = np.sqrt(np.array(errors))\n",
"print(RMSE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
## FM (Factorization Machine) demo implementation
Note: https://kaiyuanyokii2n.com/FM.html#more
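
The notebook's `pair_interactions` term uses the standard FM trick of rewriting the pairwise-interaction sum in O(kp) time. Below is a minimal NumPy check of that identity (the toy sizes and random values are assumptions for illustration only, not taken from the notebook):

```python
import numpy as np

# Check: sum_{i<j} <v_i, v_j> x_i x_j
#     == 0.5 * sum_f [ (sum_j v_{j,f} x_j)^2 - sum_j v_{j,f}^2 x_j^2 ]
rng = np.random.RandomState(0)
p, k = 6, 3                 # p features, k latent factors (toy values)
x = rng.rand(p)
v = rng.rand(p, k)          # note: the notebook stores v transposed, with shape (k, p)

brute = sum(v[i].dot(v[j]) * x[i] * x[j]
            for i in range(p) for j in range(i + 1, p))
fast = 0.5 * np.sum(v.T.dot(x) ** 2 - (v.T ** 2).dot(x ** 2))
assert np.allclose(brute, fast)
```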
## PNN (Product-based Neural Network) demo implementation
https://kaiyuanyokii2n.com/PNN.html
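
For orientation, here is a minimal NumPy sketch of what the product layer in `model.py` computes for a single sample in the inner-product variant (`use_inner=True`). The weight names mirror `_initialize_weights`; the shapes F, K, D1 and the random values are toy assumptions:

```python
import numpy as np

rng = np.random.RandomState(0)
F, K, D1 = 4, 8, 5                      # fields, embedding size, deep_init_size (toy values)
E = rng.rand(F, K)                      # field embeddings of one sample
W_lin = rng.rand(D1, F, K)              # 'product-linear'
theta = rng.rand(D1, F)                 # 'product-quadratic-inner'
bias = rng.rand(D1)                     # 'product-bias'

# linear signal: elementwise product of each weight tensor with the embeddings, summed
lz = np.array([np.sum(W_lin[i] * E) for i in range(D1)])
# quadratic (inner-product) signal: norm of the theta-weighted sum of field embeddings
lp = np.array([np.linalg.norm((theta[i][:, None] * E).sum(axis=0)) for i in range(D1)])
hidden = np.maximum(lz + lp + bias, 0)  # relu(lz + lp + b), then fed into the deep layers
print(hidden.shape)                     # (D1,)
```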
TRAIN_FILE = "data/train.csv"
TEST_FILE = "data/test.csv"
SUB_DIR = "output"
NUM_SPLITS = 3
RANDOM_SEED = 2017
# types of columns of the dataset dataframe
CATEGORICAL_COLS = [
# 'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
# 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat',
# 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
# 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
# 'ps_car_10_cat', 'ps_car_11_cat',
]
NUMERIC_COLS = [
# # binary
# "ps_ind_06_bin", "ps_ind_07_bin", "ps_ind_08_bin",
# "ps_ind_09_bin", "ps_ind_10_bin", "ps_ind_11_bin",
# "ps_ind_12_bin", "ps_ind_13_bin", "ps_ind_16_bin",
# "ps_ind_17_bin", "ps_ind_18_bin",
# "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
# "ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin",
# numeric
"ps_reg_01", "ps_reg_02", "ps_reg_03",
"ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
# feature engineering
"missing_feat", "ps_car_13_x_ps_reg_03",
]
IGNORE_COLS = [
"id", "target",
"ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
"ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08",
"ps_calc_09", "ps_calc_10", "ps_calc_11", "ps_calc_12",
"ps_calc_13", "ps_calc_14",
"ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
"ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin"
]
import pandas as pd
class FeatureDictionary(object):
def __init__(self,trainfile=None,testfile=None,
dfTrain=None,dfTest=None,numeric_cols=[],
ignore_cols=[]):
assert not ((trainfile is None) and (dfTrain is None)), "trainfile or dfTrain at least one is set"
assert not ((trainfile is not None) and (dfTrain is not None)), "only one can be set"
assert not ((testfile is None) and (dfTest is None)), "testfile or dfTest at least one is set"
assert not ((testfile is not None) and (dfTest is not None)), "only one can be set"
self.trainfile = trainfile
self.testfile = testfile
self.dfTrain = dfTrain
self.dfTest = dfTest
self.numeric_cols = numeric_cols
self.ignore_cols = ignore_cols
self.gen_feat_dict()
def gen_feat_dict(self):
if self.dfTrain is None:
dfTrain = pd.read_csv(self.trainfile)
else:
dfTrain = self.dfTrain
if self.dfTest is None:
dfTest = pd.read_csv(self.testfile)
else:
dfTest = self.dfTest
df = pd.concat([dfTrain,dfTest])
self.feat_dict = {}
tc = 0
for col in df.columns:
if col in self.ignore_cols:
continue
if col in self.numeric_cols:
self.feat_dict[col] = tc
tc += 1
else:
us = df[col].unique()
print(us)
self.feat_dict[col] = dict(zip(us,range(tc,len(us)+tc)))
tc += len(us)
self.feat_dim = tc
class DataParser(object):
def __init__(self,feat_dict):
self.feat_dict = feat_dict
def parse(self,infile=None,df=None,has_label=False):
assert not ((infile is None) and (df is None)), "infile or df at least one is set"
assert not ((infile is not None) and (df is not None)), "only one can be set"
if infile is None:
dfi = df.copy()
else:
dfi = pd.read_csv(infile)
if has_label:
y = dfi['target'].values.tolist()
dfi.drop(['id','target'],axis=1,inplace=True)
else:
ids = dfi['id'].values.tolist()
dfi.drop(['id'],axis=1,inplace=True)
# dfi for feature index
# dfv for feature value which can be either binary (1/0) or float (e.g., 10.24)
dfv = dfi.copy()
for col in dfi.columns:
if col in self.feat_dict.ignore_cols:
dfi.drop(col,axis=1,inplace=True)
dfv.drop(col,axis=1,inplace=True)
continue
if col in self.feat_dict.numeric_cols:
dfi[col] = self.feat_dict.feat_dict[col]
else:
dfi[col] = dfi[col].map(self.feat_dict.feat_dict[col])
dfv[col] = 1.
xi = dfi.values.tolist()
xv = dfv.values.tolist()
if has_label:
return xi,xv,y
else:
return xi,xv,ids
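# A minimal, self-contained usage sketch: the toy dataframes below are invented for
# illustration (they are not the project's real data). It shows how FeatureDictionary
# and DataParser turn dataframes into (Xi, Xv): numeric columns keep their value under
# a fixed index, while categorical columns get one index per distinct value and value 1.0.
if __name__ == "__main__":
    toy_train = pd.DataFrame({"id": [1, 2], "target": [0, 1],
                              "ps_reg_01": [0.5, 0.9], "ps_ind_02_cat": [1, 2]})
    toy_test = pd.DataFrame({"id": [3], "ps_reg_01": [0.1], "ps_ind_02_cat": [1]})
    fd = FeatureDictionary(dfTrain=toy_train, dfTest=toy_test,
                           numeric_cols=["ps_reg_01"], ignore_cols=["id", "target"])
    Xi, Xv, y = DataParser(feat_dict=fd).parse(df=toy_train, has_label=True)
    print(Xi)  # per-sample feature indices, e.g. [[0, 1], [0, 2]]
    print(Xv)  # per-sample feature values,  e.g. [[0.5, 1.0], [0.9, 1.0]]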
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from DataReader import FeatureDictionary, DataParser
from matplotlib import pyplot as plt
import config
from model import PNN
def load_data():
dfTrain = pd.read_csv(config.TRAIN_FILE)
dfTest = pd.read_csv(config.TEST_FILE)
def preprocess(df):
cols = [c for c in df.columns if c not in ['id','target']]
#df['missing_feat'] = np.sum(df[df[cols]==-1].values,axis=1)
df["missing_feat"] = np.sum((df[cols] == -1).values, axis=1)
df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
return df
dfTrain = preprocess(dfTrain)
dfTest = preprocess(dfTest)
cols = [c for c in dfTrain.columns if c not in ['id','target']]
cols = [c for c in cols if (not c in config.IGNORE_COLS)]
X_train = dfTrain[cols].values
y_train = dfTrain['target'].values
X_test = dfTest[cols].values
ids_test = dfTest['id'].values
cat_features_indices = [i for i,c in enumerate(cols) if c in config.CATEGORICAL_COLS]
return dfTrain,dfTest,X_train,y_train,X_test,ids_test,cat_features_indices
def run_base_model_pnn(dfTrain,dfTest,folds,pnn_params):
fd = FeatureDictionary(dfTrain=dfTrain,
dfTest=dfTest,
numeric_cols=config.NUMERIC_COLS,
ignore_cols = config.IGNORE_COLS)
data_parser = DataParser(feat_dict= fd)
    # Xi_train: the feature index of each column
    # Xv_train: the corresponding feature value of each column
Xi_train,Xv_train,y_train = data_parser.parse(df=dfTrain,has_label=True)
Xi_test,Xv_test,ids_test = data_parser.parse(df=dfTest)
print(dfTrain.dtypes)
pnn_params['feature_size'] = fd.feat_dim
pnn_params['field_size'] = len(Xi_train[0])
_get = lambda x,l:[x[i] for i in l]
for i, (train_idx, valid_idx) in enumerate(folds):
Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)
pnn = PNN(**pnn_params)
pnn.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
pnn_params = {
"embedding_size":8,
"deep_layers":[32,32],
"dropout_deep":[0.5,0.5,0.5],
"deep_layer_activation":tf.nn.relu,
"epoch":30,
"batch_size":1024,
"learning_rate":0.001,
"optimizer":"adam",
"batch_norm":1,
"batch_norm_decay":0.995,
"verbose":True,
"random_seed":config.RANDOM_SEED,
"deep_init_size":50,
"use_inner":False
}
# load data
dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices = load_data()
# folds
folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True,
random_state=config.RANDOM_SEED).split(X_train, y_train))
# run_base_model_pnn only trains and validates each fold; it returns nothing to unpack
run_base_model_pnn(dfTrain, dfTest, folds, pnn_params)
import numpy as np
import tensorflow as tf
from time import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score
class PNN(BaseEstimator, TransformerMixin):
def __init__(self, feature_size, field_size,
embedding_size=8,
deep_layers=[32, 32], deep_init_size = 50,
dropout_deep=[0.5, 0.5, 0.5],
deep_layer_activation=tf.nn.relu,
epoch=10, batch_size=256,
learning_rate=0.001, optimizer="adam",
batch_norm=0, batch_norm_decay=0.995,
verbose=False, random_seed=2016,
loss_type="logloss", eval_metric=roc_auc_score,
greater_is_better=True,
use_inner=True):
assert loss_type in ["logloss", "mse"], \
"loss_type can be either 'logloss' for classification task or 'mse' for regression task"
self.feature_size = feature_size
self.field_size = field_size
self.embedding_size = embedding_size
self.deep_layers = deep_layers
self.deep_init_size = deep_init_size
self.dropout_dep = dropout_deep
self.deep_layers_activation = deep_layer_activation
self.epoch = epoch
self.batch_size = batch_size
self.learning_rate = learning_rate
self.optimizer_type = optimizer
self.batch_norm = batch_norm
self.batch_norm_decay = batch_norm_decay
self.verbose = verbose
self.random_seed = random_seed
self.loss_type = loss_type
self.eval_metric = eval_metric
self.greater_is_better = greater_is_better
self.train_result,self.valid_result = [],[]
self.use_inner = use_inner
self._init_graph()
def _init_graph(self):
self.graph = tf.Graph()
with self.graph.as_default():
tf.set_random_seed(self.random_seed)
self.feat_index = tf.placeholder(tf.int32,
shape=[None,None],
name='feat_index')
self.feat_value = tf.placeholder(tf.float32,
shape=[None,None],
name='feat_value')
self.label = tf.placeholder(tf.float32,shape=[None,1],name='label')
self.dropout_keep_deep = tf.placeholder(tf.float32,shape=[None],name='dropout_deep_deep')
self.train_phase = tf.placeholder(tf.bool,name='train_phase')
self.weights = self._initialize_weights()
# Embeddings
self.embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'],self.feat_index) # N * F * K
feat_value = tf.reshape(self.feat_value,shape=[-1,self.field_size,1])
self.embeddings = tf.multiply(self.embeddings,feat_value) # N * F * K
            # Linear signal
linear_output = []
for i in range(self.deep_init_size):
linear_output.append(tf.reshape(
tf.reduce_sum(tf.multiply(self.embeddings,self.weights['product-linear'][i]),axis=[1,2]),shape=(-1,1)))# N * 1
self.lz = tf.concat(linear_output,axis=1) # N * init_deep_size
            # Quadratic signal
quadratic_output = []
if self.use_inner:
for i in range(self.deep_init_size):
theta = tf.multiply(self.embeddings,tf.reshape(self.weights['product-quadratic-inner'][i],(1,-1,1))) # N * F * K
quadratic_output.append(tf.reshape(tf.norm(tf.reduce_sum(theta,axis=1),axis=1),shape=(-1,1))) # N * 1
else:
embedding_sum = tf.reduce_sum(self.embeddings,axis=1)
p = tf.matmul(tf.expand_dims(embedding_sum,2),tf.expand_dims(embedding_sum,1)) # N * K * K
for i in range(self.deep_init_size):
theta = tf.multiply(p,tf.expand_dims(self.weights['product-quadratic-outer'][i],0)) # N * K * K
quadratic_output.append(tf.reshape(tf.reduce_sum(theta,axis=[1,2]),shape=(-1,1))) # N * 1
self.lp = tf.concat(quadratic_output,axis=1) # N * init_deep_size
self.y_deep = tf.nn.relu(tf.add(tf.add(self.lz, self.lp), self.weights['product-bias']))
self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
# Deep component
for i in range(0,len(self.deep_layers)):
self.y_deep = tf.add(tf.matmul(self.y_deep,self.weights["layer_%d" %i]), self.weights["bias_%d"%i])
self.y_deep = self.deep_layers_activation(self.y_deep)
self.y_deep = tf.nn.dropout(self.y_deep,self.dropout_keep_deep[i+1])
self.out = tf.add(tf.matmul(self.y_deep,self.weights['output']),self.weights['output_bias'])
# loss
if self.loss_type == "logloss":
self.out = tf.nn.sigmoid(self.out)
self.loss = tf.losses.log_loss(self.label, self.out)
elif self.loss_type == "mse":
self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
if self.optimizer_type == "adam":
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
epsilon=1e-8).minimize(self.loss)
elif self.optimizer_type == "adagrad":
self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,
initial_accumulator_value=1e-8).minimize(self.loss)
elif self.optimizer_type == "gd":
self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
elif self.optimizer_type == "momentum":
self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(
self.loss)
#init
self.saver = tf.train.Saver()
init = tf.global_variables_initializer()
self.sess = tf.Session()
self.sess.run(init)
# number of params
total_parameters = 0
for variable in self.weights.values():
shape = variable.get_shape()
variable_parameters = 1
for dim in shape:
variable_parameters *= dim.value
total_parameters += variable_parameters
if self.verbose > 0:
print("#params: %d" % total_parameters)
def _initialize_weights(self):
weights = dict()
#embeddings
weights['feature_embeddings'] = tf.Variable(
tf.random_normal([self.feature_size,self.embedding_size],0.0,0.01),
name='feature_embeddings')
weights['feature_bias'] = tf.Variable(tf.random_normal([self.feature_size,1],0.0,1.0),name='feature_bias')
#Product Layers
if self.use_inner:
weights['product-quadratic-inner'] = tf.Variable(tf.random_normal([self.deep_init_size,self.field_size],0.0,0.01))
else:
weights['product-quadratic-outer'] = tf.Variable(
tf.random_normal([self.deep_init_size, self.embedding_size,self.embedding_size], 0.0, 0.01))
weights['product-linear'] = tf.Variable(tf.random_normal([self.deep_init_size,self.field_size,self.embedding_size],0.0,0.01))
        weights['product-bias'] = tf.Variable(tf.random_normal([self.deep_init_size, ], 0.0, 1.0))
#deep layers
num_layer = len(self.deep_layers)
input_size = self.deep_init_size
glorot = np.sqrt(2.0/(input_size + self.deep_layers[0]))
weights['layer_0'] = tf.Variable(
np.random.normal(loc=0,scale=glorot,size=(input_size,self.deep_layers[0])),dtype=np.float32
)
weights['bias_0'] = tf.Variable(
np.random.normal(loc=0,scale=glorot,size=(1,self.deep_layers[0])),dtype=np.float32
)
for i in range(1,num_layer):
glorot = np.sqrt(2.0 / (self.deep_layers[i - 1] + self.deep_layers[i]))
weights["layer_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i - 1], self.deep_layers[i])),
dtype=np.float32) # layers[i-1] * layers[i]
weights["bias_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])),
dtype=np.float32) # 1 * layer[i]
        glorot = np.sqrt(2.0 / (self.deep_layers[-1] + 1))
weights['output'] = tf.Variable(np.random.normal(loc=0,scale=glorot,size=(self.deep_layers[-1],1)),dtype=np.float32)
weights['output_bias'] = tf.Variable(tf.constant(0.01),dtype=np.float32)
return weights
def get_batch(self,Xi,Xv,y,batch_size,index):
start = index * batch_size
end = (index + 1) * batch_size
end = end if end < len(y) else len(y)
return Xi[start:end],Xv[start:end],[[y_] for y_ in y[start:end]]
    # shuffle three lists simultaneously
def shuffle_in_unison_scary(self, a, b, c):
rng_state = np.random.get_state()
np.random.shuffle(a)
np.random.set_state(rng_state)
np.random.shuffle(b)
np.random.set_state(rng_state)
np.random.shuffle(c)
def predict(self, Xi, Xv,y):
"""
:param Xi: list of list of feature indices of each sample in the dataset
:param Xv: list of list of feature values of each sample in the dataset
        :return: loss on the given data (used here as the validation metric)
        """
feed_dict = {self.feat_index: Xi,
self.feat_value: Xv,
self.label: y,
self.dropout_keep_deep: [1.0] * len(self.dropout_dep),
self.train_phase: True}
loss = self.sess.run([self.loss], feed_dict=feed_dict)
return loss
def fit_on_batch(self,Xi,Xv,y):
feed_dict = {self.feat_index:Xi,
self.feat_value:Xv,
self.label:y,
self.dropout_keep_deep:self.dropout_dep,
self.train_phase:True}
loss,opt = self.sess.run([self.loss,self.optimizer],feed_dict=feed_dict)
return loss
def fit(self, Xi_train, Xv_train, y_train,
Xi_valid=None, Xv_valid=None, y_valid=None,
early_stopping=False, refit=False):
"""
:param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...]
indi_j is the feature index of feature field j of sample i in the training set
:param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...]
vali_j is the feature value of feature field j of sample i in the training set
vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features)
:param y_train: label of each sample in the training set
:param Xi_valid: list of list of feature indices of each sample in the validation set
:param Xv_valid: list of list of feature values of each sample in the validation set
:param y_valid: label of each sample in the validation set
:param early_stopping: perform early stopping or not
:param refit: refit the model on the train+valid dataset or not
:return: None
"""
has_valid = Xv_valid is not None
for epoch in range(self.epoch):
t1 = time()
self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
total_batch = int(len(y_train) / self.batch_size)
for i in range(total_batch):
Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train, self.batch_size, i)
self.fit_on_batch(Xi_batch, Xv_batch, y_batch)
if has_valid:
y_valid = np.array(y_valid).reshape((-1,1))
loss = self.predict(Xi_valid, Xv_valid, y_valid)
print("epoch",epoch,"loss",loss)
## Demo implementations of several recommendation models
Notes: https://kaiyuanyokii2n.com/