Unverified commit 059cb010, authored by miao18, committed by GitHub

Merge pull request #1 from apachecn/master

Update
import numpy as np
import pandas as pd
import keras.backend as K
from keras import layers
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Input, Embedding, Reshape, Add
from keras.layers import Flatten, Concatenate, Lambda
from keras.models import Model
from keras.utils.vis_utils import plot_model
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import random
def feature_generate(data):
data, label, cate_columns, cont_columns = preprocessing(data)
embeddings_tensors = []
continuous_tensors = []
for ec in cate_columns:
layer_name = ec + '_inp'
        # For categorical features, embed into dense vectors of dimension 6 * (category cardinality)**(1/4), capped at the cardinality itself
        embed_dim = min(data[ec].nunique(), int(6 * np.power(data[ec].nunique(), 1/4)))
t_inp, t_build = embedding_input(layer_name, data[ec].nunique(), embed_dim)
embeddings_tensors.append((t_inp, t_build))
del(t_inp, t_build)
for cc in cont_columns:
layer_name = cc + '_in'
t_inp, t_build = continous_input(layer_name)
continuous_tensors.append((t_inp, t_build))
del(t_inp, t_build)
inp_layer = [et[0] for et in embeddings_tensors]
inp_layer += [ct[0] for ct in continuous_tensors]
inp_embed = [et[1] for et in embeddings_tensors]
inp_embed += [ct[1] for ct in continuous_tensors]
return data, label, inp_layer, inp_embed
def embedding_input(name, n_in, n_out):
inp = Input(shape = (1, ), dtype = 'int64', name = name)
return inp, Embedding(n_in, n_out, input_length = 1)(inp)
def continous_input(name):
inp = Input(shape=(1, ), dtype = 'float32', name = name)
return inp, Reshape((1, 1))(inp)
# The optimal hyperparameter settings were 8 cross layers of size 54 and 6 deep layers of size 292 for DCN
# Embed "Soil_Type" column (embedding dim == 15), we have 8 cross layers of size 29
def fit(inp_layer, inp_embed, X, y):
#inp_layer, inp_embed = feature_generate(X, cate_columns, cont_columns)
    input = Concatenate(axis=-1)(inp_embed)
# deep layer
for i in range(6):
if i == 0:
deep = Dense(272, activation='relu')(Flatten()(input))
else:
deep = Dense(272, activation='relu')(deep)
# cross layer
cross = CrossLayer(output_dim = input.shape[2].value, num_layer = 8, name = "cross_layer")(input)
#concat both layers
    output = Concatenate()([deep, cross])
output = Dense(y.shape[1], activation = 'softmax')(output)
model = Model(inp_layer, output)
print(model.summary())
plot_model(model, to_file = 'model.png', show_shapes = True)
model.compile(Adam(0.01), loss = 'categorical_crossentropy', metrics = ["accuracy"])
model.fit([X[c] for c in X.columns], y, batch_size = 256, epochs = 10)
return model
def evaluate(X, y, model):
y_pred = model.predict([X[c] for c in X.columns])
acc = np.sum(np.argmax(y_pred, 1) == np.argmax(y, 1)) / y.shape[0]
print("Accuracy: ", acc)
# https://keras.io/layers/writing-your-own-keras-layers/
class CrossLayer(layers.Layer):
def __init__(self, output_dim, num_layer, **kwargs):
self.output_dim = output_dim
self.num_layer = num_layer
super(CrossLayer, self).__init__(**kwargs)
def build(self, input_shape):
self.input_dim = input_shape[2]
self.W = []
self.bias = []
for i in range(self.num_layer):
self.W.append(self.add_weight(shape = [1, self.input_dim], initializer = 'glorot_uniform', name = 'w_' + str(i), trainable = True))
self.bias.append(self.add_weight(shape = [1, self.input_dim], initializer = 'zeros', name = 'b_' + str(i), trainable = True))
self.built = True
def call(self, input):
for i in range(self.num_layer):
if i == 0:
cross = Lambda(lambda x: Add()([K.sum(self.W[i] * K.batch_dot(K.reshape(x, (-1, self.input_dim, 1)), x), 1, keepdims = True), self.bias[i], x]))(input)
else:
cross = Lambda(lambda x: Add()([K.sum(self.W[i] * K.batch_dot(K.reshape(x, (-1, self.input_dim, 1)), input), 1, keepdims = True), self.bias[i], input]))(cross)
return Flatten()(cross)
def compute_output_shape(self, input_shape):
return (None, self.output_dim)
# modify the embedding columns here
def preprocessing(data):
# inverse transform one-hot to continuous column
df_onehot = data[[col for col in data.columns.tolist() if "Soil_Type" in col]]
#for i in df_onehot.columns.tolist():
# if df_onehot[i].sum() == 0:
# del df_onehot[i]
data["Soil"] = df_onehot.dot(np.array(range(df_onehot.columns.size))).astype(int)
data.drop([col for col in data.columns.tolist() if "Soil_Type" in col], axis = 1, inplace = True)
    label = np.array(OneHotEncoder().fit_transform(data["Cover_Type"].values.reshape(-1, 1)).todense())
del data["Cover_Type"]
cate_columns = ["Soil"]
cont_columns = [col for col in data.columns if col != "Soil"]
    # Feature normalization
scaler = StandardScaler()
data_cont = pd.DataFrame(scaler.fit_transform(data[cont_columns]), columns = cont_columns)
data_cate = data[cate_columns]
data = pd.concat([data_cate, data_cont], axis = 1)
return data, label, cate_columns, cont_columns
if __name__ == "__main__":
#data from kaggle forest cover type dataset
data = pd.read_csv("data/covtype.csv")
X, y, inp_layer, inp_embed = feature_generate(data)
#random split train and test by 9:1
train_index = random.sample(range(X.shape[0]), int(X.shape[0] * 0.9))
test_index = list(set(range(X.shape[0])) - set(train_index))
model = fit(inp_layer, inp_embed, X.iloc[train_index], y[train_index, :])
evaluate(X.iloc[test_index], y[test_index, :], model)
## DCN (Deep and Cross Network) demo implementation
Note: https://kaiyuanyokii2n.com/DCN.html
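
For reference, the cross network described in the DCN paper stacks layers of the form

$$x_{l+1} = x_0 \, (x_l^{T} w_l) + b_l + x_l$$

where $x_0$ is the concatenated embedding/continuous input and $x_l$ is the output of the l-th cross layer. The custom `CrossLayer` above is this demo's Keras take on that recurrence; its flattened output is concatenated with the deep part (a stack of ReLU `Dense` layers) before the final softmax.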
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Factorization Machines with tf\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 根据user/items的id,建立稀疏矩阵\n",
"参考:https://gist.github.com/babakx/7a3fc9739b7778f6673a458605e18963"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from itertools import count\n",
"from collections import defaultdict\n",
"from scipy.sparse import csr\n",
"import numpy as np\n",
"\n",
"def vectorize_dic(dic,ix=None,p=None,n=0,g=0):\n",
" # dic -- dictionary of feature lists. Keys are the name of features\n",
" # ix -- index generator (default None)\n",
" # p -- dimension of feature space (number of columns in the sparse matrix) (default None)\n",
" # n -- num sample\n",
" # g -- num group: eg: uese/items---> g=2\n",
" \n",
" if ix==None:\n",
" ix = dict()\n",
" \n",
" \n",
" nz = n * g # number of non-zores\n",
"\n",
" col_ix = np.empty(nz,dtype = int)\n",
"\n",
" i = 0\n",
" for k,lis in dic.items():\n",
" for t in range(len(lis)):\n",
" ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k),0) + 1\n",
" # 附加索引'l'以防止将具有相同id的不同列映射到同一个索引\n",
" col_ix[i+t*g] = ix[str(lis[t]) + str(k)]\n",
" i += 1\n",
"\n",
" row_ix = np.repeat(np.arange(0,n),g)\n",
" data = np.ones(nz)\n",
" if p == None:\n",
" p = len(ix)\n",
"\n",
" ixx = np.where(col_ix < p)\n",
" return csr.csr_matrix((data[ixx],(row_ix[ixx],col_ix[ixx])),shape=(n,p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loading data\n",
"使用MovieLens100k的数据,将数据转化成稀疏矩阵"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction import DictVectorizer\n",
"\n",
"cols = ['user','item','rating','timestamp']\n",
"train = pd.read_csv('data/ua.base',delimiter='\\t',names=cols)\n",
"test = pd.read_csv('data/ua.test', delimiter='\\t', names=cols)\n",
"\n",
"x_train = vectorize_dic({'users':train['user'].values, 'items':train['item'].values},n=len(train.index),g=2)\n",
"x_test= vectorize_dic({'users':test['user'].values,'items':test['item'].values},ix,x_train.shape[1],n=len(test.index),g=2)\n",
"\n",
"y_train = train.rating.values\n",
"y_test = test.rating.values"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Input To Dense\n",
"把输入的x_train和x_test转化成dense格式,使其能被tf使用。"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(90570, 2623) (9430, 2623)\n"
]
}
],
"source": [
"x_train = x_train.todense()\n",
"x_test = x_test.todense()\n",
"\n",
"print(x_train.shape, x_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 用tensorflow定义FM模型"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# 初始化参数\n",
"import tensorflow as tf\n",
"\n",
"n,p = x_train.shape\n",
"# number 0f latent factor\n",
"k = 10\n",
"\n",
"x = tf.placeholder('float',[None,p])\n",
"y = tf.placeholder('float',[None,1])\n",
"\n",
"# bias and weight\n",
"w0 = tf.Variable(tf.zeros([1]))\n",
"w = tf.Variable(tf.zeros([p]))\n",
"\n",
"#interaction factors\n",
"v = tf.Variable(tf.random_normal([k,p],mean=0,stddev=0.01))\n",
"\n",
"y_hat = tf.Variable(tf.zeros([n, 1]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 定义输出y的计算公式\n",
"$$ \\hat{y}(\\mathbf{x}) = w_0 + \\sum_{j=1}^{p}w_jx_j + \\frac{1}{2} \\sum_{f=1}^{k} ((\\sum_{j=1}^{p}v_{j,f}x_j)^2-\\sum_{j=1}^{p}v_{j,f}^2 x_j^2)$$"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From <ipython-input-19-174f8e1533a4>:2: calling reduce_sum (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"keep_dims is deprecated, use keepdims instead\n"
]
}
],
"source": [
"# 计算FM公式的输出\n",
"linear_terms = tf.add(w0,tf.reduce_sum(tf.multiply(w,x),1,keep_dims=True))\n",
"pair_interactions = 0.5 * tf.reduce_sum(\n",
" tf.subtract(\n",
" tf.pow(tf.matmul(x,tf.transpose(v)),2),\n",
" tf.matmul(tf.pow(x,2),tf.transpose(tf.pow(v,2)))),axis=1, keep_dims=True)\n",
"y_hat = tf.add(linear_terms, pair_interactions)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loss function\n",
"$$ L = \\sum_{i=1}^{n} (y_i - \\hat{y}_i)^2 + \\lambda_w ||W||^2 + \\lambda_v ||V||^2$$"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# L2 reg sum of squares of loss function\n",
"lambda_w = tf.constant(0.001, name='lambda_w')\n",
"lambda_v = tf.constant(0.001, name='lambda_v')\n",
"\n",
"l2_norm = tf.reduce_sum(\n",
" tf.add(\n",
" tf.multiply(lambda_w, tf.pow(w,2)),\n",
" tf.multiply(lambda_v, tf.pow(v,2))))\n",
"error = tf.reduce_mean(tf.square(tf.subtract(y,y_hat)))\n",
"loss = tf.add(error,l2_norm)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Optimization\n",
"用SGD进行优化: $\\Theta_{i+1} = \\Theta_{i} - \\eta \\frac{\\delta L}{\\delta \\Theta}$"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Mini-batcher"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def batcher(X_,y_=None,batch_size=-1):\n",
" n_samples = X_.shape[0]\n",
" if batch_size == -1:\n",
" batch_size = n_samples\n",
" if batch_size < 1:\n",
" raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))\n",
" \n",
" for i in range(0,n_samples,batch_size):\n",
" upper_bound = min(i + batch_size,n_samples)\n",
" ret_x = X_[i:upper_bound]\n",
" ret_y = None\n",
" if y_ is not None:\n",
" ret_y = y_[i:i + batch_size]\n",
" yield (ret_x,ret_y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tensorflow graph and traing"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6ccb183e948f4aa88f20186a0a31ffe8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=10), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from tqdm import tqdm_notebook as tqdm\n",
"\n",
"epochs = 10\n",
"batch_size = 1000\n",
"\n",
"# tf graph\n",
"init = tf.global_variables_initializer()\n",
"sess = tf.Session()\n",
"\n",
"sess.run(init)\n",
"\n",
"for epochs in tqdm(range(epochs),unit='epoch'):\n",
" perm = np.random.permutation(x_train.shape[0])\n",
" # iterate over batches\n",
" for bX,bY in batcher(x_train[perm],y_train[perm],batch_size):\n",
" sess.run(optimizer, feed_dict={x: bX.reshape(-1,p), y: bY.reshape(-1,1)})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 评价模型\n",
"用RMSE评价"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1.1257708]\n"
]
}
],
"source": [
"errors = []\n",
"for bX,bY in batcher(x_test,y_test):\n",
" errors.append(sess.run(error,feed_dict={x: bX.reshape(-1,p), y: bY.reshape(-1,1)}))\n",
"\n",
"RMSE = np.sqrt(np.array(errors))\n",
"print(RMSE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
## FM (Factorization Machine) demo implementation
Note: https://kaiyuanyokii2n.com/FM.html#more
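
The notebook's `pair_interactions` term uses the standard FM trick of rewriting the pairwise-interaction sum in O(kp) time. Below is a minimal NumPy check of that identity (the toy sizes and random values are assumptions for illustration only, not taken from the notebook):

```python
import numpy as np

# Check: sum_{i<j} <v_i, v_j> x_i x_j
#     == 0.5 * sum_f [ (sum_j v_{j,f} x_j)^2 - sum_j v_{j,f}^2 x_j^2 ]
rng = np.random.RandomState(0)
p, k = 6, 3                 # p features, k latent factors (toy values)
x = rng.rand(p)
v = rng.rand(p, k)          # note: the notebook stores v transposed, with shape (k, p)

brute = sum(v[i].dot(v[j]) * x[i] * x[j]
            for i in range(p) for j in range(i + 1, p))
fast = 0.5 * np.sum(v.T.dot(x) ** 2 - (v.T ** 2).dot(x ** 2))
assert np.allclose(brute, fast)
```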
## PNN (Product-based Neural Network) demo implementation
https://kaiyuanyokii2n.com/PNN.html
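
For orientation, here is a minimal NumPy sketch of what the product layer in `model.py` computes for a single sample in the inner-product variant (`use_inner=True`). The weight names mirror `_initialize_weights`; the shapes F, K, D1 and the random values are toy assumptions:

```python
import numpy as np

rng = np.random.RandomState(0)
F, K, D1 = 4, 8, 5                      # fields, embedding size, deep_init_size (toy values)
E = rng.rand(F, K)                      # field embeddings of one sample
W_lin = rng.rand(D1, F, K)              # 'product-linear'
theta = rng.rand(D1, F)                 # 'product-quadratic-inner'
bias = rng.rand(D1)                     # 'product-bias'

# linear signal: elementwise product of each weight tensor with the embeddings, summed
lz = np.array([np.sum(W_lin[i] * E) for i in range(D1)])
# quadratic (inner-product) signal: norm of the theta-weighted sum of field embeddings
lp = np.array([np.linalg.norm((theta[i][:, None] * E).sum(axis=0)) for i in range(D1)])
hidden = np.maximum(lz + lp + bias, 0)  # relu(lz + lp + b), then fed into the deep layers
print(hidden.shape)                     # (D1,)
```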
TRAIN_FILE = "data/train.csv"
TEST_FILE = "data/test.csv"
SUB_DIR = "output"
NUM_SPLITS = 3
RANDOM_SEED = 2017
# types of columns of the dataset dataframe
CATEGORICAL_COLS = [
# 'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
# 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat',
# 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
# 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
# 'ps_car_10_cat', 'ps_car_11_cat',
]
NUMERIC_COLS = [
# # binary
# "ps_ind_06_bin", "ps_ind_07_bin", "ps_ind_08_bin",
# "ps_ind_09_bin", "ps_ind_10_bin", "ps_ind_11_bin",
# "ps_ind_12_bin", "ps_ind_13_bin", "ps_ind_16_bin",
# "ps_ind_17_bin", "ps_ind_18_bin",
# "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
# "ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin",
# numeric
"ps_reg_01", "ps_reg_02", "ps_reg_03",
"ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
# feature engineering
"missing_feat", "ps_car_13_x_ps_reg_03",
]
IGNORE_COLS = [
"id", "target",
"ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
"ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08",
"ps_calc_09", "ps_calc_10", "ps_calc_11", "ps_calc_12",
"ps_calc_13", "ps_calc_14",
"ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
"ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin"
]
import pandas as pd
class FeatureDictionary(object):
def __init__(self,trainfile=None,testfile=None,
dfTrain=None,dfTest=None,numeric_cols=[],
ignore_cols=[]):
assert not ((trainfile is None) and (dfTrain is None)), "trainfile or dfTrain at least one is set"
assert not ((trainfile is not None) and (dfTrain is not None)), "only one can be set"
assert not ((testfile is None) and (dfTest is None)), "testfile or dfTest at least one is set"
assert not ((testfile is not None) and (dfTest is not None)), "only one can be set"
self.trainfile = trainfile
self.testfile = testfile
self.dfTrain = dfTrain
self.dfTest = dfTest
self.numeric_cols = numeric_cols
self.ignore_cols = ignore_cols
self.gen_feat_dict()
def gen_feat_dict(self):
if self.dfTrain is None:
dfTrain = pd.read_csv(self.trainfile)
else:
dfTrain = self.dfTrain
if self.dfTest is None:
dfTest = pd.read_csv(self.testfile)
else:
dfTest = self.dfTest
df = pd.concat([dfTrain,dfTest])
self.feat_dict = {}
tc = 0
for col in df.columns:
if col in self.ignore_cols:
continue
if col in self.numeric_cols:
self.feat_dict[col] = tc
tc += 1
else:
us = df[col].unique()
print(us)
self.feat_dict[col] = dict(zip(us,range(tc,len(us)+tc)))
tc += len(us)
self.feat_dim = tc
class DataParser(object):
def __init__(self,feat_dict):
self.feat_dict = feat_dict
def parse(self,infile=None,df=None,has_label=False):
assert not ((infile is None) and (df is None)), "infile or df at least one is set"
assert not ((infile is not None) and (df is not None)), "only one can be set"
if infile is None:
dfi = df.copy()
else:
dfi = pd.read_csv(infile)
if has_label:
y = dfi['target'].values.tolist()
dfi.drop(['id','target'],axis=1,inplace=True)
else:
ids = dfi['id'].values.tolist()
dfi.drop(['id'],axis=1,inplace=True)
# dfi for feature index
# dfv for feature value which can be either binary (1/0) or float (e.g., 10.24)
dfv = dfi.copy()
for col in dfi.columns:
if col in self.feat_dict.ignore_cols:
dfi.drop(col,axis=1,inplace=True)
dfv.drop(col,axis=1,inplace=True)
continue
if col in self.feat_dict.numeric_cols:
dfi[col] = self.feat_dict.feat_dict[col]
else:
dfi[col] = dfi[col].map(self.feat_dict.feat_dict[col])
dfv[col] = 1.
xi = dfi.values.tolist()
xv = dfv.values.tolist()
if has_label:
return xi,xv,y
else:
return xi,xv,ids
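# A minimal, self-contained usage sketch: the toy dataframes below are invented for
# illustration (they are not the project's real data). It shows how FeatureDictionary
# and DataParser turn dataframes into (Xi, Xv): numeric columns keep their value under
# a fixed index, while categorical columns get one index per distinct value and value 1.0.
if __name__ == "__main__":
    toy_train = pd.DataFrame({"id": [1, 2], "target": [0, 1],
                              "ps_reg_01": [0.5, 0.9], "ps_ind_02_cat": [1, 2]})
    toy_test = pd.DataFrame({"id": [3], "ps_reg_01": [0.1], "ps_ind_02_cat": [1]})
    fd = FeatureDictionary(dfTrain=toy_train, dfTest=toy_test,
                           numeric_cols=["ps_reg_01"], ignore_cols=["id", "target"])
    Xi, Xv, y = DataParser(feat_dict=fd).parse(df=toy_train, has_label=True)
    print(Xi)  # per-sample feature indices, e.g. [[0, 1], [0, 2]]
    print(Xv)  # per-sample feature values,  e.g. [[0.5, 1.0], [0.9, 1.0]]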
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from DataReader import FeatureDictionary, DataParser
from matplotlib import pyplot as plt
import config
from model import PNN
def load_data():
dfTrain = pd.read_csv(config.TRAIN_FILE)
dfTest = pd.read_csv(config.TEST_FILE)
def preprocess(df):
cols = [c for c in df.columns if c not in ['id','target']]
#df['missing_feat'] = np.sum(df[df[cols]==-1].values,axis=1)
df["missing_feat"] = np.sum((df[cols] == -1).values, axis=1)
df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
return df
dfTrain = preprocess(dfTrain)
dfTest = preprocess(dfTest)
cols = [c for c in dfTrain.columns if c not in ['id','target']]
cols = [c for c in cols if (not c in config.IGNORE_COLS)]
X_train = dfTrain[cols].values
y_train = dfTrain['target'].values
X_test = dfTest[cols].values
ids_test = dfTest['id'].values
cat_features_indices = [i for i,c in enumerate(cols) if c in config.CATEGORICAL_COLS]
return dfTrain,dfTest,X_train,y_train,X_test,ids_test,cat_features_indices
def run_base_model_pnn(dfTrain,dfTest,folds,pnn_params):
fd = FeatureDictionary(dfTrain=dfTrain,
dfTest=dfTest,
numeric_cols=config.NUMERIC_COLS,
ignore_cols = config.IGNORE_COLS)
data_parser = DataParser(feat_dict= fd)
    # Xi_train: the feature index of each column
    # Xv_train: the corresponding feature value of each column
Xi_train,Xv_train,y_train = data_parser.parse(df=dfTrain,has_label=True)
Xi_test,Xv_test,ids_test = data_parser.parse(df=dfTest)
print(dfTrain.dtypes)
pnn_params['feature_size'] = fd.feat_dim
pnn_params['field_size'] = len(Xi_train[0])
_get = lambda x,l:[x[i] for i in l]
for i, (train_idx, valid_idx) in enumerate(folds):
Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)
pnn = PNN(**pnn_params)
pnn.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
pnn_params = {
"embedding_size":8,
"deep_layers":[32,32],
"dropout_deep":[0.5,0.5,0.5],
"deep_layer_activation":tf.nn.relu,
"epoch":30,
"batch_size":1024,
"learning_rate":0.001,
"optimizer":"adam",
"batch_norm":1,
"batch_norm_decay":0.995,
"verbose":True,
"random_seed":config.RANDOM_SEED,
"deep_init_size":50,
"use_inner":False
}
# load data
dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices = load_data()
# folds
folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True,
random_state=config.RANDOM_SEED).split(X_train, y_train))
# run_base_model_pnn only trains and validates each fold; it returns nothing to unpack
run_base_model_pnn(dfTrain, dfTest, folds, pnn_params)
import numpy as np
import tensorflow as tf
from time import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score
class PNN(BaseEstimator, TransformerMixin):
def __init__(self, feature_size, field_size,
embedding_size=8,
deep_layers=[32, 32], deep_init_size = 50,
dropout_deep=[0.5, 0.5, 0.5],
deep_layer_activation=tf.nn.relu,
epoch=10, batch_size=256,
learning_rate=0.001, optimizer="adam",
batch_norm=0, batch_norm_decay=0.995,
verbose=False, random_seed=2016,
loss_type="logloss", eval_metric=roc_auc_score,
greater_is_better=True,
use_inner=True):
assert loss_type in ["logloss", "mse"], \
"loss_type can be either 'logloss' for classification task or 'mse' for regression task"
self.feature_size = feature_size
self.field_size = field_size
self.embedding_size = embedding_size
self.deep_layers = deep_layers
self.deep_init_size = deep_init_size
self.dropout_dep = dropout_deep
self.deep_layers_activation = deep_layer_activation
self.epoch = epoch
self.batch_size = batch_size
self.learning_rate = learning_rate
self.optimizer_type = optimizer
self.batch_norm = batch_norm
self.batch_norm_decay = batch_norm_decay
self.verbose = verbose
self.random_seed = random_seed
self.loss_type = loss_type
self.eval_metric = eval_metric
self.greater_is_better = greater_is_better
self.train_result,self.valid_result = [],[]
self.use_inner = use_inner
self._init_graph()
def _init_graph(self):
self.graph = tf.Graph()
with self.graph.as_default():
tf.set_random_seed(self.random_seed)
self.feat_index = tf.placeholder(tf.int32,
shape=[None,None],
name='feat_index')
self.feat_value = tf.placeholder(tf.float32,
shape=[None,None],
name='feat_value')
self.label = tf.placeholder(tf.float32,shape=[None,1],name='label')
self.dropout_keep_deep = tf.placeholder(tf.float32,shape=[None],name='dropout_deep_deep')
self.train_phase = tf.placeholder(tf.bool,name='train_phase')
self.weights = self._initialize_weights()
# Embeddings
self.embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'],self.feat_index) # N * F * K
feat_value = tf.reshape(self.feat_value,shape=[-1,self.field_size,1])
self.embeddings = tf.multiply(self.embeddings,feat_value) # N * F * K
            # Linear signal
linear_output = []
for i in range(self.deep_init_size):
linear_output.append(tf.reshape(
tf.reduce_sum(tf.multiply(self.embeddings,self.weights['product-linear'][i]),axis=[1,2]),shape=(-1,1)))# N * 1
self.lz = tf.concat(linear_output,axis=1) # N * init_deep_size
            # Quadratic signal
quadratic_output = []
if self.use_inner:
for i in range(self.deep_init_size):
theta = tf.multiply(self.embeddings,tf.reshape(self.weights['product-quadratic-inner'][i],(1,-1,1))) # N * F * K
quadratic_output.append(tf.reshape(tf.norm(tf.reduce_sum(theta,axis=1),axis=1),shape=(-1,1))) # N * 1
else:
embedding_sum = tf.reduce_sum(self.embeddings,axis=1)
p = tf.matmul(tf.expand_dims(embedding_sum,2),tf.expand_dims(embedding_sum,1)) # N * K * K
for i in range(self.deep_init_size):
theta = tf.multiply(p,tf.expand_dims(self.weights['product-quadratic-outer'][i],0)) # N * K * K
quadratic_output.append(tf.reshape(tf.reduce_sum(theta,axis=[1,2]),shape=(-1,1))) # N * 1
self.lp = tf.concat(quadratic_output,axis=1) # N * init_deep_size
self.y_deep = tf.nn.relu(tf.add(tf.add(self.lz, self.lp), self.weights['product-bias']))
self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
# Deep component
for i in range(0,len(self.deep_layers)):
self.y_deep = tf.add(tf.matmul(self.y_deep,self.weights["layer_%d" %i]), self.weights["bias_%d"%i])
self.y_deep = self.deep_layers_activation(self.y_deep)
self.y_deep = tf.nn.dropout(self.y_deep,self.dropout_keep_deep[i+1])
self.out = tf.add(tf.matmul(self.y_deep,self.weights['output']),self.weights['output_bias'])
# loss
if self.loss_type == "logloss":
self.out = tf.nn.sigmoid(self.out)
self.loss = tf.losses.log_loss(self.label, self.out)
elif self.loss_type == "mse":
self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
if self.optimizer_type == "adam":
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
epsilon=1e-8).minimize(self.loss)
elif self.optimizer_type == "adagrad":
self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,
initial_accumulator_value=1e-8).minimize(self.loss)
elif self.optimizer_type == "gd":
self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
elif self.optimizer_type == "momentum":
self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(
self.loss)
#init
self.saver = tf.train.Saver()
init = tf.global_variables_initializer()
self.sess = tf.Session()
self.sess.run(init)
# number of params
total_parameters = 0
for variable in self.weights.values():
shape = variable.get_shape()
variable_parameters = 1
for dim in shape:
variable_parameters *= dim.value
total_parameters += variable_parameters
if self.verbose > 0:
print("#params: %d" % total_parameters)
def _initialize_weights(self):
weights = dict()
#embeddings
weights['feature_embeddings'] = tf.Variable(
tf.random_normal([self.feature_size,self.embedding_size],0.0,0.01),
name='feature_embeddings')
weights['feature_bias'] = tf.Variable(tf.random_normal([self.feature_size,1],0.0,1.0),name='feature_bias')
#Product Layers
if self.use_inner:
weights['product-quadratic-inner'] = tf.Variable(tf.random_normal([self.deep_init_size,self.field_size],0.0,0.01))
else:
weights['product-quadratic-outer'] = tf.Variable(
tf.random_normal([self.deep_init_size, self.embedding_size,self.embedding_size], 0.0, 0.01))
weights['product-linear'] = tf.Variable(tf.random_normal([self.deep_init_size,self.field_size,self.embedding_size],0.0,0.01))
        weights['product-bias'] = tf.Variable(tf.random_normal([self.deep_init_size, ], 0.0, 1.0))
#deep layers
num_layer = len(self.deep_layers)
input_size = self.deep_init_size
glorot = np.sqrt(2.0/(input_size + self.deep_layers[0]))
weights['layer_0'] = tf.Variable(
np.random.normal(loc=0,scale=glorot,size=(input_size,self.deep_layers[0])),dtype=np.float32
)
weights['bias_0'] = tf.Variable(
np.random.normal(loc=0,scale=glorot,size=(1,self.deep_layers[0])),dtype=np.float32
)
for i in range(1,num_layer):
glorot = np.sqrt(2.0 / (self.deep_layers[i - 1] + self.deep_layers[i]))
weights["layer_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i - 1], self.deep_layers[i])),
dtype=np.float32) # layers[i-1] * layers[i]
weights["bias_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])),
dtype=np.float32) # 1 * layer[i]
        glorot = np.sqrt(2.0 / (self.deep_layers[-1] + 1))
weights['output'] = tf.Variable(np.random.normal(loc=0,scale=glorot,size=(self.deep_layers[-1],1)),dtype=np.float32)
weights['output_bias'] = tf.Variable(tf.constant(0.01),dtype=np.float32)
return weights
def get_batch(self,Xi,Xv,y,batch_size,index):
start = index * batch_size
end = (index + 1) * batch_size
end = end if end < len(y) else len(y)
return Xi[start:end],Xv[start:end],[[y_] for y_ in y[start:end]]
    # shuffle three lists simultaneously
def shuffle_in_unison_scary(self, a, b, c):
rng_state = np.random.get_state()
np.random.shuffle(a)
np.random.set_state(rng_state)
np.random.shuffle(b)
np.random.set_state(rng_state)
np.random.shuffle(c)
def predict(self, Xi, Xv,y):
"""
:param Xi: list of list of feature indices of each sample in the dataset
:param Xv: list of list of feature values of each sample in the dataset
        :return: loss on the given data (used here as the validation metric)
        """
feed_dict = {self.feat_index: Xi,
self.feat_value: Xv,
self.label: y,
self.dropout_keep_deep: [1.0] * len(self.dropout_dep),
self.train_phase: True}
loss = self.sess.run([self.loss], feed_dict=feed_dict)
return loss
def fit_on_batch(self,Xi,Xv,y):
feed_dict = {self.feat_index:Xi,
self.feat_value:Xv,
self.label:y,
self.dropout_keep_deep:self.dropout_dep,
self.train_phase:True}
loss,opt = self.sess.run([self.loss,self.optimizer],feed_dict=feed_dict)
return loss
def fit(self, Xi_train, Xv_train, y_train,
Xi_valid=None, Xv_valid=None, y_valid=None,
early_stopping=False, refit=False):
"""
:param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...]
indi_j is the feature index of feature field j of sample i in the training set
:param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...]
vali_j is the feature value of feature field j of sample i in the training set
vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features)
:param y_train: label of each sample in the training set
:param Xi_valid: list of list of feature indices of each sample in the validation set
:param Xv_valid: list of list of feature values of each sample in the validation set
:param y_valid: label of each sample in the validation set
:param early_stopping: perform early stopping or not
:param refit: refit the model on the train+valid dataset or not
:return: None
"""
has_valid = Xv_valid is not None
for epoch in range(self.epoch):
t1 = time()
self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
total_batch = int(len(y_train) / self.batch_size)
for i in range(total_batch):
Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train, self.batch_size, i)
self.fit_on_batch(Xi_batch, Xv_batch, y_batch)
if has_valid:
y_valid = np.array(y_valid).reshape((-1,1))
loss = self.predict(Xi_valid, Xv_valid, y_valid)
print("epoch",epoch,"loss",loss)
## Demo implementations of several recommendation models
Notes: https://kaiyuanyokii2n.com/