# Factorization Machines with tf
## 根据user/items的id,建立稀疏矩阵
from collections import defaultdict
from itertools import count

import numpy as np
from scipy.sparse import csr
from scipy.sparse import csr_matrix
def vectorize_dic(dic, ix=None, p=None, n=0, g=0):
    """Build a one-hot sparse design matrix from grouped categorical ids.

    Each sample contributes one non-zero entry per feature group, so a row
    has at most ``g`` ones.

    Args:
        dic: Mapping from feature-group name (e.g. ``'users'``, ``'items'``)
            to a sequence of ``n`` raw ids, one per sample.
        ix: Mapping from feature key (``str(id) + str(group name)``) to column
            index. It is updated in place with every key seen, so passing the
            same dict to a later call reuses the same columns. A fresh dict is
            used when ``None``. New keys are numbered ``len(ix)``, which
            assumes the existing indices are ``0 .. len(ix) - 1``.
        p: Number of columns of the result. Defaults to ``len(ix)`` after all
            keys have been registered.
        n: Number of samples (rows).
        g: Number of feature groups, e.g. users/items -> ``g=2``.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape ``(n, p)``. Entries whose
        column index is ``>= p`` (keys unseen when ``p`` was fixed) are
        dropped.

    Raises:
        ValueError: If ``dic`` does not hold exactly ``g`` groups, or a group
            does not hold exactly ``n`` ids.
    """
    if ix is None:
        ix = {}
    if len(dic) != g:
        raise ValueError(
            'expected {} feature groups, got {}'.format(g, len(dic)))

    nz = n * g  # number of non-zeros
    col_ix = np.empty(nz, dtype=int)
    for group, (name, values) in enumerate(dic.items()):
        if len(values) != n:
            raise ValueError(
                'feature group {!r} has {} values, expected {}'.format(
                    name, len(values), n))
        # The group name is appended to the id so that equal ids coming from
        # different groups are not mapped to the same column.
        suffix = str(name)
        for row, value in enumerate(values):
            key = str(value) + suffix
            # First sighting of a key gets the next free column index.
            col_ix[group + row * g] = ix.setdefault(key, len(ix))

    row_ix = np.repeat(np.arange(n), g)
    data = np.ones(nz)
    if p is None:
        p = len(ix)

    # Keep only entries that fit inside the requested feature space.
    keep = np.where(col_ix < p)
    return csr_matrix((data[keep], (row_ix[keep], col_ix[keep])), shape=(n, p))
# Loading data
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

# Load the tab-separated train/test rating files.
cols = ['user', 'item', 'rating', 'timestamp']
train = pd.read_csv('data/ua.base', delimiter='\t', names=cols)
test = pd.read_csv('data/ua.test', delimiter='\t', names=cols)

# vectorize_dic returns only the matrix, so the feature-key -> column mapping
# is created here and passed in; the training call fills it in place and the
# test call reuses it so both matrices share the same columns.
ix = {}
x_train = vectorize_dic(
    {'users': train['user'].values, 'items': train['item'].values},
    ix, n=len(train.index), g=2)
x_test = vectorize_dic(
    {'users': test['user'].values, 'items': test['item'].values},
    ix, x_train.shape[1], n=len(test.index), g=2)

y_train = train.rating.values
y_test = test.rating.values
# Input To Dense
# Convert the sparse matrices to dense arrays for feeding into the graph.
# toarray() yields a plain ndarray; todense() would yield np.matrix, which
# NumPy discourages for new code.
x_train = x_train.toarray()
x_test = x_test.toarray()
print(x_train.shape, x_test.shape)
# 用tensorflow定义FM模型
# Initialize the model parameters.
import tensorflow as tf

n, p = x_train.shape

# Number of latent factors.
k = 10

# Inputs: a batch of feature rows and the matching targets as a column.
x = tf.placeholder('float', [None, p])
y = tf.placeholder('float', [None, 1])

# Bias and linear weights.
w0 = tf.Variable(tf.zeros([1]))
w = tf.Variable(tf.zeros([p]))

# Interaction factors, one k-dimensional vector per feature (stored as [k, p]).
v = tf.Variable(tf.random_normal([k, p], mean=0, stddev=0.01))

# y_hat is defined by the FM output expression built from these parameters;
# no variable is allocated for it here.
## 定义输出y的计算公式
$$ \hat{y}(\mathbf{x}) = w_0 + \sum_{j=1}^{p} w_j x_j + \frac{1}{2} \sum_{f=1}^{k} \left( \left( \sum_{j=1}^{p} v_{j,f} x_j \right)^2 - \sum_{j=1}^{p} v_{j,f}^2 x_j^2 \right) $$
# Compute the FM model output.
# Linear part: w0 + sum_j w_j * x_j, kept as a [batch, 1] column.
linear_terms = tf.add(
    w0, tf.reduce_sum(tf.multiply(w, x), axis=1, keepdims=True))

# Pairwise part: 0.5 * sum_f ((x . v_f)^2 - (x^2 . v_f^2)), also [batch, 1].
pair_interactions = 0.5 * tf.reduce_sum(
    tf.subtract(
        tf.pow(tf.matmul(x, tf.transpose(v)), 2),
        tf.matmul(tf.pow(x, 2), tf.transpose(tf.pow(v, 2)))),
    axis=1, keepdims=True)

y_hat = tf.add(linear_terms, pair_interactions)
# Loss function
$$ L = \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 + \lambda_w ||W||^2 + \lambda_v ||V||^2 $$
# L2-regularized loss: mean squared error plus weighted squared norms.
lambda_w = tf.constant(0.001, name='lambda_w')
lambda_v = tf.constant(0.001, name='lambda_v')

# Reduce each penalty separately. Adding w**2 (shape [p]) to v**2 (shape
# [k, p]) before summing would broadcast w across the k rows and count the
# w penalty k times.
l2_norm = tf.add(
    tf.multiply(lambda_w, tf.reduce_sum(tf.pow(w, 2))),
    tf.multiply(lambda_v, tf.reduce_sum(tf.pow(v, 2))))

error = tf.reduce_mean(tf.square(tf.subtract(y, y_hat)))
loss = tf.add(error, l2_norm)
# Optimization
用SGD进行优化: $\Theta_{i+1} = \Theta_{i} - \eta \frac{\partial L}{\partial \Theta}$
# Training op: one plain gradient-descent step (learning rate 0.01) that
# minimizes `loss`. Note that `optimizer` holds the result of `.minimize(...)`,
# i.e. the op to run each step, not the optimizer object itself.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)
# Mini-batcher
def batcher(X_, y_=None, batch_size=-1):
    """Yield consecutive mini-batches of ``X_`` (and ``y_`` when given).

    Args:
        X_: Samples, indexed along the first axis; must expose ``shape``.
        y_: Optional targets aligned with ``X_``. When ``None`` the second
            element of every yielded pair is ``None``.
        batch_size: Rows per batch. ``-1`` means a single batch holding all
            samples. The last batch may be shorter.

    Yields:
        ``(X_batch, y_batch)`` tuples in input order. Nothing is yielded for
        empty input.

    Raises:
        ValueError: If ``batch_size`` is neither ``-1`` nor at least 1. As
            this is a generator, the error surfaces on first iteration.
    """
    if batch_size != -1 and batch_size < 1:
        raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))

    n_samples = X_.shape[0]
    if n_samples == 0:
        # Without this guard the "all samples" default would become a batch
        # size of 0, which is not a valid range step.
        return
    if batch_size == -1:
        batch_size = n_samples

    for start in range(0, n_samples, batch_size):
        stop = min(start + batch_size, n_samples)
        ret_y = None if y_ is None else y_[start:stop]
        yield (X_[start:stop], ret_y)
# TensorFlow graph and training
from tqdm import tqdm_notebook as tqdm

epochs = 10
batch_size = 1000

# Launch the graph. The initializer op has to be run once before any training
# step, otherwise the variables are read uninitialized.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

# The loop variable is named `epoch` so it does not overwrite `epochs`.
for epoch in tqdm(range(epochs), unit='epoch'):
    # Reshuffle the training rows every epoch.
    perm = np.random.permutation(x_train.shape[0])
    # Iterate over mini-batches.
    for bX, bY in batcher(x_train[perm], y_train[perm], batch_size):
        sess.run(optimizer,
                 feed_dict={x: bX.reshape(-1, p), y: bY.reshape(-1, 1)})
# 评价模型
"cell_type": "code",
# Evaluate on the test set: collect the mean squared error of every batch
# (batcher's default yields the whole set as one batch), then take the square
# root to obtain the RMSE per batch.
errors = []
for bX, bY in batcher(x_test, y_test):
    feed = {x: bX.reshape(-1, p), y: bY.reshape(-1, 1)}
    batch_mse = sess.run(error, feed_dict=feed)
    errors.append(batch_mse)
RMSE = np.sqrt(np.array(errors))
Note: https://kaiyuanyokii2n.com/FM.html#more
## 实现一些推荐算法的模型demo
## 实现一些推荐算法的模型demo
