From 2eba5d90856987b1e32ae9f495e98d9279a44029 Mon Sep 17 00:00:00 2001
From: malin10
Date: Sat, 6 Jun 2020 00:09:08 +0800
Subject: [PATCH] add movie_recommand_demo

---
 core/trainers/single_infer.py                 |  17 +-
 demo/__init__.py                              |  13 ++
 demo/movie_recommand/__init__.py              |  13 ++
 .../data/online_user/users.dat                |   2 +
 demo/movie_recommand/data/process_ml_1m.py    | 146 ++++++++++++++++++
 demo/movie_recommand/data/split.py            |  51 ++++++
 demo/movie_recommand/data/test/log.data.hash  |   0
 demo/movie_recommand/data/train/log.data.hash |   0
 demo/movie_recommand/data_prepare.sh          |  18 +++
 demo/movie_recommand/offline_test.sh          |  12 ++
 demo/movie_recommand/online_rank.sh           |   8 +
 demo/movie_recommand/online_recall.sh         |   9 ++
 demo/movie_recommand/rank/__init__.py         |  13 ++
 demo/movie_recommand/rank/config.yaml         |  93 +++++++++++
 demo/movie_recommand/rank/model.py            | 120 ++++++++++++++
 demo/movie_recommand/recall/__init__.py       |  13 ++
 demo/movie_recommand/recall/config.yaml       |  93 +++++++++++
 demo/movie_recommand/recall/model.py          | 100 ++++++++++++
 demo/movie_recommand/train.sh                 |   8 +
 19 files changed, 728 insertions(+), 1 deletion(-)
 create mode 100755 demo/__init__.py
 create mode 100755 demo/movie_recommand/__init__.py
 create mode 100644 demo/movie_recommand/data/online_user/users.dat
 create mode 100644 demo/movie_recommand/data/process_ml_1m.py
 create mode 100644 demo/movie_recommand/data/split.py
 create mode 100644 demo/movie_recommand/data/test/log.data.hash
 create mode 100644 demo/movie_recommand/data/train/log.data.hash
 create mode 100644 demo/movie_recommand/data_prepare.sh
 create mode 100644 demo/movie_recommand/offline_test.sh
 create mode 100644 demo/movie_recommand/online_rank.sh
 create mode 100644 demo/movie_recommand/online_recall.sh
 create mode 100755 demo/movie_recommand/rank/__init__.py
 create mode 100755 demo/movie_recommand/rank/config.yaml
 create mode 100755 demo/movie_recommand/rank/model.py
 create mode 100755 demo/movie_recommand/recall/__init__.py
 create mode 100755 demo/movie_recommand/recall/config.yaml
 create mode 100755 demo/movie_recommand/recall/model.py
 create mode 100644 demo/movie_recommand/train.sh

diff --git a/core/trainers/single_infer.py b/core/trainers/single_infer.py
index d54e418c..fcc92e2e 100755
--- a/core/trainers/single_infer.py
+++ b/core/trainers/single_infer.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 import time
 import logging
 import os
+import json
+import numpy as np
 
 import paddle.fluid as fluid
 from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer
@@ -263,8 +265,10 @@ class SingleInfer(TranspileTrainer):
             envs.get_global_env("runner." + self._runner_name +
                                 ".print_interval", 20))
         metrics_format.append("{}: {{}}".format("batch"))
+        metrics_indexes = dict()
         for name, var in metrics.items():
             metrics_varnames.append(var.name)
+            metrics_indexes[var.name] = len(metrics_varnames) - 1
             metrics_format.append("{}: {{}}".format(name))
         metrics_format = ", ".join(metrics_format)
 
@@ -272,19 +276,30 @@ class SingleInfer(TranspileTrainer):
         reader.start()
         batch_id = 0
         scope = self._model[model_name][2]
+
+        infer_results = []
         with fluid.scope_guard(scope):
             try:
                 while True:
                     metrics_rets = self._exe.run(program=program,
-                                                 fetch_list=metrics_varnames)
+                                                 fetch_list=metrics_varnames,
+                                                 return_numpy=False)
                     metrics = [batch_id]
                     metrics.extend(metrics_rets)
+                    batch_infer_result = {}
+                    for k, v in metrics_indexes.items():
+                        batch_infer_result[k] = np.array(metrics_rets[
+                            v]).tolist()
+                    infer_results.append(batch_infer_result)
+
                     if batch_id % fetch_period == 0 and batch_id != 0:
                         print(metrics_format.format(*metrics))
                     batch_id += 1
             except fluid.core.EOFException:
                 reader.reset()
 
+        with open(model_dict['save_path'], 'w') as fout:
+            json.dump(infer_results, fout)
 
     def terminal(self, context):
         context['is_exit'] = True

diff --git a/demo/__init__.py b/demo/__init__.py
new file mode 100755
index 00000000..abf198b9
--- /dev/null
+++ b/demo/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/demo/movie_recommand/__init__.py b/demo/movie_recommand/__init__.py
new file mode 100755
index 00000000..abf198b9
--- /dev/null
+++ b/demo/movie_recommand/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
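Note on the single_infer.py change above: for every batch, the fetched tensors are converted to nested Python lists, keyed by the fetch variable's name (var.name), accumulated over the whole pass, and written as one JSON document to the phase's save_path. Below is a minimal sketch of how a post-processing script might read that file; the file path and the printed summary are illustrative only, not part of this patch:

    import json

    def load_infer_results(path):
        # One dict per batch: {fetch_var_name: nested list of values}.
        with open(path) as f:
            return json.load(f)

    if __name__ == "__main__":
        batches = load_infer_results("recall/infer_result")
        print("batches:", len(batches))
        if batches:
            for name, values in sorted(batches[0].items()):
                print(name, "rows in first batch:", len(values))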
diff --git a/demo/movie_recommand/data/online_user/users.dat b/demo/movie_recommand/data/online_user/users.dat
new file mode 100644
index 00000000..e9649b70
--- /dev/null
+++ b/demo/movie_recommand/data/online_user/users.dat
@@ -0,0 +1,2 @@
+2181::M::25::0
+2073::F::18::4
diff --git a/demo/movie_recommand/data/process_ml_1m.py b/demo/movie_recommand/data/process_ml_1m.py
new file mode 100644
index 00000000..7125f625
--- /dev/null
+++ b/demo/movie_recommand/data/process_ml_1m.py
@@ -0,0 +1,146 @@
+#coding=utf8
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
+import random
+import json
+user_fea = ["userid", "gender", "age", "occupation"]
+movie_fea = ["movieid", "title", "genres"]
+rating_fea = ["userid", "movieid", "rating", "time"]
+dict_size = 60000000
+hash_dict = dict()
+
+data_path = "ml-1m"
+test_user_path = "online_user"
+
+
+def process(path):
+    user_dict = parse_data(data_path + "/users.dat", user_fea)
+    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)
+
+    for line in open(path):
+        line = line.strip()
+        arr = line.split("::")
+        userid = arr[0]
+        movieid = arr[1]
+        out_str = "time:%s\t%s\t%s\tlabel:%s" % (arr[3], user_dict[userid],
+                                                 movie_dict[movieid], arr[2])
+        log_id = hash(out_str) % 1000000000
+        print "%s\t%s" % (log_id, out_str)
+
+
+def parse_data(file_name, feas):
+    dict = {}
+    for line in open(file_name):
+        line = line.strip()
+        arr = line.split("::")
+        out_str = ""
+        for i in range(0, len(feas)):
+            out_str += "%s:%s\t" % (feas[i], arr[i])
+
+        dict[arr[0]] = out_str.strip()
+    return dict
+
+
+def parse_movie_data(file_name, feas):
+    dict = {}
+    for line in open(file_name):
+        line = line.strip()
+        arr = line.split("::")
+        title_str = ""
+        genres_str = ""
+
+        for term in arr[1].split(" "):
+            term = term.strip()
+            if term != "":
+                title_str += "%s " % (term)
+        for term in arr[2].split("|"):
+            term = term.strip()
+            if term != "":
+                genres_str += "%s " % (term)
+        out_str = "movieid:%s\ttitle:%s\tgenres:%s" % (
+            arr[0], title_str.strip(), genres_str.strip())
+        dict[arr[0]] = out_str.strip()
+    return dict
+
+
+def to_hash(in_str):
+    feas = in_str.split(":")[0]
+    arr = in_str.split(":")[1]
+    out_str = "%s:%s" % (feas, (arr + arr[::-1] + arr[::-2] + arr[::-3]))
+    hash_id = hash(out_str) % dict_size
+    if hash_id in hash_dict and hash_dict[hash_id] != out_str:
+        print(hash_id, out_str, hash(out_str))
+        print("conflict")
+        exit(-1)
+
+    return "%s:%s" % (feas, hash_id)
+
+
+def to_hash_list(in_str):
+    arr = in_str.split(":")
+    tmp_arr = arr[1].split(" ")
+    out_str = ""
+    for item in tmp_arr:
+        item = item.strip()
+        if item != "":
+            key = "%s:%s" % (arr[0], item)
+            out_str += "%s " % (to_hash(key))
+    return out_str.strip()
+
+
+def get_hash(path):
+    #0-34831 1-time:974673057 2-userid:2021 3-gender:M 4-age:25 5-occupation:0 6-movieid:1345 7-title:Carrie (1976) 8-genres:Horror 9-label:2
+    for line in open(path):
+        arr = line.strip().split("\t")
+        out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % \
+            (arr[0], arr[1], to_hash(arr[2]), to_hash(arr[3]), to_hash(arr[4]), to_hash(arr[5]), \
+            to_hash(arr[6]), to_hash_list(arr[7]), to_hash_list(arr[8]), arr[9])
+        print out_str
+
+
+def generate_online_user():
+    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)
+
+    with open(test_user_path + "/movies.dat", 'w') as f:
+        for line in open(test_user_path + "/users.dat"):
+            line = line.strip()
+            arr = line.split("::")
+            userid = arr[0]
+            for item in movie_dict:
+                f.write(userid + "::" + item + "::1")
+                f.write("\n")
+
+
+def generate_online_data(path):
+    user_dict = parse_data(data_path + "/users.dat", user_fea)
+    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)
+
+    for line in open(path):
+        line = line.strip()
+        arr = line.split("::")
+        userid = arr[0]
+        movieid = arr[1]
+        label = arr[2]
+        out_str = "time:%s\t%s\t%s\tlabel:%s" % ("1", user_dict[userid],
+                                                 movie_dict[movieid], label)
+        log_id = hash(out_str) % 1000000000
+        res = "%s\t%s" % (log_id, out_str)
+        arr = res.strip().split("\t")
+        out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % \
+            (arr[0], arr[1], to_hash(arr[2]), to_hash(arr[3]), to_hash(arr[4]), to_hash(arr[5]), \
+            to_hash(arr[6]), to_hash_list(arr[7]), to_hash_list(arr[8]), arr[9])
+        print(out_str)
+
+
+if __name__ == "__main__":
+    random.seed(1111111)
+    if sys.argv[1] == "process_raw":
+        process(sys.argv[2])
+    elif sys.argv[1] == "hash":
+        get_hash(sys.argv[2])
+    elif sys.argv[1] == "data_recall":
+        generate_online_user()
+        generate_online_data(test_user_path + "/movies.dat")
+    elif sys.argv[1] == "data_rank":
+        generate_online_data(test_user_path + "/movies.dat")
diff --git a/demo/movie_recommand/data/split.py b/demo/movie_recommand/data/split.py
new file mode 100644
index 00000000..9c0a7fd0
--- /dev/null
+++ b/demo/movie_recommand/data/split.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+
+train = dict()
+test = dict()
+data_path = "ml-1m"
+
+for line in open(data_path + "/ratings.dat"):
+    fea = line.rstrip().split("::")
+    if fea[0] not in train:
+        train[fea[0]] = [line]
+    elif fea[0] not in test:
+        test[fea[0]] = dict()
+        test[fea[0]]['time'] = int(fea[3])
+        test[fea[0]]['content'] = line
+    else:
+        time = int(fea[3])
+        if time <= test[fea[0]]['time']:
+            train[fea[0]].append(line)
+        else:
+            train[fea[0]].append(test[fea[0]]['content'])
+            test[fea[0]]['time'] = time
+            test[fea[0]]['content'] = line
+
+train_data = []
+for key in train:
+    for line in train[key]:
+        train_data.append(line)
+
+random.shuffle(train_data)
+
+with open(data_path + "/train.dat", 'w') as f:
+    for line in train_data:
+        f.write(line)
+
+with open(data_path + "/test.dat", 'w') as f:
+    for key in test:
+        f.write(test[key]['content'])
diff --git a/demo/movie_recommand/data/test/log.data.hash b/demo/movie_recommand/data/test/log.data.hash
new file mode 100644
index 00000000..e69de29b
diff --git a/demo/movie_recommand/data/train/log.data.hash b/demo/movie_recommand/data/train/log.data.hash
new file mode 100644
index 00000000..e69de29b
diff --git a/demo/movie_recommand/data_prepare.sh b/demo/movie_recommand/data_prepare.sh
new file mode 100644
index 00000000..b5dd3e07
--- /dev/null
+++ b/demo/movie_recommand/data_prepare.sh
@@ -0,0 +1,18 @@
+cd data
+
+wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
+unzip ml-1m.zip
+
+python split.py
+
+mkdir train/
+mkdir test/
+
+python process_ml_1m.py process_raw ./ml-1m/train.dat | sort -t $'\t' -k 9 -n > log.data.train
+python process_ml_1m.py process_raw ./ml-1m/test.dat | sort -t $'\t' -k 9 -n > log.data.test
+python process_ml_1m.py hash log.data.train > ./train/log.data.hash
+python process_ml_1m.py hash log.data.test > ./test/log.data.hash
+
+rm log.data.train
+rm log.data.test
+cd ../
diff --git a/demo/movie_recommand/offline_test.sh b/demo/movie_recommand/offline_test.sh
new file mode 100644
index 00000000..f4b83c51
--- /dev/null
+++ b/demo/movie_recommand/offline_test.sh
@@ -0,0 +1,12 @@
+## modify config.yaml to infer mode at first
+
+cd recall
+python -m paddlerec.run -m ./config.yaml
+cd ../rank
+python -m paddlerec.run -m ./config.yaml
+cd ..
+ +echo "recall offline test result:" +python parse.py recall_offline recall/infer_result +echo "rank offline test result:" +python parse.py recall_offline rank/infer_result diff --git a/demo/movie_recommand/online_rank.sh b/demo/movie_recommand/online_rank.sh new file mode 100644 index 00000000..f2f5f167 --- /dev/null +++ b/demo/movie_recommand/online_rank.sh @@ -0,0 +1,8 @@ +cd data +python process_ml_1m.py data_rank > online_user/test/data.txt + +## modify recall/config.yaml to online_infer mode +cd ../rank +python -m paddlerec.run -m ./config.yaml +cd ../ +python parse.py rank_online rank/infer_result diff --git a/demo/movie_recommand/online_recall.sh b/demo/movie_recommand/online_recall.sh new file mode 100644 index 00000000..23fa7912 --- /dev/null +++ b/demo/movie_recommand/online_recall.sh @@ -0,0 +1,9 @@ +cd data +mkdir online_user/test +python process_ml_1m.py data_recall > online_user/test/data.txt + +## modify recall/config.yaml to online_infer mode +cd ../recall +python -m paddlerec.run -m ./config.yaml +cd ../ +python parse.py recall_online recall/infer_result diff --git a/demo/movie_recommand/rank/__init__.py b/demo/movie_recommand/rank/__init__.py new file mode 100755 index 00000000..abf198b9 --- /dev/null +++ b/demo/movie_recommand/rank/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/demo/movie_recommand/rank/config.yaml b/demo/movie_recommand/rank/config.yaml new file mode 100755 index 00000000..4bf1b325 --- /dev/null +++ b/demo/movie_recommand/rank/config.yaml @@ -0,0 +1,93 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+workspace: "demo/movie_recommand"
+
+# list of dataset
+dataset:
+- name: dataset_train # name of dataset to distinguish different datasets
+  batch_size: 128
+  type: QueueDataset
+  data_path: "{workspace}/data/train"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+- name: dataset_infer # name
+  batch_size: 128
+  type: DataLoader
+  data_path: "{workspace}/data/test"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+- name: dataset_online_infer # name
+  batch_size: 10
+  type: DataLoader
+  data_path: "{workspace}/data/online_user/test"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+
+# hyper parameters of user-defined network
+hyper_parameters:
+  # optimizer config
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    strategy: async
+  # user-defined <key, value> pairs
+  sparse_feature_number: 60000000
+  sparse_feature_dim: 9
+  dense_input_dim: 13
+  fc_sizes: [512, 256, 128, 32]
+
+# train
+mode: runner_train
+
+## online or offline infer
+#mode: runner_infer
+runner:
+- name: runner_train
+  class: single_train
+  save_checkpoint_interval: 1 # save model interval of epochs
+  save_inference_interval: 1 # save inference
+  save_checkpoint_path: "increment" # save checkpoint path
+  save_inference_path: "inference" # save inference path
+  epochs: 10
+  device: cpu
+
+- name: runner_infer
+  epochs: 1
+  class: single_infer
+  print_interval: 10000
+  init_model_path: "increment/9" # load model path
+
+#train
+phase:
+- name: phase1
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_train # select dataset by name
+  thread_num: 12
+
+##offline infer
+#phase:
+#- name: phase1
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_infer # select dataset by name
+#  save_path: "./infer_result"
+#  thread_num: 1
+
+##online infer
+#phase:
+#- name: phase1
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_online_infer # select dataset by name
+#  save_path: "./infer_result"
+#  thread_num: 1
diff --git a/demo/movie_recommand/rank/model.py b/demo/movie_recommand/rank/model.py
new file mode 100755
index 00000000..2393e354
--- /dev/null
+++ b/demo/movie_recommand/rank/model.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle.fluid as fluid
+
+from paddlerec.core.utils import envs
+from paddlerec.core.model import Model as ModelBase
+
+
+class Model(ModelBase):
+    def __init__(self, config):
+        ModelBase.__init__(self, config)
+
+    def _init_hyper_parameters(self):
+        self.is_distributed = True if envs.get_trainer(
+        ) == "CtrTrainer" else False
+        self.sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number")
+        self.sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+        self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")
+
+    def net(self, input, is_infer=False):
+        self.user_sparse_inputs = self._sparse_data_var[2:6]
+        self.mov_sparse_inputs = self._sparse_data_var[6:9]
+
+        self.label_input = self._sparse_data_var[-1]
+
+        def fc(input):
+            fcs = [input]
+            for size in self.hidden_layers:
+                output = fluid.layers.fc(
+                    input=fcs[-1],
+                    size=size,
+                    act='relu',
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Normal(
+                            scale=1.0 / math.sqrt(fcs[-1].shape[1]))))
+                fcs.append(output)
+            return fcs[-1]
+
+        def embedding_layer(input):
+            emb = fluid.layers.embedding(
+                input=input,
+                is_sparse=True,
+                is_distributed=self.is_distributed,
+                size=[self.sparse_feature_number, self.sparse_feature_dim],
+                param_attr=fluid.ParamAttr(
+                    name="emb", initializer=fluid.initializer.Uniform()), )
+            emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+            return emb_sum
+
+        user_sparse_embed_seq = list(
+            map(embedding_layer, self.user_sparse_inputs))
+        mov_sparse_embed_seq = list(
+            map(embedding_layer, self.mov_sparse_inputs))
+        concated_user = fluid.layers.concat(user_sparse_embed_seq, axis=1)
+        concated_mov = fluid.layers.concat(mov_sparse_embed_seq, axis=1)
+
+        usr_combined_features = fc(concated_user)
+        mov_combined_features = fc(concated_mov)
+
+        fc_input = fluid.layers.concat(
+            [usr_combined_features, mov_combined_features], axis=1)
+        sim = fluid.layers.fc(
+            input=fc_input,
+            size=1,
+            act='sigmoid',
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                scale=1.0 / math.sqrt(fc_input.shape[1]))))
+
+        predict = fluid.layers.scale(sim, scale=5)
+        self.predict = predict
+        #auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
+        #                                     label=self.label_input,
+        #                                     num_thresholds=10000,
+        #                                     slide_steps=20)
+
+        if is_infer:
+            self._infer_results["user_feature"] = usr_combined_features
+            self._infer_results["movie_feature"] = mov_combined_features
+            self._infer_results["uid"] = self._sparse_data_var[2]
+            self._infer_results["movieid"] = self._sparse_data_var[6]
+            self._infer_results["label"] = self._sparse_data_var[-1]
+            self._infer_results["predict"] = self.predict
+            return
+
+        #self._metrics["AUC"] = auc
+        #self._metrics["BATCH_AUC"] = batch_auc
+        #cost = fluid.layers.cross_entropy(
+        #    input=self.predict, label=self.label_input)
+        cost = fluid.layers.square_error_cost(
+            self.predict,
+            fluid.layers.cast(
+                x=self.label_input, dtype='float32'))
+        avg_cost = fluid.layers.reduce_mean(cost)
+        self._cost = avg_cost
+        self._metrics["LOSS"] = avg_cost
+
+    def optimizer(self):
+        optimizer = fluid.optimizer.Adam(self.learning_rate, lazy_mode=True)
+        return optimizer
+
+    def infer_net(self):
+        pass
diff --git a/demo/movie_recommand/recall/__init__.py b/demo/movie_recommand/recall/__init__.py
new file mode 100755
index 00000000..abf198b9
--- /dev/null
+++ b/demo/movie_recommand/recall/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/demo/movie_recommand/recall/config.yaml b/demo/movie_recommand/recall/config.yaml
new file mode 100755
index 00000000..056e2674
--- /dev/null
+++ b/demo/movie_recommand/recall/config.yaml
@@ -0,0 +1,93 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+workspace: "demo/movie_recommand"
+
+# list of dataset
+dataset:
+- name: dataset_train # name of dataset to distinguish different datasets
+  batch_size: 128
+  type: QueueDataset
+  data_path: "{workspace}/data/train"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+- name: dataset_infer # name
+  batch_size: 128
+  type: DataLoader
+  data_path: "{workspace}/data/test"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+- name: dataset_online_infer # name
+  batch_size: 128
+  type: DataLoader
+  data_path: "{workspace}/data/online_user/test"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+
+# hyper parameters of user-defined network
+hyper_parameters:
+  # optimizer config
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    strategy: async
+  # user-defined <key, value> pairs
+  sparse_feature_number: 60000000
+  sparse_feature_dim: 9
+  dense_input_dim: 13
+  fc_sizes: [512, 256, 128, 32]
+
+# train
+mode: runner_train
+
+## online or offline infer
+#mode: runner_infer
+runner:
+- name: runner_train
+  class: single_train
+  save_checkpoint_interval: 1 # save model interval of epochs
+  save_inference_interval: 1 # save inference
+  save_checkpoint_path: "increment" # save checkpoint path
+  save_inference_path: "inference" # save inference path
+  epochs: 10
+  device: cpu
+
+- name: runner_infer
+  epochs: 1
+  class: single_infer
+  print_interval: 10000
+  init_model_path: "increment/9" # load model path
+
+#train
+phase:
+- name: phase1
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_train # select dataset by name
+  thread_num: 12
+
+##offline infer
+#phase:
+#- name: phase1
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_infer # select dataset by name
+#  save_path: "./infer_result"
+#  thread_num: 1
+
+##online infer
+#phase:
+#- name: phase1
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_online_infer # select dataset by name
+#  save_path: "./infer_result"
+#  thread_num: 1
diff --git a/demo/movie_recommand/recall/model.py b/demo/movie_recommand/recall/model.py
new file mode 100755
index 00000000..13773ef5
--- /dev/null
+++ b/demo/movie_recommand/recall/model.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle.fluid as fluid
+
+from paddlerec.core.utils import envs
+from paddlerec.core.model import Model as ModelBase
+
+
+class Model(ModelBase):
+    def __init__(self, config):
+        ModelBase.__init__(self, config)
+
+    def _init_hyper_parameters(self):
+        self.is_distributed = True if envs.get_trainer(
+        ) == "CtrTrainer" else False
+        self.sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number")
+        self.sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+        self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")
+
+    def net(self, input, is_infer=False):
+        self.user_sparse_inputs = self._sparse_data_var[2:6]
+        self.mov_sparse_inputs = self._sparse_data_var[6:9]
+
+        self.label_input = self._sparse_data_var[-1]
+
+        def fc(input):
+            fcs = [input]
+            for size in self.hidden_layers:
+                output = fluid.layers.fc(
+                    input=fcs[-1],
+                    size=size,
+                    act='relu',
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Normal(
+                            scale=1.0 / math.sqrt(fcs[-1].shape[1]))))
+                fcs.append(output)
+            return fcs[-1]
+
+        def embedding_layer(input):
+            emb = fluid.layers.embedding(
+                input=input,
+                is_sparse=True,
+                is_distributed=self.is_distributed,
+                size=[self.sparse_feature_number, self.sparse_feature_dim],
+                param_attr=fluid.ParamAttr(
+                    name="emb", initializer=fluid.initializer.Uniform()), )
+            emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+            return emb_sum
+
+        user_sparse_embed_seq = list(
+            map(embedding_layer, self.user_sparse_inputs))
+        mov_sparse_embed_seq = list(
+            map(embedding_layer, self.mov_sparse_inputs))
+        concated_user = fluid.layers.concat(user_sparse_embed_seq, axis=1)
+        concated_mov = fluid.layers.concat(mov_sparse_embed_seq, axis=1)
+
+        usr_combined_features = fc(concated_user)
+        mov_combined_features = fc(concated_mov)
+
+        sim = fluid.layers.cos_sim(
+            X=usr_combined_features, Y=mov_combined_features)
+        predict = fluid.layers.scale(sim, scale=5)
+        self.predict = predict
+
+        if is_infer:
+            self._infer_results["uid"] = self._sparse_data_var[2]
+            self._infer_results["movieid"] = self._sparse_data_var[6]
+            self._infer_results["label"] = self._sparse_data_var[-1]
+            self._infer_results["predict"] = self.predict
+            return
+
+        cost = fluid.layers.square_error_cost(
+            self.predict,
+            fluid.layers.cast(
+                x=self.label_input, dtype='float32'))
+        avg_cost = fluid.layers.reduce_mean(cost)
+        self._cost = avg_cost
+        self._metrics["LOSS"] = avg_cost
+
+    def optimizer(self):
+        optimizer = fluid.optimizer.Adam(self.learning_rate, lazy_mode=True)
+        return optimizer
diff --git a/demo/movie_recommand/train.sh b/demo/movie_recommand/train.sh
new file mode 100644
index 00000000..4eb53f45
--- /dev/null
+++ b/demo/movie_recommand/train.sh
@@ -0,0 +1,8 @@
+cd recall
+python -m paddlerec.run -m ./config.yaml
+cd ../rank
+python -m paddlerec.run -m ./config.yaml &> train_log &
+cd ..
+
+echo "recall offline test: "
+python infer_analys
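The offline/online test scripts above call a parse.py that is not included in this patch. Purely as a hypothetical illustration of the kind of post-processing such a script could do with the JSON emitted by the patched single_infer.py, here is a sketch that computes a mean absolute error between a prediction variable and a label variable; the script name, its arguments, and the choice of metric are assumptions, not the demo's actual parser:

    # eval_sketch.py -- hypothetical, NOT the parse.py referenced by the demo scripts.
    # Usage: python eval_sketch.py <infer_result_file> <predict_var_name> <label_var_name>
    import json
    import sys


    def flatten(nested):
        # Flatten arbitrarily nested lists of numbers into a flat list of floats.
        if isinstance(nested, (int, float)):
            return [float(nested)]
        out = []
        for item in nested:
            out.extend(flatten(item))
        return out


    def mean_abs_error(path, predict_key, label_key):
        # The file holds one dict per batch: {fetch_var_name: nested list of values}.
        with open(path) as f:
            batches = json.load(f)
        total, count = 0.0, 0
        for batch in batches:
            for p, l in zip(flatten(batch[predict_key]), flatten(batch[label_key])):
                total += abs(p - l)
                count += 1
        return total / max(count, 1)


    if __name__ == "__main__":
        print("mean abs error:", mean_abs_error(sys.argv[1], sys.argv[2], sys.argv[3]))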