add deep fm

0da6629b · tangwei12 · 768112b4 · 0da6629b · 0da6629b · 0da6629b
4 changed file
--- a/fluid/PaddleRec/ctr/cloud.py
+++ b/fluid/PaddleRec/ctr/cloud.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# ======================================================================
+#
+# Copyright (c) 2017 Baidu.com, Inc. All Rights Reserved
+#
+# ======================================================================
+import os
+import logging
+import hdfs_utils
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger("cloud")
+logger.setLevel(logging.INFO)
+def run():
+    cmd = "python -u train.py "
+    cmd += " --train_data_path %s " % "data/train.txt"
+    cmd += " --test_data_path %s " % "data/test.txt"
+    if os.getenv("BATCH_SIZE", ""):
+        cmd += " --batch_size %s " % os.getenv("BATCH_SIZE")
+    if os.getenv("EMBEDDING_SIZE", ""):
+        cmd += " --embedding_size %s " % os.getenv("EMBEDDING_SIZE")
+    if os.getenv("NUM_PASSES", ""):
+        cmd += " --num_passes %s " % os.getenv("NUM_PASSES")
+    if os.getenv("MODEL_OUTPUT_DIR", ""):
+        cmd += " --model_output_dir %s " % os.getenv("MODEL_OUTPUT_DIR")
+    if os.getenv("SPARSE_FEATURE_DIM", ""):
+        cmd += " --sparse_feature_dim %s " % os.getenv("SPARSE_FEATURE_DIM")
+    if os.getenv("ASYNC_MODE", ""):
+        cmd += " --async_mode "
+    if os.getenv("NO_SPLIT_VAR", ""):
+        cmd += " --no_split_var "
+    is_local = int(os.getenv("PADDLE_IS_LOCAL", "1"))
+    if is_local:
+        cmd += " --is_local 1 "
+        cmd += " --cloud_train 0 "
+    else:
+        cmd += " --is_local 0 "
+        cmd += " --cloud_train 1 "
+        trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
+        trainers = int(os.environ["PADDLE_TRAINERS"])
+        training_role = os.environ["PADDLE_TRAINING_ROLE"]
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
+        eplist = []
+        for ip in pserver_ips.split(","):
+            eplist.append(':'.join([ip, port]))
+        pserver_endpoints = ",".join(eplist)
+        current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
+        if training_role == "PSERVER":
+            cmd += " --role pserver "
+        else:
+            cmd += " --role trainer "
+        cmd += " --endpoints %s " % pserver_endpoints
+        cmd += " --current_endpoint %s " % current_endpoint
+        cmd += " --trainer_id %s " % trainer_id
+        cmd += " --trainers %s " % trainers
+    logging.info("run cluster commands: {}".format(cmd))
+    exit(os.system(cmd))
+def download():
+    hadoop_home = os.getenv("HADOOP_HOME")
+    configs = {}
+    configs["fs.default.name"] = os.getenv("DATA_FS_NAME")
+    configs["hadoop.job.ugi"] = os.getenv("DATA_FS_UGI")
+    client = hdfs_utils.HDFSClient(hadoop_home, configs)
+    local_train_data_dir = os.getenv("TRAIN_DATA_LOCAL", "data")
+    hdfs_train_data_dir = os.getenv("TRAIN_DATA_HDFS", "")
+    downloads = hdfs_utils.multi_download(client, hdfs_train_data_dir, local_train_data_dir, 0, 1, multi_processes=1)
+    print(downloads)
+    for d in downloads:
+        base_dir = os.path.dirname(d)
+        tar_cmd = "tar -zxvf {} -C {}".format(d, base_dir)
+        print tar_cmd
+    for d in downloads:
+        base_dir = os.path.dirname(d)
+        tar_cmd = "tar -zxvf {} -C {}".format(d, base_dir)
+        logging.info("DOWNLOAD DATA: {}, AND TAR IT: {}".format(d, tar_cmd))
+        os.system(tar_cmd)
+def env_declar():
+    logging.info("********  Rename Cluster Env to PaddleFluid Env ********")
+    if os.environ["TRAINING_ROLE"] == "PSERVER" or os.environ["PADDLE_IS_LOCAL"] == "0":
+        os.environ["PADDLE_TRAINING_ROLE"] = os.environ["TRAINING_ROLE"]
+        os.environ["PADDLE_PSERVER_PORT"] = os.environ["PADDLE_PORT"]
+        os.environ["PADDLE_PSERVER_IPS"] = os.environ["PADDLE_PSERVERS"]
+        os.environ["PADDLE_TRAINERS"] = os.environ["PADDLE_TRAINERS_NUM"]
+        os.environ["PADDLE_CURRENT_IP"] = os.environ["POD_IP"]
+        os.environ["PADDLE_TRAINER_ID"] = os.environ["PADDLE_TRAINER_ID"]
+    os.environ["CPU_NUM"] = os.getenv("CPU_NUM", "12")
+    os.environ["NUM_THREADS"] = os.getenv("NUM_THREADS", "12")
+    logging.info("Content-Type: text/plain\n\n")
+    for key in os.environ.keys():
+        logging.info("%30s %s \n" % (key, os.environ[key]))
+    logging.info("******  Rename Cluster Env to PaddleFluid Env END ******")
+if __name__ == '__main__':
+    env_declar()
+    if os.getenv("NEED_CUSTOM_DOWNLOAD", ""):
+        if os.environ["PADDLE_TRAINING_ROLE"] == "PSERVER":
+            logging.info("PSERVER do not need to download datas")
+        else:
+            logging.info("NEED_CUSTOM_DOWNLOAD is True, will download train data with hdfs_utils")
+            download()
+    run()
--- a/fluid/PaddleRec/ctr/hdfs_utils.py
+++ b/fluid/PaddleRec/ctr/hdfs_utils.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""HDFS Utils"""
+import os
+import sys
+import subprocess
+import multiprocessing
+from datetime import datetime
+import re
+import copy
+import errno
+import logging
+__all__ = ["HDFSClient", "multi_download"]
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
+_logger = logging.getLogger("hdfs_utils")
+_logger.setLevel(logging.INFO)
+class HDFSClient(object):
+    def __init__(self, hadoop_home, configs):
+        self.pre_commands = []
+        hadoop_bin = '%s/bin/hadoop' % hadoop_home
+        self.pre_commands.append(hadoop_bin)
+        dfs = 'fs'
+        self.pre_commands.append(dfs)
+        for k, v in configs.iteritems():
+            config_command = '-D%s=%s' % (k, v)
+            self.pre_commands.append(config_command)
+    def __run_hdfs_cmd(self, commands, retry_times=5):
+        whole_commands = copy.deepcopy(self.pre_commands)
+        whole_commands.extend(commands)
+        print('Running system command: {0}'.format(' '.join(whole_commands)))
+        ret_code = 0
+        ret_out = None
+        ret_err = None
+        for x in range(retry_times + 1):
+            proc = subprocess.Popen(
+                whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            (output, errors) = proc.communicate()
+            ret_code, ret_out, ret_err = proc.returncode, output, errors
+            if ret_code:
+                _logger.warn(
+                    'Times: %d, Error running command: %s. Return code: %d, Error: %s'
+                    % (x, ' '.join(whole_commands), proc.returncode, errors))
+            else:
+                break
+        return ret_code, ret_out, ret_err
+    def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5):
+        """
+            upload the local file to hdfs
+            args:
+                local_file_path: the local file path
+                remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
+            return:
+                True or False
+        """
+        assert hdfs_path is not None
+        assert local_path is not None and os.path.exists(local_path)
+        if os.path.isdir(local_path):
+            _logger.warn(
+                "The Local path: {} is dir and I will support it later, return".
+                    format(local_path))
+            return
+        base = os.path.basename(local_path)
+        if not self.is_exist(hdfs_path):
+            self.makedirs(hdfs_path)
+        else:
+            if self.is_exist(os.path.join(hdfs_path, base)):
+                if overwrite:
+                    _logger.error(
+                        "The HDFS path: {} is exist and overwrite is True, delete it".
+                            format(hdfs_path))
+                    self.delete(hdfs_path)
+                else:
+                    _logger.error(
+                        "The HDFS path: {} is exist and overwrite is False, return".
+                            format(hdfs_path))
+                    return False
+        put_commands = ["-put", local_path, hdfs_path]
+        returncode, output, errors = self.__run_hdfs_cmd(put_commands,
+                                                         retry_times)
+        if returncode:
+            _logger.error("Put local path: {} to HDFS path: {} failed".format(
+                local_path, hdfs_path))
+            return False
+        else:
+            _logger.info("Put local path: {} to HDFS path: {} successfully".
+                         format(local_path, hdfs_path))
+            return True
+    def download(self, hdfs_path, local_path, overwrite=False, unzip=False):
+        """
+            download from hdfs
+            args:
+                local_file_path: the local file path
+                remote_file_path: remote dir on hdfs
+            return:
+                True or False
+        """
+        _logger.info('Downloading %r to %r.', hdfs_path, local_path)
+        _logger.info('Download of %s to %r complete.', hdfs_path, local_path)
+        if not self.is_exist(hdfs_path):
+            print("HDFS path: {} do not exist".format(hdfs_path))
+            return False
+        if self.is_dir(hdfs_path):
+            _logger.error(
+                "The HDFS path: {} is dir and I will support it later, return".
+                    format(hdfs_path))
+        if os.path.exists(local_path):
+            base = os.path.basename(hdfs_path)
+            local_file = os.path.join(local_path, base)
+            if os.path.exists(local_file):
+                if overwrite:
+                    os.remove(local_file)
+                else:
+                    _logger.error(
+                        "The Local path: {} is exist and overwrite is False, return".
+                            format(local_file))
+                    return False
+        self.make_local_dirs(local_path)
+        download_commands = ["-get", hdfs_path, local_path]
+        returncode, output, errors = self.__run_hdfs_cmd(download_commands)
+        if returncode:
+            _logger.error("Get local path: {} from HDFS path: {} failed".format(
+                local_path, hdfs_path))
+            return False
+        else:
+            _logger.info("Get local path: {} from HDFS path: {} successfully".
+                         format(local_path, hdfs_path))
+            return True
+    def is_exist(self, hdfs_path=None):
+        """
+            whether the remote hdfs path exists?
+            args:
+                remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
+                fs_name: The default values are the same as in the job configuration
+                fs_ugi: The default values are the same as in the job configuration
+            return:
+                True or False
+        """
+        exist_cmd = ['-test', '-e', hdfs_path]
+        returncode, output, errors = self.__run_hdfs_cmd(
+            exist_cmd, retry_times=1)
+        if returncode:
+            _logger.error("HDFS is_exist HDFS path: {} failed".format(
+                hdfs_path))
+            return False
+        else:
+            _logger.info("HDFS is_exist HDFS path: {} successfully".format(
+                hdfs_path))
+            return True
+    def is_dir(self, hdfs_path=None):
+        """
+            whether the remote hdfs path exists?
+            args:
+                remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
+                fs_name: The default values are the same as in the job configuration
+                fs_ugi: The default values are the same as in the job configuration
+            return:
+                True or False
+        """
+        if not self.is_exist(hdfs_path):
+            return False
+        dir_cmd = ['-test', '-d', hdfs_path]
+        returncode, output, errors = self.__run_hdfs_cmd(dir_cmd, retry_times=1)
+        if returncode:
+            _logger.error("HDFS path: {} failed is not a directory".format(
+                hdfs_path))
+            return False
+        else:
+            _logger.info("HDFS path: {} successfully is a directory".format(
+                hdfs_path))
+            return True
+    def delete(self, hdfs_path):
+        """Remove a file or directory from HDFS.
+        :param hdfs_path: HDFS path.
+        :param recursive: Recursively delete files and directories. By default,
+          this method will raise an :class:`HdfsError` if trying to delete a
+          non-empty directory.
+        This function returns `True` if the deletion was successful and `False` if
+        no file or directory previously existed at `hdfs_path`.
+        """
+        _logger.info('Deleting %r.', hdfs_path)
+        if not self.is_exist(hdfs_path):
+            _logger.warn("HDFS path: {} do not exist".format(hdfs_path))
+            return True
+        if self.is_dir(hdfs_path):
+            del_cmd = ['-rmr', hdfs_path]
+        else:
+            del_cmd = ['-rm', hdfs_path]
+        returncode, output, errors = self.__run_hdfs_cmd(del_cmd, retry_times=0)
+        if returncode:
+            _logger.error("HDFS path: {} delete files failure".format(
+                hdfs_path))
+            return False
+        else:
+            _logger.info("HDFS path: {} delete files successfully".format(
+                hdfs_path))
+            return True
+    def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
+        """Move a file or folder.
+        :param hdfs_src_path: Source path.
+        :param hdfs_dst_path: Destination path. If the path already exists and is
+          a directory, the source will be moved into it. If the path exists and is
+          a file, or if a parent destination directory is missing, this method will
+          raise an :class:`HdfsError`.
+        """
+        assert hdfs_src_path is not None
+        assert hdfs_dst_path is not None
+        if not self.is_exist(hdfs_src_path):
+            _logger.info("HDFS path do not exist: {}".format(hdfs_src_path))
+        if self.is_exist(hdfs_dst_path) and not overwrite:
+            _logger.error("HDFS path is exist: {} and overwrite=False".format(
+                hdfs_dst_path))
+        rename_command = ['-mv', hdfs_src_path, hdfs_dst_path]
+        returncode, output, errors = self.__run_hdfs_cmd(
+            rename_command, retry_times=1)
+        if returncode:
+            _logger.error("HDFS rename path: {} to {} failed".format(
+                hdfs_src_path, hdfs_dst_path))
+            return False
+        else:
+            _logger.info("HDFS rename path: {} to {} successfully".format(
+                hdfs_src_path, hdfs_dst_path))
+            return True
+    @staticmethod
+    def make_local_dirs(local_path):
+        try:
+            os.makedirs(local_path)
+        except OSError as e:
+            if e.errno != errno.EEXIST:
+                raise
+    def makedirs(self, hdfs_path):
+        """Create a remote directory, recursively if necessary.
+        :param hdfs_path: Remote path. Intermediate directories will be created
+          appropriately.
+        """
+        _logger.info('Creating directories to %r.', hdfs_path)
+        assert hdfs_path is not None
+        if self.is_exist(hdfs_path):
+            _logger.error("HDFS path is exist: {}".format(hdfs_path))
+            return
+        mkdirs_commands = ['-mkdir', hdfs_path]
+        returncode, output, errors = self.__run_hdfs_cmd(
+            mkdirs_commands, retry_times=1)
+        if returncode:
+            _logger.error("HDFS mkdir path: {} failed".format(hdfs_path))
+            return False
+        else:
+            _logger.error("HDFS mkdir path: {} successfully".format(hdfs_path))
+            return True
+    def ls(self, hdfs_path):
+        assert hdfs_path is not None
+        if not self.is_exist(hdfs_path):
+            return []
+        ls_commands = ['-ls', hdfs_path]
+        returncode, output, errors = self.__run_hdfs_cmd(
+            ls_commands, retry_times=1)
+        if returncode:
+            _logger.error("HDFS list path: {} failed".format(hdfs_path))
+            return []
+        else:
+            _logger.info("HDFS list path: {} successfully".format(hdfs_path))
+            ret_lines = []
+            regex = re.compile('\s+')
+            out_lines = output.strip().split("\n")
+            for line in out_lines:
+                re_line = regex.split(line)
+                if len(re_line) == 8:
+                    ret_lines.append(re_line[7])
+            return ret_lines
+    def lsr(self, hdfs_path, only_file=True, sort=True):
+        def sort_by_time(v1, v2):
+            v1_time = datetime.strptime(v1[1], '%Y-%m-%d %H:%M')
+            v2_time = datetime.strptime(v2[1], '%Y-%m-%d %H:%M')
+            return v1_time > v2_time
+        assert hdfs_path is not None
+        if not self.is_exist(hdfs_path):
+            return []
+        ls_commands = ['-lsr', hdfs_path]
+        returncode, output, errors = self.__run_hdfs_cmd(
+            ls_commands, retry_times=1)
+        if returncode:
+            _logger.error("HDFS list all files: {} failed".format(hdfs_path))
+            return []
+        else:
+            _logger.info("HDFS list all files: {} successfully".format(
+                hdfs_path))
+            lines = []
+            regex = re.compile('\s+')
+            out_lines = output.strip().split("\n")
+            for line in out_lines:
+                re_line = regex.split(line)
+                if len(re_line) == 8:
+                    if only_file and re_line[0][0] == "d":
+                        continue
+                    else:
+                        lines.append((re_line[7], re_line[5] + " " + re_line[6]))
+            if sort:
+                sorted(lines, cmp=sort_by_time)
+            ret_lines = [ret[0] for ret in lines]
+            return ret_lines
+def multi_download(client,
+                   hdfs_path,
+                   local_path,
+                   trainer_id,
+                   trainers,
+                   multi_processes=5):
+    """
+    multi_download
+    :param client: instance of HDFSClient
+    :param hdfs_path: path on hdfs
+    :param local_path: path on local
+    :param trainer_id: current trainer id
+    :param trainers: all trainers number
+    :param multi_processes: the download data process at the same time, default=5
+    :return: None
+    """
+    def __subprocess_download(datas):
+        for data in datas:
+            re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
+            local_re_path = os.path.join(local_path, re_path)
+            client.download(data, local_re_path)
+    assert isinstance(client, HDFSClient)
+    client.make_local_dirs(local_path)
+    _logger.info("Make local dir {} successfully".format(local_path))
+    all_need_download = client.lsr(hdfs_path, sort=True)
+    need_download = all_need_download[trainer_id::trainers]
+    _logger.info("Get {} files From all {} files need to be download from {}".
+                 format(len(need_download), len(all_need_download), hdfs_path))
+    _logger.info("Start {} multi process to download datas".format(
+        multi_processes))
+    procs = []
+    for i in range(multi_processes):
+        process_datas = need_download[i::multi_processes]
+        p = multiprocessing.Process(
+            target=__subprocess_download, args=(process_datas,))
+        procs.append(p)
+        p.start()
+    # complete the processes
+    for proc in procs:
+        proc.join()
+    _logger.info("Finish {} multi process to download datas".format(
+        multi_processes))
+    local_downloads = []
+    for data in need_download:
+        data_name = os.path.basename(data)
+        re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
+        if re_path == ".":
+            local_re_path = os.path.join(local_path, data_name)
+        else:
+            local_re_path = os.path.join(local_path, re_path, data_name)
+        local_downloads.append(local_re_path)
+    return local_downloads
+def getfilelist(path):
+    rlist = []
+    for dir,folder,file in os.walk(path):
+        for i in file:
+            t = os.path.join(dir,i)
+            rlist.append(t)
+    for r in rlist:
+        print(r)
+def multi_upload(client,
+           hdfs_path,
+           local_path,
+           multi_processes=5,
+           overwrite=False,
+           sync=True):
+    """
+    :param client:
+    :param hdfs_path:
+    :param local_path:
+    :param sync:
+    :return:
+    """
+    def __subprocess_upload(datas):
+        for data in datas:
+            re_path = os.path.relpath(os.path.dirname(data), local_path)
+            hdfs_re_path = os.path.join(hdfs_path, re_path)
+            client.upload(hdfs_re_path, data, overwrite, retry_times=5)
+    def get_local_files(path):
+        rlist = []
+        if not os.path.isdir(path):
+            return rlist
+        for dirname, folder, files in os.walk(path):
+            for i in files:
+                t = os.path.join(dirname, i)
+                rlist.append(t)
+        return rlist
+    assert isinstance(client, HDFSClient)
+    all_files = get_local_files(local_path)
+    if not all_files:
+        _logger.info("there are nothing need to upload, exit")
+        return
+    _logger.info("Start {} multi process to upload datas".format(multi_processes))
+    procs = []
+    for i in range(multi_processes):
+        process_datas = all_files[i::multi_processes]
+        p = multiprocessing.Process(
+            target=__subprocess_upload, args=(process_datas,))
+        procs.append(p)
+        p.start()
+    # complete the processes
+    for proc in procs:
+        proc.join()
+    _logger.info("Finish {} multi process to upload datas".format(
+        multi_processes))
+if __name__ == "__main__":
+    pass
\ No newline at end of file
--- a/fluid/PaddleRec/ctr/network_conf.py
+++ b/fluid/PaddleRec/ctr/network_conf.py
@@ -4,15 +4,50 @@ import math
 dense_feature_dim = 13
-def ctr_dnn_model(embedding_size, sparse_feature_dim):
+def ctr_deepfm_model(factor_size, sparse_feature_dim, dense_feature_dim, sparse_input):
-    dense_input = fluid.layers.data(
+    def dense_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
-        name="dense_input", shape=[dense_feature_dim], dtype='float32')
+        """
+        dense_fm_layer
+        """
+        first_order = fluid.layers.fc(input=input, size=1)
+        emb_table = fluid.layers.create_parameter(shape=[emb_dict_size, factor_size],
+                                                  dtype='float32', attr=fm_param_attr)
+        input_mul_factor = fluid.layers.matmul(input, emb_table)
+        input_mul_factor_square = fluid.layers.square(input_mul_factor)
+        input_square = fluid.layers.square(input)
+        factor_square = fluid.layers.square(emb_table)
+        input_square_mul_factor_square = fluid.layers.matmul(input_square, factor_square)
+        second_order = 0.5 * (input_mul_factor_square - input_square_mul_factor_square)
+        return first_order, second_order
+    def sparse_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
+        """
+        sparse_fm_layer
+        """
+        first_embeddings = fluid.layers.embedding(
+            input=input, dtype='float32', size=[emb_dict_size, 1], is_sparse=True)
+        first_order = fluid.layers.sequence_pool(input=first_embeddings, pool_type='sum')
+        nonzero_embeddings = fluid.layers.embedding(
+            input=input, dtype='float32', size=[emb_dict_size, factor_size],
+            param_attr=fm_param_attr, is_sparse=True)
+        summed_features_emb = fluid.layers.sequence_pool(input=nonzero_embeddings, pool_type='sum')
+        summed_features_emb_square = fluid.layers.square(summed_features_emb)
+        squared_features_emb = fluid.layers.square(nonzero_embeddings)
+        squared_sum_features_emb = fluid.layers.sequence_pool(
+            input=squared_features_emb, pool_type='sum')
+        second_order = 0.5 * (summed_features_emb_square - squared_sum_features_emb)
+        return first_order, second_order
+    dense_input = fluid.layers.data(name="dense_input", shape=[dense_feature_dim], dtype='float32')
    sparse_input_ids = [
-        fluid.layers.data(
+        fluid.layers.data(name="C" + str(i), shape=[1], lod_level=1, dtype='int64')
-            name="C" + str(i), shape=[1], lod_level=1, dtype='int64')
+        for i in range(1, 27)]
-        for i in range(1, 27)
-    ]
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
@@ -24,6 +59,53 @@ def ctr_dnn_model(embedding_size, sparse_feature_dim):
                                                      use_double_buffer=True)
    words = fluid.layers.read_file(py_reader)
+    sparse_fm_param_attr = fluid.param_attr.ParamAttr(name="SparseFeatFactors",
+                                                      initializer=fluid.initializer.Normal(
+                                                          scale=1 / math.sqrt(sparse_feature_dim)))
+    dense_fm_param_attr = fluid.param_attr.ParamAttr(name="DenseFeatFactors",
+                                                     initializer=fluid.initializer.Normal(
+                                                         scale=1 / math.sqrt(dense_feature_dim)))
+    sparse_fm_first, sparse_fm_second = sparse_fm_layer(
+        sparse_input, sparse_feature_dim, factor_size, sparse_fm_param_attr)
+    dense_fm_first, dense_fm_second = dense_fm_layer(
+        dense_input, dense_feature_dim, factor_size, dense_fm_param_attr)
+    def embedding_layer(input):
+        """embedding_layer"""
+        emb = fluid.layers.embedding(
+            input=input, dtype='float32', size=[sparse_feature_dim, factor_size],
+            param_attr=sparse_fm_param_attr, is_sparse=True)
+        return fluid.layers.sequence_pool(input=emb, pool_type='average')
+    sparse_embed_seq = map(embedding_layer, sparse_input_ids)
+    concated = fluid.layers.concat(sparse_embed_seq + [dense_input], axis=1)
+    fc1 = fluid.layers.fc(input=concated, size=400, act='relu',
+                          param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                              scale=1 / math.sqrt(concated.shape[1]))))
+    fc2 = fluid.layers.fc(input=fc1, size=400, act='relu',
+                          param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                              scale=1 / math.sqrt(fc1.shape[1]))))
+    fc3 = fluid.layers.fc(input=fc2, size=400, act='relu',
+                          param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                              scale=1 / math.sqrt(fc2.shape[1]))))
+    predict = fluid.layers.fc(
+        input=[sparse_fm_first, sparse_fm_second, dense_fm_first, dense_fm_second, fc3],
+        size=2,
+        act="softmax",
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1 / math.sqrt(fc3.shape[1]))))
+    cost = fluid.layers.cross_entropy(input=predict, label=words[-1:])
+    avg_cost = fluid.layers.reduce_sum(cost)
+    accuracy = fluid.layers.accuracy(input=predict, label=words[-1:])
+    auc_var, batch_auc_var, auc_states = \
+        fluid.layers.auc(input=predict, label=words[-1:], num_thresholds=2 ** 12, slide_steps=20)
+    return avg_cost, auc_var, batch_auc_var, py_reader
+def ctr_dnn_model(embedding_size, sparse_feature_dim):
    def embedding_layer(input):
        return fluid.layers.embedding(
            input=input,
@@ -32,24 +114,46 @@ def ctr_dnn_model(embedding_size, sparse_feature_dim):
            # if you want to set is_distributed to True
            is_distributed=False,
            size=[sparse_feature_dim, embedding_size],
-            param_attr=fluid.ParamAttr(name="SparseFeatFactors", initializer=fluid.initializer.Uniform()))
+            param_attr=fluid.ParamAttr(name="SparseFeatFactors",
+                                       initializer=fluid.initializer.Uniform()))
+    dense_input = fluid.layers.data(
+        name="dense_input", shape=[dense_feature_dim], dtype='float32')
+    sparse_input_ids = [
+        fluid.layers.data(name="C" + str(i), shape=[1], lod_level=1, dtype='int64')
+        for i in range(1, 27)]
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    datas = [dense_input] + sparse_input_ids + [label]
+    py_reader = fluid.layers.create_py_reader_by_data(capacity=64,
+                                                      feed_list=datas,
+                                                      name='py_reader',
+                                                      use_double_buffer=True)
+    words = fluid.layers.read_file(py_reader)
    sparse_embed_seq = map(embedding_layer, words[1:-1])
    concated = fluid.layers.concat(sparse_embed_seq + words[0:1], axis=1)
    fc1 = fluid.layers.fc(input=concated, size=400, act='relu',
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1/math.sqrt(concated.shape[1]))))
+                          param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                              scale=1 / math.sqrt(concated.shape[1]))))
    fc2 = fluid.layers.fc(input=fc1, size=400, act='relu',
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1/math.sqrt(fc1.shape[1]))))
+                          param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                              scale=1 / math.sqrt(fc1.shape[1]))))
    fc3 = fluid.layers.fc(input=fc2, size=400, act='relu',
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1/math.sqrt(fc2.shape[1]))))
+                          param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                              scale=1 / math.sqrt(fc2.shape[1]))))
    predict = fluid.layers.fc(input=fc3, size=2, act='softmax',
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1/math.sqrt(fc3.shape[1]))))
+                              param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                                  scale=1 / math.sqrt(fc3.shape[1]))))
    cost = fluid.layers.cross_entropy(input=predict, label=words[-1:])
    avg_cost = fluid.layers.reduce_sum(cost)
    accuracy = fluid.layers.accuracy(input=predict, label=words[-1:])
    auc_var, batch_auc_var, auc_states = \
-        fluid.layers.auc(input=predict, label=words[-1:], num_thresholds=2**12, slide_steps=20)
+        fluid.layers.auc(input=predict, label=words[-1:], num_thresholds=2 ** 12, slide_steps=20)
    return avg_cost, auc_var, batch_auc_var, py_reader
--- a/fluid/PaddleRec/ctr/train.py
+++ b/fluid/PaddleRec/ctr/train.py
@@ -121,7 +121,7 @@ def train_loop(args, train_program, py_reader, loss, auc_var, batch_auc_var,
        batch_size=args.batch_size)
    py_reader.decorate_paddle_reader(train_reader)
-    data_name_list = None
+    data_name_list = []
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)