提交 0da6629b 编写于 作者: T tangwei12

add deep fm

上级 768112b4
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ======================================================================
#
# Copyright (c) 2017 Baidu.com, Inc. All Rights Reserved
#
# ======================================================================
import os
import logging
import hdfs_utils
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("cloud")
logger.setLevel(logging.INFO)
def run():
cmd = "python -u train.py "
cmd += " --train_data_path %s " % "data/train.txt"
cmd += " --test_data_path %s " % "data/test.txt"
if os.getenv("BATCH_SIZE", ""):
cmd += " --batch_size %s " % os.getenv("BATCH_SIZE")
if os.getenv("EMBEDDING_SIZE", ""):
cmd += " --embedding_size %s " % os.getenv("EMBEDDING_SIZE")
if os.getenv("NUM_PASSES", ""):
cmd += " --num_passes %s " % os.getenv("NUM_PASSES")
if os.getenv("MODEL_OUTPUT_DIR", ""):
cmd += " --model_output_dir %s " % os.getenv("MODEL_OUTPUT_DIR")
if os.getenv("SPARSE_FEATURE_DIM", ""):
cmd += " --sparse_feature_dim %s " % os.getenv("SPARSE_FEATURE_DIM")
if os.getenv("ASYNC_MODE", ""):
cmd += " --async_mode "
if os.getenv("NO_SPLIT_VAR", ""):
cmd += " --no_split_var "
is_local = int(os.getenv("PADDLE_IS_LOCAL", "1"))
if is_local:
cmd += " --is_local 1 "
cmd += " --cloud_train 0 "
else:
cmd += " --is_local 0 "
cmd += " --cloud_train 1 "
trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
trainers = int(os.environ["PADDLE_TRAINERS"])
training_role = os.environ["PADDLE_TRAINING_ROLE"]
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist)
current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
if training_role == "PSERVER":
cmd += " --role pserver "
else:
cmd += " --role trainer "
cmd += " --endpoints %s " % pserver_endpoints
cmd += " --current_endpoint %s " % current_endpoint
cmd += " --trainer_id %s " % trainer_id
cmd += " --trainers %s " % trainers
logging.info("run cluster commands: {}".format(cmd))
exit(os.system(cmd))
def download():
hadoop_home = os.getenv("HADOOP_HOME")
configs = {}
configs["fs.default.name"] = os.getenv("DATA_FS_NAME")
configs["hadoop.job.ugi"] = os.getenv("DATA_FS_UGI")
client = hdfs_utils.HDFSClient(hadoop_home, configs)
local_train_data_dir = os.getenv("TRAIN_DATA_LOCAL", "data")
hdfs_train_data_dir = os.getenv("TRAIN_DATA_HDFS", "")
downloads = hdfs_utils.multi_download(client, hdfs_train_data_dir, local_train_data_dir, 0, 1, multi_processes=1)
print(downloads)
for d in downloads:
base_dir = os.path.dirname(d)
tar_cmd = "tar -zxvf {} -C {}".format(d, base_dir)
print tar_cmd
for d in downloads:
base_dir = os.path.dirname(d)
tar_cmd = "tar -zxvf {} -C {}".format(d, base_dir)
logging.info("DOWNLOAD DATA: {}, AND TAR IT: {}".format(d, tar_cmd))
os.system(tar_cmd)
def env_declar():
logging.info("******** Rename Cluster Env to PaddleFluid Env ********")
if os.environ["TRAINING_ROLE"] == "PSERVER" or os.environ["PADDLE_IS_LOCAL"] == "0":
os.environ["PADDLE_TRAINING_ROLE"] = os.environ["TRAINING_ROLE"]
os.environ["PADDLE_PSERVER_PORT"] = os.environ["PADDLE_PORT"]
os.environ["PADDLE_PSERVER_IPS"] = os.environ["PADDLE_PSERVERS"]
os.environ["PADDLE_TRAINERS"] = os.environ["PADDLE_TRAINERS_NUM"]
os.environ["PADDLE_CURRENT_IP"] = os.environ["POD_IP"]
os.environ["PADDLE_TRAINER_ID"] = os.environ["PADDLE_TRAINER_ID"]
os.environ["CPU_NUM"] = os.getenv("CPU_NUM", "12")
os.environ["NUM_THREADS"] = os.getenv("NUM_THREADS", "12")
logging.info("Content-Type: text/plain\n\n")
for key in os.environ.keys():
logging.info("%30s %s \n" % (key, os.environ[key]))
logging.info("****** Rename Cluster Env to PaddleFluid Env END ******")
if __name__ == '__main__':
env_declar()
if os.getenv("NEED_CUSTOM_DOWNLOAD", ""):
if os.environ["PADDLE_TRAINING_ROLE"] == "PSERVER":
logging.info("PSERVER do not need to download datas")
else:
logging.info("NEED_CUSTOM_DOWNLOAD is True, will download train data with hdfs_utils")
download()
run()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HDFS Utils"""
import os
import sys
import subprocess
import multiprocessing
from datetime import datetime
import re
import copy
import errno
import logging
__all__ = ["HDFSClient", "multi_download"]
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
_logger = logging.getLogger("hdfs_utils")
_logger.setLevel(logging.INFO)
class HDFSClient(object):
def __init__(self, hadoop_home, configs):
self.pre_commands = []
hadoop_bin = '%s/bin/hadoop' % hadoop_home
self.pre_commands.append(hadoop_bin)
dfs = 'fs'
self.pre_commands.append(dfs)
for k, v in configs.iteritems():
config_command = '-D%s=%s' % (k, v)
self.pre_commands.append(config_command)
def __run_hdfs_cmd(self, commands, retry_times=5):
whole_commands = copy.deepcopy(self.pre_commands)
whole_commands.extend(commands)
print('Running system command: {0}'.format(' '.join(whole_commands)))
ret_code = 0
ret_out = None
ret_err = None
for x in range(retry_times + 1):
proc = subprocess.Popen(
whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
(output, errors) = proc.communicate()
ret_code, ret_out, ret_err = proc.returncode, output, errors
if ret_code:
_logger.warn(
'Times: %d, Error running command: %s. Return code: %d, Error: %s'
% (x, ' '.join(whole_commands), proc.returncode, errors))
else:
break
return ret_code, ret_out, ret_err
def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5):
"""
upload the local file to hdfs
args:
local_file_path: the local file path
remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
return:
True or False
"""
assert hdfs_path is not None
assert local_path is not None and os.path.exists(local_path)
if os.path.isdir(local_path):
_logger.warn(
"The Local path: {} is dir and I will support it later, return".
format(local_path))
return
base = os.path.basename(local_path)
if not self.is_exist(hdfs_path):
self.makedirs(hdfs_path)
else:
if self.is_exist(os.path.join(hdfs_path, base)):
if overwrite:
_logger.error(
"The HDFS path: {} is exist and overwrite is True, delete it".
format(hdfs_path))
self.delete(hdfs_path)
else:
_logger.error(
"The HDFS path: {} is exist and overwrite is False, return".
format(hdfs_path))
return False
put_commands = ["-put", local_path, hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(put_commands,
retry_times)
if returncode:
_logger.error("Put local path: {} to HDFS path: {} failed".format(
local_path, hdfs_path))
return False
else:
_logger.info("Put local path: {} to HDFS path: {} successfully".
format(local_path, hdfs_path))
return True
def download(self, hdfs_path, local_path, overwrite=False, unzip=False):
"""
download from hdfs
args:
local_file_path: the local file path
remote_file_path: remote dir on hdfs
return:
True or False
"""
_logger.info('Downloading %r to %r.', hdfs_path, local_path)
_logger.info('Download of %s to %r complete.', hdfs_path, local_path)
if not self.is_exist(hdfs_path):
print("HDFS path: {} do not exist".format(hdfs_path))
return False
if self.is_dir(hdfs_path):
_logger.error(
"The HDFS path: {} is dir and I will support it later, return".
format(hdfs_path))
if os.path.exists(local_path):
base = os.path.basename(hdfs_path)
local_file = os.path.join(local_path, base)
if os.path.exists(local_file):
if overwrite:
os.remove(local_file)
else:
_logger.error(
"The Local path: {} is exist and overwrite is False, return".
format(local_file))
return False
self.make_local_dirs(local_path)
download_commands = ["-get", hdfs_path, local_path]
returncode, output, errors = self.__run_hdfs_cmd(download_commands)
if returncode:
_logger.error("Get local path: {} from HDFS path: {} failed".format(
local_path, hdfs_path))
return False
else:
_logger.info("Get local path: {} from HDFS path: {} successfully".
format(local_path, hdfs_path))
return True
def is_exist(self, hdfs_path=None):
"""
whether the remote hdfs path exists?
args:
remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
fs_name: The default values are the same as in the job configuration
fs_ugi: The default values are the same as in the job configuration
return:
True or False
"""
exist_cmd = ['-test', '-e', hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(
exist_cmd, retry_times=1)
if returncode:
_logger.error("HDFS is_exist HDFS path: {} failed".format(
hdfs_path))
return False
else:
_logger.info("HDFS is_exist HDFS path: {} successfully".format(
hdfs_path))
return True
def is_dir(self, hdfs_path=None):
"""
whether the remote hdfs path exists?
args:
remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
fs_name: The default values are the same as in the job configuration
fs_ugi: The default values are the same as in the job configuration
return:
True or False
"""
if not self.is_exist(hdfs_path):
return False
dir_cmd = ['-test', '-d', hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(dir_cmd, retry_times=1)
if returncode:
_logger.error("HDFS path: {} failed is not a directory".format(
hdfs_path))
return False
else:
_logger.info("HDFS path: {} successfully is a directory".format(
hdfs_path))
return True
def delete(self, hdfs_path):
"""Remove a file or directory from HDFS.
:param hdfs_path: HDFS path.
:param recursive: Recursively delete files and directories. By default,
this method will raise an :class:`HdfsError` if trying to delete a
non-empty directory.
This function returns `True` if the deletion was successful and `False` if
no file or directory previously existed at `hdfs_path`.
"""
_logger.info('Deleting %r.', hdfs_path)
if not self.is_exist(hdfs_path):
_logger.warn("HDFS path: {} do not exist".format(hdfs_path))
return True
if self.is_dir(hdfs_path):
del_cmd = ['-rmr', hdfs_path]
else:
del_cmd = ['-rm', hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(del_cmd, retry_times=0)
if returncode:
_logger.error("HDFS path: {} delete files failure".format(
hdfs_path))
return False
else:
_logger.info("HDFS path: {} delete files successfully".format(
hdfs_path))
return True
def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
"""Move a file or folder.
:param hdfs_src_path: Source path.
:param hdfs_dst_path: Destination path. If the path already exists and is
a directory, the source will be moved into it. If the path exists and is
a file, or if a parent destination directory is missing, this method will
raise an :class:`HdfsError`.
"""
assert hdfs_src_path is not None
assert hdfs_dst_path is not None
if not self.is_exist(hdfs_src_path):
_logger.info("HDFS path do not exist: {}".format(hdfs_src_path))
if self.is_exist(hdfs_dst_path) and not overwrite:
_logger.error("HDFS path is exist: {} and overwrite=False".format(
hdfs_dst_path))
rename_command = ['-mv', hdfs_src_path, hdfs_dst_path]
returncode, output, errors = self.__run_hdfs_cmd(
rename_command, retry_times=1)
if returncode:
_logger.error("HDFS rename path: {} to {} failed".format(
hdfs_src_path, hdfs_dst_path))
return False
else:
_logger.info("HDFS rename path: {} to {} successfully".format(
hdfs_src_path, hdfs_dst_path))
return True
@staticmethod
def make_local_dirs(local_path):
try:
os.makedirs(local_path)
except OSError as e:
if e.errno != errno.EEXIST:
raise
def makedirs(self, hdfs_path):
"""Create a remote directory, recursively if necessary.
:param hdfs_path: Remote path. Intermediate directories will be created
appropriately.
"""
_logger.info('Creating directories to %r.', hdfs_path)
assert hdfs_path is not None
if self.is_exist(hdfs_path):
_logger.error("HDFS path is exist: {}".format(hdfs_path))
return
mkdirs_commands = ['-mkdir', hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(
mkdirs_commands, retry_times=1)
if returncode:
_logger.error("HDFS mkdir path: {} failed".format(hdfs_path))
return False
else:
_logger.error("HDFS mkdir path: {} successfully".format(hdfs_path))
return True
def ls(self, hdfs_path):
assert hdfs_path is not None
if not self.is_exist(hdfs_path):
return []
ls_commands = ['-ls', hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(
ls_commands, retry_times=1)
if returncode:
_logger.error("HDFS list path: {} failed".format(hdfs_path))
return []
else:
_logger.info("HDFS list path: {} successfully".format(hdfs_path))
ret_lines = []
regex = re.compile('\s+')
out_lines = output.strip().split("\n")
for line in out_lines:
re_line = regex.split(line)
if len(re_line) == 8:
ret_lines.append(re_line[7])
return ret_lines
def lsr(self, hdfs_path, only_file=True, sort=True):
def sort_by_time(v1, v2):
v1_time = datetime.strptime(v1[1], '%Y-%m-%d %H:%M')
v2_time = datetime.strptime(v2[1], '%Y-%m-%d %H:%M')
return v1_time > v2_time
assert hdfs_path is not None
if not self.is_exist(hdfs_path):
return []
ls_commands = ['-lsr', hdfs_path]
returncode, output, errors = self.__run_hdfs_cmd(
ls_commands, retry_times=1)
if returncode:
_logger.error("HDFS list all files: {} failed".format(hdfs_path))
return []
else:
_logger.info("HDFS list all files: {} successfully".format(
hdfs_path))
lines = []
regex = re.compile('\s+')
out_lines = output.strip().split("\n")
for line in out_lines:
re_line = regex.split(line)
if len(re_line) == 8:
if only_file and re_line[0][0] == "d":
continue
else:
lines.append((re_line[7], re_line[5] + " " + re_line[6]))
if sort:
sorted(lines, cmp=sort_by_time)
ret_lines = [ret[0] for ret in lines]
return ret_lines
def multi_download(client,
hdfs_path,
local_path,
trainer_id,
trainers,
multi_processes=5):
"""
multi_download
:param client: instance of HDFSClient
:param hdfs_path: path on hdfs
:param local_path: path on local
:param trainer_id: current trainer id
:param trainers: all trainers number
:param multi_processes: the download data process at the same time, default=5
:return: None
"""
def __subprocess_download(datas):
for data in datas:
re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
local_re_path = os.path.join(local_path, re_path)
client.download(data, local_re_path)
assert isinstance(client, HDFSClient)
client.make_local_dirs(local_path)
_logger.info("Make local dir {} successfully".format(local_path))
all_need_download = client.lsr(hdfs_path, sort=True)
need_download = all_need_download[trainer_id::trainers]
_logger.info("Get {} files From all {} files need to be download from {}".
format(len(need_download), len(all_need_download), hdfs_path))
_logger.info("Start {} multi process to download datas".format(
multi_processes))
procs = []
for i in range(multi_processes):
process_datas = need_download[i::multi_processes]
p = multiprocessing.Process(
target=__subprocess_download, args=(process_datas,))
procs.append(p)
p.start()
# complete the processes
for proc in procs:
proc.join()
_logger.info("Finish {} multi process to download datas".format(
multi_processes))
local_downloads = []
for data in need_download:
data_name = os.path.basename(data)
re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
if re_path == ".":
local_re_path = os.path.join(local_path, data_name)
else:
local_re_path = os.path.join(local_path, re_path, data_name)
local_downloads.append(local_re_path)
return local_downloads
def getfilelist(path):
rlist = []
for dir,folder,file in os.walk(path):
for i in file:
t = os.path.join(dir,i)
rlist.append(t)
for r in rlist:
print(r)
def multi_upload(client,
hdfs_path,
local_path,
multi_processes=5,
overwrite=False,
sync=True):
"""
:param client:
:param hdfs_path:
:param local_path:
:param sync:
:return:
"""
def __subprocess_upload(datas):
for data in datas:
re_path = os.path.relpath(os.path.dirname(data), local_path)
hdfs_re_path = os.path.join(hdfs_path, re_path)
client.upload(hdfs_re_path, data, overwrite, retry_times=5)
def get_local_files(path):
rlist = []
if not os.path.isdir(path):
return rlist
for dirname, folder, files in os.walk(path):
for i in files:
t = os.path.join(dirname, i)
rlist.append(t)
return rlist
assert isinstance(client, HDFSClient)
all_files = get_local_files(local_path)
if not all_files:
_logger.info("there are nothing need to upload, exit")
return
_logger.info("Start {} multi process to upload datas".format(multi_processes))
procs = []
for i in range(multi_processes):
process_datas = all_files[i::multi_processes]
p = multiprocessing.Process(
target=__subprocess_upload, args=(process_datas,))
procs.append(p)
p.start()
# complete the processes
for proc in procs:
proc.join()
_logger.info("Finish {} multi process to upload datas".format(
multi_processes))
if __name__ == "__main__":
pass
\ No newline at end of file
......@@ -4,15 +4,50 @@ import math
dense_feature_dim = 13
def ctr_dnn_model(embedding_size, sparse_feature_dim):
dense_input = fluid.layers.data(
name="dense_input", shape=[dense_feature_dim], dtype='float32')
def ctr_deepfm_model(factor_size, sparse_feature_dim, dense_feature_dim, sparse_input):
def dense_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
"""
dense_fm_layer
"""
first_order = fluid.layers.fc(input=input, size=1)
emb_table = fluid.layers.create_parameter(shape=[emb_dict_size, factor_size],
dtype='float32', attr=fm_param_attr)
input_mul_factor = fluid.layers.matmul(input, emb_table)
input_mul_factor_square = fluid.layers.square(input_mul_factor)
input_square = fluid.layers.square(input)
factor_square = fluid.layers.square(emb_table)
input_square_mul_factor_square = fluid.layers.matmul(input_square, factor_square)
second_order = 0.5 * (input_mul_factor_square - input_square_mul_factor_square)
return first_order, second_order
def sparse_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
"""
sparse_fm_layer
"""
first_embeddings = fluid.layers.embedding(
input=input, dtype='float32', size=[emb_dict_size, 1], is_sparse=True)
first_order = fluid.layers.sequence_pool(input=first_embeddings, pool_type='sum')
nonzero_embeddings = fluid.layers.embedding(
input=input, dtype='float32', size=[emb_dict_size, factor_size],
param_attr=fm_param_attr, is_sparse=True)
summed_features_emb = fluid.layers.sequence_pool(input=nonzero_embeddings, pool_type='sum')
summed_features_emb_square = fluid.layers.square(summed_features_emb)
squared_features_emb = fluid.layers.square(nonzero_embeddings)
squared_sum_features_emb = fluid.layers.sequence_pool(
input=squared_features_emb, pool_type='sum')
second_order = 0.5 * (summed_features_emb_square - squared_sum_features_emb)
return first_order, second_order
dense_input = fluid.layers.data(name="dense_input", shape=[dense_feature_dim], dtype='float32')
sparse_input_ids = [
fluid.layers.data(
name="C" + str(i), shape=[1], lod_level=1, dtype='int64')
for i in range(1, 27)
]
fluid.layers.data(name="C" + str(i), shape=[1], lod_level=1, dtype='int64')
for i in range(1, 27)]
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
......@@ -24,6 +59,53 @@ def ctr_dnn_model(embedding_size, sparse_feature_dim):
use_double_buffer=True)
words = fluid.layers.read_file(py_reader)
sparse_fm_param_attr = fluid.param_attr.ParamAttr(name="SparseFeatFactors",
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(sparse_feature_dim)))
dense_fm_param_attr = fluid.param_attr.ParamAttr(name="DenseFeatFactors",
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(dense_feature_dim)))
sparse_fm_first, sparse_fm_second = sparse_fm_layer(
sparse_input, sparse_feature_dim, factor_size, sparse_fm_param_attr)
dense_fm_first, dense_fm_second = dense_fm_layer(
dense_input, dense_feature_dim, factor_size, dense_fm_param_attr)
def embedding_layer(input):
"""embedding_layer"""
emb = fluid.layers.embedding(
input=input, dtype='float32', size=[sparse_feature_dim, factor_size],
param_attr=sparse_fm_param_attr, is_sparse=True)
return fluid.layers.sequence_pool(input=emb, pool_type='average')
sparse_embed_seq = map(embedding_layer, sparse_input_ids)
concated = fluid.layers.concat(sparse_embed_seq + [dense_input], axis=1)
fc1 = fluid.layers.fc(input=concated, size=400, act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(concated.shape[1]))))
fc2 = fluid.layers.fc(input=fc1, size=400, act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc1.shape[1]))))
fc3 = fluid.layers.fc(input=fc2, size=400, act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc2.shape[1]))))
predict = fluid.layers.fc(
input=[sparse_fm_first, sparse_fm_second, dense_fm_first, dense_fm_second, fc3],
size=2,
act="softmax",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1 / math.sqrt(fc3.shape[1]))))
cost = fluid.layers.cross_entropy(input=predict, label=words[-1:])
avg_cost = fluid.layers.reduce_sum(cost)
accuracy = fluid.layers.accuracy(input=predict, label=words[-1:])
auc_var, batch_auc_var, auc_states = \
fluid.layers.auc(input=predict, label=words[-1:], num_thresholds=2 ** 12, slide_steps=20)
return avg_cost, auc_var, batch_auc_var, py_reader
def ctr_dnn_model(embedding_size, sparse_feature_dim):
def embedding_layer(input):
return fluid.layers.embedding(
input=input,
......@@ -32,24 +114,46 @@ def ctr_dnn_model(embedding_size, sparse_feature_dim):
# if you want to set is_distributed to True
is_distributed=False,
size=[sparse_feature_dim, embedding_size],
param_attr=fluid.ParamAttr(name="SparseFeatFactors", initializer=fluid.initializer.Uniform()))
param_attr=fluid.ParamAttr(name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()))
dense_input = fluid.layers.data(
name="dense_input", shape=[dense_feature_dim], dtype='float32')
sparse_input_ids = [
fluid.layers.data(name="C" + str(i), shape=[1], lod_level=1, dtype='int64')
for i in range(1, 27)]
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
datas = [dense_input] + sparse_input_ids + [label]
py_reader = fluid.layers.create_py_reader_by_data(capacity=64,
feed_list=datas,
name='py_reader',
use_double_buffer=True)
words = fluid.layers.read_file(py_reader)
sparse_embed_seq = map(embedding_layer, words[1:-1])
concated = fluid.layers.concat(sparse_embed_seq + words[0:1], axis=1)
fc1 = fluid.layers.fc(input=concated, size=400, act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1/math.sqrt(concated.shape[1]))))
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(concated.shape[1]))))
fc2 = fluid.layers.fc(input=fc1, size=400, act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1/math.sqrt(fc1.shape[1]))))
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc1.shape[1]))))
fc3 = fluid.layers.fc(input=fc2, size=400, act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1/math.sqrt(fc2.shape[1]))))
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc2.shape[1]))))
predict = fluid.layers.fc(input=fc3, size=2, act='softmax',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1/math.sqrt(fc3.shape[1]))))
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc3.shape[1]))))
cost = fluid.layers.cross_entropy(input=predict, label=words[-1:])
avg_cost = fluid.layers.reduce_sum(cost)
accuracy = fluid.layers.accuracy(input=predict, label=words[-1:])
auc_var, batch_auc_var, auc_states = \
fluid.layers.auc(input=predict, label=words[-1:], num_thresholds=2**12, slide_steps=20)
fluid.layers.auc(input=predict, label=words[-1:], num_thresholds=2 ** 12, slide_steps=20)
return avg_cost, auc_var, batch_auc_var, py_reader
......@@ -121,7 +121,7 @@ def train_loop(args, train_program, py_reader, loss, auc_var, batch_auc_var,
batch_size=args.batch_size)
py_reader.decorate_paddle_reader(train_reader)
data_name_list = None
data_name_list = []
place = fluid.CPUPlace()
exe = fluid.Executor(place)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册