Commit 53c664a9 authored by chengmo

fix conflict

repos:
  - repo: https://github.com/Lucas-C/pre-commit-hooks.git
    sha: v1.0.1
    hooks:
      - id: remove-crlf
        files: (?!.*third_party)^.*$ | (?!.*book)^.*$
  - repo: https://github.com/PaddlePaddle/mirrors-yapf.git
    sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
    hooks:
      - id: yapf
        files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
  - repo: https://github.com/pre-commit/pre-commit-hooks
    sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
    hooks:
      - id: check-added-large-files
      - id: check-merge-conflict
      - id: check-symlinks
      - id: detect-private-key
        files: (?!.*third_party)^.*$ | (?!.*book)^.*$
      - id: end-of-file-fixer
  - repo: local
    hooks:
      - id: copyright_checker
        name: copyright_checker
        entry: python ./tools/codestyle/copyright.hook
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
        exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
language: generic
sudo: required
dist: trusty
services:
  - docker
os:
  - linux
env:
  - JOB=check_style
before_install:
  # For pylint dockstring checker
  - sudo pip install pylint pytest astroid isort pre-commit
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
  - "travis_wait 30 sleep 1800 &"
  - |
    # 43min timeout
    tools/build_script.sh ${JOB}
    if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
notifications:
  email:
    on_success: change
    on_failure: always
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -27,6 +27,7 @@ from paddlerec.core.utils import envs
class ClusterEngine(Engine):
    def __init_impl__(self):
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        backend = envs.get_runtime_environ("engine_backend")
        if backend == "PaddleCloud":
            self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
@@ -57,4 +58,5 @@ class ClusterEngine(Engine):
            self.start_worker_procs()
        else:
            raise ValueError("role {} error, must in MASTER/WORKER".format(
                role))
@@ -46,10 +46,13 @@ class LocalClusterEngine(Engine):
                    ports.append(new_port)
                    break
        user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
        user_endpoints_ips = [
            x.split(":")[0] for x in user_endpoints.split(",")
        ]
        user_endpoints_port = [
            x.split(":")[1] for x in user_endpoints.split(",")
        ]
        factory = "paddlerec.core.factory"
        cmd = [sys.executable, "-u", "-m", factory, self.trainer]
@@ -97,8 +100,10 @@ class LocalClusterEngine(Engine):
                if len(log_fns) > 0:
                    log_fns[i].close()
                procs[i].terminate()
            print(
                "all workers already completed, you can view logs under the `{}` directory".
                format(logs_dir),
                file=sys.stderr)

    def run(self):
        self.start_procs()
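For illustration, a minimal standalone sketch of the endpoint bookkeeping shown above (the port numbers are made up): every local worker is given a 127.0.0.1 endpoint, and the ip and port lists are then derived back from the joined string.

    ports = [36001, 36002]                                                     # hypothetical free ports
    user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])          # "127.0.0.1:36001,127.0.0.1:36002"
    user_endpoints_ips = [x.split(":")[0] for x in user_endpoints.split(",")]  # ['127.0.0.1', '127.0.0.1']
    user_endpoints_port = [x.split(":")[1] for x in user_endpoints.split(",")] # ['36001', '36002']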
@@ -26,7 +26,6 @@ from paddlerec.core.engine.engine import Engine
class LocalMPIEngine(Engine):
    def start_procs(self):
        logs_dir = self.envs["log_dir"]
        default_env = os.environ.copy()
        current_env = copy.copy(default_env)
        current_env.pop("http_proxy", None)
@@ -42,7 +41,8 @@ class LocalMPIEngine(Engine):
                os.system("mkdir -p {}".format(logs_dir))
                fn = open("%s/job.log" % logs_dir, "w")
                log_fns.append(fn)
                proc = subprocess.Popen(
                    cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd())
            else:
                proc = subprocess.Popen(cmd, env=current_env, cwd=os.getcwd())
            procs.append(proc)
@@ -51,7 +51,9 @@ class LocalMPIEngine(Engine):
            if len(log_fns) > 0:
                log_fns[i].close()
            procs[i].wait()
        print(
            "all workers and parameter servers already completed",
            file=sys.stderr)

    def run(self):
        self.start_procs()
@@ -19,24 +19,23 @@ import yaml
from paddlerec.core.utils import envs

trainer_abs = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "trainers")
trainers = {}


def trainer_registry():
    trainers["SingleTrainer"] = os.path.join(trainer_abs, "single_trainer.py")
    trainers["ClusterTrainer"] = os.path.join(trainer_abs,
                                              "cluster_trainer.py")
    trainers["CtrCodingTrainer"] = os.path.join(trainer_abs,
                                                "ctr_coding_trainer.py")
    trainers["CtrModulTrainer"] = os.path.join(trainer_abs,
                                               "ctr_modul_trainer.py")
    trainers["TDMSingleTrainer"] = os.path.join(trainer_abs,
                                                "tdm_single_trainer.py")
    trainers["TDMClusterTrainer"] = os.path.join(trainer_abs,
                                                 "tdm_cluster_trainer.py")


trainer_registry()
@@ -55,8 +54,8 @@ class TrainerFactory(object):
        if trainer_abs is None:
            if not os.path.isfile(train_mode):
                raise IOError("trainer {} can not be recognized".format(
                    train_mode))
            trainer_abs = train_mode
            train_mode = "UserDefineTrainer"
...
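For context, a rough sketch of how the factory lookup above behaves (the path shown is illustrative, not the real install layout): a registered mode resolves to a bundled trainer file, while anything else must be the path of an existing user-defined trainer script.

    trainers = {"SingleTrainer": "trainers/single_trainer.py"}  # hypothetical registry content
    train_mode = "SingleTrainer"
    trainer_abs = trainers.get(train_mode)                      # -> "trainers/single_trainer.py"
    # an unregistered mode that is not an existing file raises:
    # IOError("trainer {} can not be recognized".format(train_mode))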
@@ -22,7 +22,7 @@ from paddlerec.core.metric import Metric
class AUCMetric(Metric):
    """
    Metric For Fluid Model
    """

    def __init__(self, config, fleet):
@@ -83,7 +83,8 @@ class AUCMetric(Metric):
            if scope.find_var(metric_item['var'].name) is None:
                result[metric_name] = None
                continue
            result[metric_name] = self.get_metric(scope,
                                                  metric_item['var'].name)
        return result

    def calculate_auc(self, global_pos, global_neg):
@@ -178,14 +179,18 @@ class AUCMetric(Metric):
            self._result['mean_q'] = 0
            return self._result
        if 'stat_pos' in result and 'stat_neg' in result:
            result['auc'] = self.calculate_auc(result['stat_pos'],
                                               result['stat_neg'])
            result['bucket_error'] = self.calculate_auc(result['stat_pos'],
                                                        result['stat_neg'])
        if 'pos_ins_num' in result:
            result['actual_ctr'] = result['pos_ins_num'] / result[
                'total_ins_num']
        if 'abserr' in result:
            result['mae'] = result['abserr'] / result['total_ins_num']
        if 'sqrerr' in result:
            result['rmse'] = math.sqrt(result['sqrerr'] /
                                       result['total_ins_num'])
        if 'prob' in result:
            result['predict_ctr'] = result['prob'] / result['total_ins_num']
            if abs(result['predict_ctr']) > 1e-6:
...
@@ -20,7 +20,7 @@ from paddlerec.core.utils import envs
class Model(object):
    """Base Model
    """
    __metaclass__ = abc.ABCMeta
@@ -38,6 +38,45 @@ class Model(object):
        self._namespace = "train.model"
        self._platform = envs.get_platform()

    def _init_slots(self):
        sparse_slots = envs.get_global_env("sparse_slots", None,
                                           "train.reader")
        dense_slots = envs.get_global_env("dense_slots", None, "train.reader")

        if sparse_slots is not None or dense_slots is not None:
            sparse_slots = sparse_slots.strip().split(" ")
            dense_slots = dense_slots.strip().split(" ")
            dense_slots_shape = [[
                int(j) for j in i.split(":")[1].strip("[]").split(",")
            ] for i in dense_slots]
            dense_slots = [i.split(":")[0] for i in dense_slots]
            self._dense_data_var = []
            for i in range(len(dense_slots)):
                l = fluid.layers.data(
                    name=dense_slots[i],
                    shape=dense_slots_shape[i],
                    dtype="float32")
                self._data_var.append(l)
                self._dense_data_var.append(l)
            self._sparse_data_var = []
            for name in sparse_slots:
                l = fluid.layers.data(
                    name=name, shape=[1], lod_level=1, dtype="int64")
                self._data_var.append(l)
                self._sparse_data_var.append(l)

            dataset_class = envs.get_global_env("dataset_class", None,
                                                "train.reader")
            if dataset_class == "DataLoader":
                self._init_dataloader()

    def _init_dataloader(self):
        self._data_loader = fluid.io.DataLoader.from_generator(
            feed_list=self._data_var,
            capacity=64,
            use_double_buffer=False,
            iterable=False)

    def get_inputs(self):
        return self._data_var
@@ -68,8 +107,8 @@ class Model(object):
                "configured optimizer can only supported SGD/Adam/Adagrad")

        if name == "SGD":
            reg = envs.get_global_env("hyper_parameters.reg", 0.0001,
                                      self._namespace)
            optimizer_i = fluid.optimizer.SGD(
                lr, regularization=fluid.regularizer.L2DecayRegularizer(reg))
        elif name == "ADAM":
@@ -83,10 +122,10 @@ class Model(object):
        return optimizer_i

    def optimizer(self):
        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
                                            None, self._namespace)
        optimizer = envs.get_global_env("hyper_parameters.optimizer", None,
                                        self._namespace)
        print(">>>>>>>>>>>.learnig rate: %s" % learning_rate)
        return self._build_optimizer(optimizer, learning_rate)
...
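For illustration, a minimal sketch of how the sparse_slots / dense_slots strings read from train.reader are parsed by _init_slots above (the slot names and shapes here are made up):

    sparse_slots = "click 7001 7002"          # hypothetical config value
    dense_slots = "dense_feature:[13]"        # hypothetical config value

    dense = dense_slots.strip().split(" ")
    dense_shapes = [[int(j) for j in i.split(":")[1].strip("[]").split(",")]
                    for i in dense]                  # -> [[13]]
    dense_names = [i.split(":")[0] for i in dense]   # -> ['dense_feature']
    sparse_names = sparse_slots.strip().split(" ")   # -> ['click', '7001', '7002']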
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -31,6 +31,7 @@ def create(config):
        Model Instance
    """
    model = None

    if config['mode'] == 'fluid':
        model = YamlModel(config)
        model.train_net()
@@ -50,7 +51,12 @@ class YamlModel(Model):
        f = open(config['layer_file'], 'r')
        self._build_nodes = yaml.safe_load(f.read())
        self._build_phase = ['input', 'param', 'summary', 'layer']
        self._build_param = {
            'layer': {},
            'inner_layer': {},
            'layer_extend': {},
            'model': {}
        }
        self._inference_meta = {'dependency': {}, 'params': {}}

    def train_net(self):
@@ -76,10 +82,12 @@ class YamlModel(Model):
            if self._build_nodes[phase] is None:
                continue
            for node in self._build_nodes[phase]:
                exec ("""layer=layer.{}(node)""".format(node['class']))
                layer_output, extend_output = layer.generate(
                    self._config['mode'], self._build_param)
                self._build_param['layer'][node['name']] = layer_output
                self._build_param['layer_extend'][node[
                    'name']] = extend_output
                if extend_output is None:
                    continue
                if 'loss' in extend_output:
@@ -89,17 +97,24 @@ class YamlModel(Model):
                        self._cost += extend_output['loss']
                if 'data_var' in extend_output:
                    self._data_var += extend_output['data_var']
                if 'metric_label' in extend_output and extend_output[
                        'metric_label'] is not None:
                    self._metrics[extend_output[
                        'metric_label']] = extend_output['metric_dict']
                if 'inference_param' in extend_output:
                    inference_param = extend_output['inference_param']
                    param_name = inference_param['name']
                    if param_name not in self._build_param['table']:
                        self._build_param['table'][param_name] = {
                            'params': []
                        }
                        table_meta = table.TableMeta.alloc_new_table(
                            inference_param['table_id'])
                        self._build_param['table'][param_name][
                            '_meta'] = table_meta
                    self._build_param['table'][param_name][
                        'params'] += inference_param['params']
        pass
    @classmethod
@@ -114,20 +129,25 @@ class YamlModel(Model):
        metrics = params['metrics']
        for name in metrics:
            model_metrics = metrics[name]
            stat_var_names += [
                model_metrics[metric]['var'].name
                for metric in model_metrics
            ]
        strategy['stat_var_names'] = list(set(stat_var_names))
        optimizer_generator = 'optimizer = fluid.optimizer.' + optimizer_conf['class'] + \
            '(learning_rate=' + str(optimizer_conf['learning_rate']) + ')'
        exec (optimizer_generator)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        return optimizer

    def dump_model_program(self, path):
        """R
        """
        with open(path + '/' + self._name + '_main_program.pbtxt',
                  "w") as fout:
            print >> fout, self._build_param['model']['train_program']
        with open(path + '/' + self._name + '_startup_program.pbtxt',
                  "w") as fout:
            print >> fout, self._build_param['model']['startup_program']
        pass
@@ -137,7 +157,8 @@ class YamlModel(Model):
        scope = params['scope']
        decay = params['decay']
        for param_table in self._build_param['table']:
            table_id = self._build_param['table'][param_table][
                '_meta']._table_id
            fleet.shrink_dense_table(decay, scope=scope, table_id=table_id)

    def dump_inference_program(self, inference_layer, path):
@@ -152,17 +173,25 @@ class YamlModel(Model):
        executor = params['executor']
        program = self._build_param['model']['train_program']
        for table_name, table in self._build_param['table'].items():
            fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id,
                                        table['params'])
        for infernce_item in params['inference_list']:
            params_name_list = self.inference_params(infernce_item[
                'layer_name'])
            params_var_list = [
                program.global_block().var(i) for i in params_name_list
            ]
            params_file_name = infernce_item['save_file_name']
            with fluid.scope_guard(scope):
                if params['save_combine']:
                    fluid.io.save_vars(executor, "./", \
                        program, vars=params_var_list, filename=params_file_name)
                else:
                    fluid.io.save_vars(
                        executor,
                        params_file_name,
                        program,
                        vars=params_var_list)
    def inference_params(self, inference_layer):
        """
@@ -177,11 +206,13 @@ class YamlModel(Model):
            return self._inference_meta['params'][layer]

        self._inference_meta['params'][layer] = []
        self._inference_meta['dependency'][layer] = self.get_dependency(
            self._build_param['inner_layer'], layer)
        for node in self._build_nodes['layer']:
            if node['name'] not in self._inference_meta['dependency'][layer]:
                continue
            if 'inference_param' in self._build_param['layer_extend'][node[
                    'name']]:
                self._inference_meta['params'][layer] += \
                    self._build_param['layer_extend'][node['name']]['inference_param']['params']
        return self._inference_meta['params'][layer]
@@ -199,5 +230,6 @@ class YamlModel(Model):
        dependencys = copy.deepcopy(layer_graph[dest_layer]['input'])
        dependency_list = copy.deepcopy(dependencys)
        for dependency in dependencys:
            dependency_list = dependency_list + self.get_dependency(
                layer_graph, dependency)
        return list(set(dependency_list))
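For context, a small self-contained sketch of what get_dependency above computes over the inner layer graph (the layer names are invented): it walks the 'input' edges transitively and returns the de-duplicated set of upstream layers.

    layer_graph = {
        'ctr_output': {'input': ['fc_2']},
        'fc_2': {'input': ['fc_1']},
        'fc_1': {'input': ['embedding']},
        'embedding': {'input': []},
    }
    # get_dependency(layer_graph, 'ctr_output') would return, in some order,
    # ['fc_2', 'fc_1', 'embedding']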
@@ -18,7 +18,7 @@ from paddlerec.core.layer import Layer
class EmbeddingFuseLayer(Layer):
    """embedding + sequence + concat
    """

    def __init__(self, config):
@@ -40,7 +40,8 @@ class EmbeddingFuseLayer(Layer):
        show_clk.stop_gradient = True
        data_var = []
        for slot in self._slots:
            l = fluid.layers.data(
                name=slot, shape=[1], dtype="int64", lod_level=1)
            data_var.append(l)
            emb = fluid.layers.embedding(input=l, size=[10, self._emb_dim], \
                is_sparse=True, is_distributed=True,
@@ -48,7 +49,8 @@ class EmbeddingFuseLayer(Layer):
            emb = fluid.layers.sequence_pool(input=emb, pool_type='sum')
            emb = fluid.layers.continuous_value_model(emb, show_clk, self._cvm)
            self._emb_layers.append(emb)
        output = fluid.layers.concat(
            input=self._emb_layers, axis=1, name=self._name)
        return output, {'data_var': data_var}
@@ -111,7 +113,13 @@ class ParamLayer(Layer):
    def generate(self, param):
        """R
        """
        return self._config, {
            'inference_param': {
                'name': 'param',
                'params': [],
                'table_id': self._table_id
            }
        }


class SummaryLayer(Layer):
@@ -129,10 +137,16 @@ class SummaryLayer(Layer):
    def generate(self, param):
        """R
        """
        return self._config, {
            'inference_param': {
                'name': 'summary',
                'params': [],
                'table_id': self._table_id
            }
        }
class NormalizationLayer(Layer):
    """R
    """
@@ -152,9 +166,19 @@ class NormalizetionLayer(Layer):
        if len(self._input) > 0:
            input_list = [param['layer'][i] for i in self._input]
            input_layer = fluid.layers.concat(input=input_list, axis=1)
        bn = fluid.layers.data_norm(
            input=input_layer,
            name=self._name,
            epsilon=1e-4,
            param_attr={
                "batch_size": 1e4,
                "batch_sum_default": 0.0,
                "batch_square": 1e4
            })
        inference_param = [
            self._name + '.batch_size', self._name + '.batch_sum',
            self._name + '.batch_square_sum'
        ]
        return bn, {'inference_param': {'name': 'summary', \
            'params': inference_param, 'table_id': summary_layer.get('table_id', -1)}}
@@ -181,11 +205,13 @@ class FCLayer(Layer):
        input_list = [param['layer'][i] for i in self._input]
        input_layer = fluid.layers.concat(input=input_list, axis=1)
        input_coln = input_layer.shape[1]
        scale = param_layer['init_range'] / (input_coln**0.5)
        bias = None
        if self._bias:
            bias = fluid.ParamAttr(
                learning_rate=1.0,
                initializer=fluid.initializer.NormalInitializer(
                    loc=0.0, scale=scale))
        fc = fluid.layers.fc(
            name=self._name,
            input=input_layer,
@@ -216,18 +242,46 @@ class LogLossLayer(Layer):
        self._extend_output = {
            'metric_label': self._metric_label,
            'metric_dict': {
                'auc': {
                    'var': None
                },
                'batch_auc': {
                    'var': None
                },
                'stat_pos': {
                    'var': None,
                    'data_type': 'int64'
                },
                'stat_neg': {
                    'var': None,
                    'data_type': 'int64'
                },
                'batch_stat_pos': {
                    'var': None,
                    'data_type': 'int64'
                },
                'batch_stat_neg': {
                    'var': None,
                    'data_type': 'int64'
                },
                'pos_ins_num': {
                    'var': None
                },
                'abserr': {
                    'var': None
                },
                'sqrerr': {
                    'var': None
                },
                'prob': {
                    'var': None
                },
                'total_ins_num': {
                    'var': None
                },
                'q': {
                    'var': None
                }
            }
        }
@@ -236,9 +290,12 @@ class LogLossLayer(Layer):
        """
        input_layer = param['layer'][self._input[0]]
        label_layer = param['layer'][self._label]
        output = fluid.layers.clip(
            input_layer, self._bound[0], self._bound[1], name=self._name)
        norm = fluid.layers.sigmoid(output, name=self._name)
        output = fluid.layers.log_loss(
            norm, fluid.layers.cast(
                x=label_layer, dtype='float32'))
        if self._weight:
            weight_layer = param['layer'][self._weight]
            output = fluid.layers.elementwise_mul(output, weight_layer)
@@ -248,7 +305,11 @@ class LogLossLayer(Layer):
        # For AUC Metric
        metric = self._extend_output['metric_dict']
        binary_predict = fluid.layers.concat(
            input=[
                fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm),
                norm
            ],
            axis=1)
        metric['auc']['var'], metric['batch_auc']['var'], [metric['batch_stat_pos']['var'], \
            metric['batch_stat_neg']['var'], metric['stat_pos']['var'],
            metric['stat_neg']['var']] = \
...
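A short note on the binary_predict construction above: for a sigmoid output p in (0, 1), ceil(p) is 1, so elementwise_sub(ceil(norm), norm) is simply 1 - p, and the concat produces the two-column [P(negative), P(positive)] layout typically fed to an AUC op. A plain-Python sketch with a made-up value:

    p = 0.73
    binary_predict_row = [1.0 - p, p]   # roughly [0.27, 0.73]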
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import abc
@@ -44,3 +45,65 @@ class Reader(dg.MultiSlotDataGenerator):
    @abc.abstractmethod
    def generate_sample(self, line):
        pass


class SlotReader(dg.MultiSlotDataGenerator):
    __metaclass__ = abc.ABCMeta

    def __init__(self, config):
        dg.MultiSlotDataGenerator.__init__(self)
        if os.path.isfile(config):
            with open(config, 'r') as rb:
                _config = yaml.load(rb.read(), Loader=yaml.FullLoader)
        else:
            raise ValueError("reader config only support yaml")
        envs.set_global_envs(_config)
        envs.update_workspace()

    def init(self, sparse_slots, dense_slots, padding=0):
        from operator import mul
        self.sparse_slots = sparse_slots.strip().split(" ")
        self.dense_slots = dense_slots.strip().split(" ")
        self.dense_slots_shape = [
            reduce(mul,
                   [int(j) for j in i.split(":")[1].strip("[]").split(",")])
            for i in self.dense_slots
        ]
        self.dense_slots = [i.split(":")[0] for i in self.dense_slots]
        self.slots = self.dense_slots + self.sparse_slots
        self.slot2index = {}
        self.visit = {}
        for i in range(len(self.slots)):
            self.slot2index[self.slots[i]] = i
            self.visit[self.slots[i]] = False
        self.padding = padding

    def generate_sample(self, l):
        def reader():
            line = l.strip().split(" ")
            output = [(i, []) for i in self.slots]
            for i in line:
                slot_feasign = i.split(":")
                slot = slot_feasign[0]
                if slot not in self.slots:
                    continue
                if slot in self.sparse_slots:
                    feasign = int(slot_feasign[1])
                else:
                    feasign = float(slot_feasign[1])
                output[self.slot2index[slot]][1].append(feasign)
                self.visit[slot] = True
            for i in self.visit:
                slot = i
                if not self.visit[slot]:
                    if i in self.dense_slots:
                        output[self.slot2index[i]][1].extend(
                            [self.padding] *
                            self.dense_slots_shape[self.slot2index[i]])
                    else:
                        output[self.slot2index[i]][1].extend([self.padding])
                else:
                    self.visit[slot] = False
            yield output

        return reader
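For illustration, a minimal sketch of the slot-format line that SlotReader above consumes (the slot names and values are made up): every whitespace-separated token is slot:feasign, sparse slots accumulate integer ids, dense slots accumulate floats, and slots absent from the line are padded.

    # hypothetical reader initialized with init(sparse_slots="click 1", dense_slots="dense_feature:[3]")
    line = "click:1 1:523 1:88 dense_feature:0.5 dense_feature:0.3 dense_feature:0.9"
    # generate_sample(line)() would yield roughly:
    # [('dense_feature', [0.5, 0.3, 0.9]), ('click', [1]), ('1', [523, 88])]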
@@ -30,8 +30,10 @@ class Trainer(object):
    def __init__(self, config=None):
        self._status_processor = {}
        self._place = fluid.CPUPlace()
        self._exe = fluid.Executor(self._place)
        self._exector_context = {}
        self._context = {'status': 'uninit', 'is_exit': False}
        self._config_yaml = config
@@ -95,6 +97,6 @@ def user_define_engine(engine_yaml):
    train_dirname = os.path.dirname(train_location)
    base_name = os.path.splitext(os.path.basename(train_location))[0]
    sys.path.append(train_dirname)
    trainer_class = envs.lazy_instance_by_fliename(base_name,
                                                   "UserDefineTraining")
    return trainer_class
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
trainer implement.
@@ -22,5 +21,3 @@ Trainer
         ↘ (for online learning training) OnlineLearningTrainer
"""
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
@@ -43,11 +42,14 @@ class ClusterTrainer(TranspileTrainer):
        self.regist_context_processor('uninit', self.instance)
        self.regist_context_processor('init_pass', self.init)
        self.regist_context_processor('startup_pass', self.startup)

        if envs.get_platform() == "LINUX" and envs.get_global_env(
                "dataset_class", None, "train.reader") != "DataLoader":
            self.regist_context_processor('train_pass', self.dataset_train)
        else:
            self.regist_context_processor('train_pass',
                                          self.dataloader_train)

        self.regist_context_processor('infer_pass', self.infer)
        self.regist_context_processor('terminal_pass', self.terminal)
@@ -75,8 +77,8 @@ class ClusterTrainer(TranspileTrainer):
    def init(self, context):
        self.model.train_net()
        optimizer = self.model.optimizer()
        optimizer_name = envs.get_global_env("hyper_parameters.optimizer",
                                             None, "train.model")
        if optimizer_name not in ["", "sgd", "SGD", "Sgd"]:
            os.environ["FLAGS_communicator_is_sgd_optimizer"] = '0'
@@ -114,9 +116,9 @@ class ClusterTrainer(TranspileTrainer):
        program = fluid.compiler.CompiledProgram(
            fleet.main_program).with_data_parallel(
                loss_name=self.model.get_avg_cost().name,
                build_strategy=self.strategy.get_build_strategy(),
                exec_strategy=self.strategy.get_execute_strategy())

        metrics_varnames = []
        metrics_format = []
@@ -135,9 +137,8 @@ class ClusterTrainer(TranspileTrainer):
        batch_id = 0
        try:
            while True:
                metrics_rets = self._exe.run(program=program,
                                             fetch_list=metrics_varnames)
                metrics = [epoch, batch_id]
                metrics.extend(metrics_rets)
@@ -162,14 +163,16 @@ class ClusterTrainer(TranspileTrainer):
        for i in range(epochs):
            begin_time = time.time()
            self._exe.train_from_dataset(
                program=fluid.default_main_program(),
                dataset=dataset,
                fetch_list=self.fetch_vars,
                fetch_info=self.fetch_alias,
                print_period=self.fetch_period)
            end_time = time.time()
            times = end_time - begin_time
            print("epoch {} using time {}, speed {:.2f} lines/s".format(
                i, times, ins / times))
            self.save(i, "train", is_fleet=True)

        fleet.stop_worker()
...
@@ -59,8 +59,10 @@ class CtrTrainer(Trainer):
        reader_class = envs.get_global_env("class", None, namespace)
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
        pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN",
                                               self._config_yaml)
        train_data_path = envs.get_global_env("train_data_path", None,
                                              namespace)

        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(inputs)
@@ -87,7 +89,8 @@ class CtrTrainer(Trainer):
        self.model.train_net()
        optimizer = self.model.optimizer()

        optimizer = fleet.distributed_optimizer(
            optimizer, strategy={"use_cvm": False})
        optimizer.minimize(self.model.get_avg_cost())

        if fleet.is_server():
@@ -118,16 +121,18 @@ class CtrTrainer(Trainer):
        gs = shuf * 0
        fleet._role_maker._node_type_comm.Allreduce(shuf, gs)
        print("trainer id: {}, trainers: {}, gs: {}".format(fleet.worker_index(
        ), fleet.worker_num(), gs))

        epochs = envs.get_global_env("train.epochs")
        for i in range(epochs):
            self._exe.train_from_dataset(
                program=fluid.default_main_program(),
                dataset=dataset,
                fetch_list=self.fetch_vars,
                fetch_info=self.fetch_alias,
                print_period=self.fetch_period)

        context['status'] = 'terminal_pass'
        fleet.stop_worker()
...
(This diff has been collapsed.)
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
@@ -44,11 +43,14 @@ class OnlineLearningTrainer(TranspileTrainer):
        self.regist_context_processor('uninit', self.instance)
        self.regist_context_processor('init_pass', self.init)
        self.regist_context_processor('startup_pass', self.startup)

        if envs.get_platform() == "LINUX" and envs.get_global_env(
                "dataset_class", None, "train.reader") != "DataLoader":
            self.regist_context_processor('train_pass', self.dataset_train)
        else:
            self.regist_context_processor('train_pass',
                                          self.dataloader_train)

        self.regist_context_processor('infer_pass', self.infer)
        self.regist_context_processor('terminal_pass', self.terminal)
@@ -110,27 +112,27 @@ class OnlineLearningTrainer(TranspileTrainer):
        if state == "TRAIN":
            inputs = self.model.get_inputs()
            namespace = "train.reader"
            train_data_path = envs.get_global_env("train_data_path", None,
                                                  namespace)
        else:
            inputs = self.model.get_infer_inputs()
            namespace = "evaluate.reader"
            train_data_path = envs.get_global_env("test_data_path", None,
                                                  namespace)

        threads = int(envs.get_runtime_environ("train.trainer.threads"))
        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_class = envs.get_global_env("class", None, namespace)
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
        pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state,
                                               self._config_yaml)

        if train_data_path.startswith("paddlerec::"):
            package_base = envs.get_runtime_environ("PACKAGE_BASE")
            assert package_base is not None
            train_data_path = os.path.join(package_base,
                                           train_data_path.split("::")[1])

        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(inputs)
@@ -166,14 +168,16 @@ class OnlineLearningTrainer(TranspileTrainer):
            ins = self._get_dataset_ins()

            begin_time = time.time()
            self._exe.train_from_dataset(
                program=fluid.default_main_program(),
                dataset=dataset,
                fetch_list=self.fetch_vars,
                fetch_info=self.fetch_alias,
                print_period=self.fetch_period)
            end_time = time.time()
            times = end_time - begin_time
            print("epoch {} using time {}, speed {:.2f} lines/s".format(
                i, times, ins / times))
            self.save(i, "train", is_fleet=True)

        fleet.stop_worker()
...
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
@@ -36,8 +35,9 @@ class SingleTrainer(TranspileTrainer):
        self.regist_context_processor('uninit', self.instance)
        self.regist_context_processor('init_pass', self.init)
        self.regist_context_processor('startup_pass', self.startup)

        if envs.get_platform() == "LINUX" and envs.get_global_env(
                "dataset_class", None, "train.reader") != "DataLoader":
            self.regist_context_processor('train_pass', self.dataset_train)
        else:
            self.regist_context_processor('train_pass', self.dataloader_train)
@@ -73,9 +73,8 @@ class SingleTrainer(TranspileTrainer):
        reader = self._get_dataloader("TRAIN")
        epochs = envs.get_global_env("train.epochs")

        program = fluid.compiler.CompiledProgram(fluid.default_main_program(
        )).with_data_parallel(loss_name=self.model.get_avg_cost().name)

        metrics_varnames = []
        metrics_format = []
@@ -94,9 +93,8 @@ class SingleTrainer(TranspileTrainer):
        batch_id = 0
        try:
            while True:
                metrics_rets = self._exe.run(program=program,
                                             fetch_list=metrics_varnames)
                metrics = [epoch, batch_id]
                metrics.extend(metrics_rets)
@@ -117,14 +115,16 @@ class SingleTrainer(TranspileTrainer):
        epochs = envs.get_global_env("train.epochs")
        for i in range(epochs):
            begin_time = time.time()
            self._exe.train_from_dataset(
                program=fluid.default_main_program(),
                dataset=dataset,
                fetch_list=self.fetch_vars,
                fetch_info=self.fetch_alias,
                print_period=self.fetch_period)
            end_time = time.time()
            times = end_time - begin_time
            print("epoch {} using time {}, speed {:.2f} lines/s".format(
                i, times, ins / times))
            self.save(i, "train", is_fleet=False)

        context['status'] = 'infer_pass'
...
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
@@ -36,8 +35,8 @@ special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info"]
class TDMClusterTrainer(ClusterTrainer):
    def server(self, context):
        namespace = "train.startup"
        init_model_path = envs.get_global_env("cluster.init_model_path", "",
                                              namespace)
        assert init_model_path != "", "Cluster train must has init_model for TDM"
        fleet.init_server(init_model_path)
        logger.info("TDM: load model from {}".format(init_model_path))
@@ -48,24 +47,27 @@ class TDMClusterTrainer(ClusterTrainer):
        self._exe.run(fleet.startup_program)

        namespace = "train.startup"
        load_tree = envs.get_global_env("tree.load_tree", True, namespace)
        self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "",
                                                   namespace)

        self.tree_travel_path = envs.get_global_env("tree.tree_travel_path",
                                                    "", namespace)

        self.tree_info_path = envs.get_global_env("tree.tree_info_path", "",
                                                  namespace)

        save_init_model = envs.get_global_env("cluster.save_init_model", False,
                                              namespace)
        init_model_path = envs.get_global_env("cluster.init_model_path", "",
                                              namespace)

        if load_tree:
            # covert tree to tensor, set it into Fluid's variable.
            for param_name in special_param:
                param_t = fluid.global_scope().find_var(param_name).get_tensor(
                )
                param_array = self._tdm_prepare(param_name)
                param_t.set(param_array.astype('int32'), self._place)
@@ -93,8 +95,8 @@ class TDMClusterTrainer(ClusterTrainer):
    def _tdm_travel_prepare(self):
        """load tdm tree param from npy/list file"""
        travel_array = np.load(self.tree_travel_path)
        logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[
            0]))
        return travel_array

    def _tdm_layer_prepare(self):
...
...@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
...@@ -27,33 +26,38 @@ from paddlerec.core.utils import envs
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
special_param = [
    "TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info", "TDM_Tree_Emb"
]


class TDMSingleTrainer(SingleTrainer):
    def startup(self, context):
        namespace = "train.startup"
        load_persistables = envs.get_global_env("single.load_persistables", False, namespace)
        persistables_model_path = envs.get_global_env("single.persistables_model_path", "", namespace)

        load_tree = envs.get_global_env("tree.load_tree", False, namespace)
        self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "", namespace)
        self.tree_travel_path = envs.get_global_env("tree.tree_travel_path", "", namespace)
        self.tree_info_path = envs.get_global_env("tree.tree_info_path", "", namespace)
        self.tree_emb_path = envs.get_global_env("tree.tree_emb_path", "", namespace)

        save_init_model = envs.get_global_env("single.save_init_model", False, namespace)
        init_model_path = envs.get_global_env("single.init_model_path", "", namespace)

        self._exe.run(fluid.default_startup_program())

        if load_persistables:
...@@ -68,7 +72,8 @@ class TDMSingleTrainer(SingleTrainer):
        if load_tree:
            # covert tree to tensor, set it into Fluid's variable.
            for param_name in special_param:
                param_t = fluid.global_scope().find_var(param_name).get_tensor()
                param_array = self._tdm_prepare(param_name)
                if param_name == 'TDM_Tree_Emb':
                    param_t.set(param_array.astype('float32'), self._place)
...@@ -102,15 +107,15 @@ class TDMSingleTrainer(SingleTrainer):
    def _tdm_travel_prepare(self):
        """load tdm tree param from npy/list file"""
        travel_array = np.load(self.tree_travel_path)
        logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[0]))
        return travel_array

    def _tdm_emb_prepare(self):
        """load tdm tree param from npy/list file"""
        emb_array = np.load(self.tree_emb_path)
        logger.info("TDM Tree node nums from emb: {}".format(emb_array.shape[0]))
        return emb_array

    def _tdm_layer_prepare(self):
......
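To make the tree-loading step above easier to follow, here is a small sketch in plain numpy of what `_tdm_prepare` plus `param_t.set(...)` amount to. The `scope` dict and `load_special_param` helper are illustrative stand-ins, not PaddleRec APIs.

```python
import numpy as np

# Illustrative stand-in for fluid.global_scope(): a dict mapping parameter
# names to the buffers that play the role of the framework tensors.
scope = {name: None for name in
         ["TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info", "TDM_Tree_Emb"]}

def load_special_param(param_name, npy_path, scope):
    """Load one tree parameter from an .npy file and 'set' it into the scope.

    TDM_Tree_Emb holds float embeddings; the other tree tables are integer
    indices, matching the astype() calls in the trainer above.
    """
    array = np.load(npy_path)
    dtype = 'float32' if param_name == 'TDM_Tree_Emb' else 'int32'
    scope[param_name] = array.astype(dtype)
    return scope[param_name]
```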
...@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with DistributeTranspiler
"""
...@@ -23,6 +22,7 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import f
from paddlerec.core.trainer import Trainer
from paddlerec.core.utils import envs
from paddlerec.core.utils import dataloader_instance
from paddlerec.core.reader import SlotReader


class TranspileTrainer(Trainer):
...@@ -38,9 +38,12 @@ class TranspileTrainer(Trainer):
        self.increment_models = []

    def processor_register(self):
        print(
            "Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first"
        )

    def _get_dataloader(self, state="TRAIN"):
        if state == "TRAIN":
            dataloader = self.model._data_loader
            namespace = "train.reader"
...@@ -50,14 +53,24 @@ class TranspileTrainer(Trainer):
            namespace = "evaluate.reader"
            class_name = "EvaluateReader"

        sparse_slots = envs.get_global_env("sparse_slots", None, namespace)
        dense_slots = envs.get_global_env("dense_slots", None, namespace)

        batch_size = envs.get_global_env("batch_size", None, namespace)
        print("batch_size: {}".format(batch_size))

        if sparse_slots is None and dense_slots is None:
            reader_class = envs.get_global_env("class", None, namespace)
            reader = dataloader_instance.dataloader(reader_class, state, self._config_yaml)
            reader_class = envs.lazy_instance_by_fliename(reader_class, class_name)
            reader_ins = reader_class(self._config_yaml)
        else:
            reader = dataloader_instance.slotdataloader("", state, self._config_yaml)
            reader_ins = SlotReader(self._config_yaml)

        if hasattr(reader_ins, 'generate_batch_from_trainfiles'):
            dataloader.set_sample_list_generator(reader)
        else:
...@@ -85,27 +98,37 @@ class TranspileTrainer(Trainer):
        if state == "TRAIN":
            inputs = self.model.get_inputs()
            namespace = "train.reader"
            train_data_path = envs.get_global_env("train_data_path", None, namespace)
        else:
            inputs = self.model.get_infer_inputs()
            namespace = "evaluate.reader"
            train_data_path = envs.get_global_env("test_data_path", None, namespace)

        sparse_slots = envs.get_global_env("sparse_slots", None, namespace)
        dense_slots = envs.get_global_env("dense_slots", None, namespace)

        threads = int(envs.get_runtime_environ("train.trainer.threads"))
        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_class = envs.get_global_env("class", None, namespace)
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')

        if sparse_slots is None and dense_slots is None:
            pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state, self._config_yaml)
        else:
            padding = envs.get_global_env("padding", 0, namespace)
            pipe_cmd = "python {} {} {} {} {} {} {} {}".format(
                reader, "slot", "slot", self._config_yaml, namespace, \
                sparse_slots.replace(" ", "#"), dense_slots.replace(" ", "#"), str(padding))

        if train_data_path.startswith("paddlerec::"):
            package_base = envs.get_runtime_environ("PACKAGE_BASE")
            assert package_base is not None
            train_data_path = os.path.join(package_base, train_data_path.split("::")[1])

        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(inputs)
...@@ -121,11 +144,11 @@ class TranspileTrainer(Trainer):
        debug_mode = envs.get_global_env("reader_debug_mode", False, namespace)
        if debug_mode:
            print("--- Dataset Debug Mode Begin , show pre 10 data of {}---".format(file_list[0]))
            os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd))
            print("--- Dataset Debug Mode End , show pre 10 data of {}---".format(file_list[0]))
            exit(0)

        return dataset
...@@ -147,30 +170,29 @@ class TranspileTrainer(Trainer):
            if not need_save(epoch_id, save_interval, False):
                return

            feed_varnames = envs.get_global_env("save.inference.feed_varnames", None, namespace)
            fetch_varnames = envs.get_global_env("save.inference.fetch_varnames", None, namespace)
            if feed_varnames is None or fetch_varnames is None:
                return

            fetch_vars = [
                fluid.default_main_program().global_block().vars[varname]
                for varname in fetch_varnames
            ]
            dirname = envs.get_global_env("save.inference.dirname", None, namespace)

            assert dirname is not None
            dirname = os.path.join(dirname, str(epoch_id))

            if is_fleet:
                fleet.save_inference_model(self._exe, dirname, feed_varnames, fetch_vars)
            else:
                fluid.io.save_inference_model(dirname, feed_varnames, fetch_vars, self._exe)
            self.inference_models.append((epoch_id, dirname))

        def save_persistables():
...@@ -180,8 +202,8 @@ class TranspileTrainer(Trainer):
            if not need_save(epoch_id, save_interval, False):
                return

            dirname = envs.get_global_env("save.increment.dirname", None, namespace)

            assert dirname is not None
            dirname = os.path.join(dirname, str(epoch_id))
...@@ -259,10 +281,9 @@ class TranspileTrainer(Trainer):
        batch_id = 0
        try:
            while True:
                metrics_rets = self._exe.run(program=program,
                                             fetch_list=metrics_varnames,
                                             return_numpy=is_return_numpy)
                metrics = [epoch, batch_id]
                metrics.extend(metrics_rets)
......
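The slot branch above builds the dataset pipe command by packing the slot lists into single shell arguments. Below is a hedged sketch of that string assembly; the paths in the usage comment are placeholders.

```python
def build_slot_pipe_cmd(reader_script, config_yaml, namespace,
                        sparse_slots, dense_slots, padding=0):
    """Assemble the dataset pipe command used in the slot branch (sketch).

    Spaces inside the slot lists are replaced with '#' so each list stays a
    single shell argument; dataset_instance.py reverses the substitution.
    """
    return "python {} {} {} {} {} {} {} {}".format(
        reader_script, "slot", "slot", config_yaml, namespace,
        sparse_slots.replace(" ", "#"), dense_slots.replace(" ", "#"),
        str(padding))

# Example with placeholder paths:
# build_slot_pipe_cmd("core/utils/dataset_instance.py", "config.yaml",
#                     "train.reader", "label S1 S2", "D1:1 D2:1")
```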
...@@ -18,6 +18,7 @@ import os
from paddlerec.core.utils.envs import lazy_instance_by_fliename
from paddlerec.core.utils.envs import get_global_env
from paddlerec.core.utils.envs import get_runtime_environ
from paddlerec.core.reader import SlotReader


def dataloader(readerclass, train, yaml_file):
...@@ -62,3 +63,49 @@ def dataloader(readerclass, train, yaml_file):
    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader


def slotdataloader(readerclass, train, yaml_file):
    if train == "TRAIN":
        reader_name = "SlotReader"
        namespace = "train.reader"
        data_path = get_global_env("train_data_path", None, namespace)
    else:
        reader_name = "SlotReader"
        namespace = "evaluate.reader"
        data_path = get_global_env("test_data_path", None, namespace)

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]

    sparse = get_global_env("sparse_slots", None, namespace)
    dense = get_global_env("dense_slots", None, namespace)
    padding = get_global_env("padding", 0, namespace)
    reader = SlotReader(yaml_file)
    reader.init(sparse, dense, int(padding))

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    iter = reader.generate_sample(line)
                    for parsed_line in iter():
                        if parsed_line is None:
                            continue
                        else:
                            values = []
                            for pased in parsed_line:
                                values.append(pased[1])
                            yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader
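A minimal usage sketch of `slotdataloader`, assuming a yaml whose reader section defines `train_data_path`, `sparse_slots` and `dense_slots`; the yaml path is a placeholder and the iteration details may differ depending on the reader.

```python
from paddlerec.core.utils.dataloader_instance import slotdataloader

gen = slotdataloader("", "TRAIN", "config.yaml")
# Depending on the reader, the return value is either a generator function
# (the per-line path above) or an already-built batch generator.
samples = gen() if callable(gen) else gen
for i, sample in enumerate(samples):
    print(sample)   # slot values in the order declared in the yaml
    if i >= 2:
        break
```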
...@@ -24,7 +24,7 @@ from paddlerec.core.utils import util as util


class DatasetHolder(object):
    """
    Dataset Holder
    """
    __metaclass__ = abc.ABCMeta
...@@ -74,11 +74,17 @@ class TimeSplitDatasetHolder(DatasetHolder):
        Dataset.__init__(self, config)
        if 'data_donefile' not in config or config['data_donefile'] is None:
            config['data_donefile'] = config['data_path'] + "/to.hadoop.done"
        self._path_generator = util.PathGenerator({'templates': [
            {'name': 'data_path', 'template': config['data_path']},
            {'name': 'donefile_path', 'template': config['data_donefile']}
        ]})
        self._split_interval = config['split_interval']  # data split N mins per dir
        self._data_file_handler = fs.FileHandler(config)

    def _format_data_time(self, daytime_str, time_window_mins):
...@@ -91,7 +97,8 @@ class TimeSplitDatasetHolder(DatasetHolder):
            return None, 0
        if mins_of_day % self._split_interval != 0:
            skip_mins = self._split_interval - (mins_of_day % self._split_interval)
            data_time = data_time + datetime.timedelta(minutes=skip_mins)
            time_window_mins = time_window_mins - skip_mins
        return data_time, time_window_mins
...@@ -106,17 +113,24 @@ class TimeSplitDatasetHolder(DatasetHolder):
            True/False
        """
        is_ready = True
        data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins)
        while time_window_mins > 0:
            file_path = self._path_generator.generate_path('donefile_path', {'time_format': data_time})
            if not self._data_file_handler.is_exist(file_path):
                is_ready = False
                break
            time_window_mins = time_window_mins - self._split_interval
            data_time = data_time + datetime.timedelta(minutes=self._split_interval)
        return is_ready

    def get_file_list(self, daytime_str, time_window_mins, node_num=1, node_idx=0):
        """
        data in [daytime_str, daytime_str + time_window_mins], random shard to node_num, return shard[node_idx]
        Args:
...@@ -128,26 +142,32 @@ class TimeSplitDatasetHolder(DatasetHolder):
            list, data_shard[node_idx]
        """
        data_file_list = []
        data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins)
        while time_window_mins > 0:
            file_path = self._path_generator.generate_path('data_path', {'time_format': data_time})
            sub_file_list = self._data_file_handler.ls(file_path)
            for sub_file in sub_file_list:
                sub_file_name = self._data_file_handler.get_file_name(sub_file)
                if not sub_file_name.startswith(self._config['filename_prefix']):
                    continue
                if hash(sub_file_name) % node_num == node_idx:
                    data_file_list.append(sub_file)
            time_window_mins = time_window_mins - self._split_interval
            data_time = data_time + datetime.timedelta(minutes=self._split_interval)
        return data_file_list

    def _alloc_dataset(self, file_list):
        """ """
        dataset = fluid.DatasetFactory().create_dataset(self._config['dataset_type'])
        dataset.set_batch_size(self._config['batch_size'])
        dataset.set_thread(self._config['load_thread'])
        dataset.set_hdfs_config(self._config['fs_name'], self._config['fs_ugi'])
        dataset.set_pipe_command(self._config['data_converter'])
        dataset.set_filelist(file_list)
        dataset.set_use_var(self._config['data_vars'])
...@@ -163,7 +183,9 @@ class TimeSplitDatasetHolder(DatasetHolder):
            while self.check_ready(begin_time, windown_min) == False:
                print("dataset not ready, time:" + begin_time)
                time.sleep(30)
            file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx'])
            self._datasets[begin_time] = self._alloc_dataset(file_list)
            self._datasets[begin_time].load_into_memory()
        else:
...@@ -176,9 +198,12 @@ class TimeSplitDatasetHolder(DatasetHolder):
        windown_min = params['time_window_min']
        if begin_time not in self._datasets:
            if self.check_ready(begin_time, windown_min):
                file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx'])
                self._datasets[begin_time] = self._alloc_dataset(file_list)
                self._datasets[begin_time].preload_into_memory(self._config['preload_thread'])
                return True
        return False
......
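The holder repeatedly aligns a start time to the next split boundary before walking the time window. Below is a standalone sketch of that alignment arithmetic; the `mins_of_day` computation is an assumption, since the hunk above starts mid-function.

```python
import datetime

def align_to_split(data_time, time_window_mins, split_interval):
    """Round a start time up to the next split boundary (a sketch of the
    adjustment step in _format_data_time)."""
    mins_of_day = data_time.hour * 60 + data_time.minute   # assumed definition
    if mins_of_day % split_interval != 0:
        skip_mins = split_interval - (mins_of_day % split_interval)
        data_time = data_time + datetime.timedelta(minutes=skip_mins)
        time_window_mins = time_window_mins - skip_mins
    return data_time, time_window_mins

# A 08:07 start with a 10-minute split becomes 08:10, and the window shrinks
# by the 3 skipped minutes.
print(align_to_split(datetime.datetime(2020, 5, 1, 8, 7), 60, 10))
```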
...@@ -16,19 +16,34 @@ from __future__ import print_function
import sys

from paddlerec.core.utils.envs import lazy_instance_by_fliename
from paddlerec.core.reader import SlotReader

if len(sys.argv) < 4:
    raise ValueError(
        "reader only accept 3 argument: 1. reader_class 2.train/evaluate/slotreader 3.yaml_abs_path"
    )

reader_package = sys.argv[1]

if sys.argv[2].upper() == "TRAIN":
    reader_name = "TrainReader"
elif sys.argv[2].upper() == "EVALUATE":
    reader_name = "EvaluateReader"
else:
    reader_name = "SlotReader"
    namespace = sys.argv[4]
    sparse_slots = sys.argv[5].replace("#", " ")
    dense_slots = sys.argv[6].replace("#", " ")
    padding = int(sys.argv[7])

yaml_abs_path = sys.argv[3]

if reader_name != "SlotReader":
    reader_class = lazy_instance_by_fliename(reader_package, reader_name)
    reader = reader_class(yaml_abs_path)
    reader.init()
    reader.run_from_stdin()
else:
    reader = SlotReader(yaml_abs_path)
    reader.init(sparse_slots, dense_slots, padding)
    reader.run_from_stdin()
...@@ -95,7 +95,7 @@ def path_adapter(path):
        l_p = path.split("paddlerec.")[1].replace(".", "/")
        return os.path.join(package, l_p)
    else:
        return path


def windows_path_converter(path):
...@@ -159,8 +159,8 @@ def pretty_print_envs(envs, header=None):

def lazy_instance_by_package(package, class_name):
    models = get_global_env("train.model.models")
    model_package = __import__(package, globals(), locals(), package.split("."))
    instance = getattr(model_package, class_name)
    return instance

...@@ -170,8 +170,8 @@ def lazy_instance_by_fliename(abs, class_name):
    sys.path.append(dirname)
    package = os.path.splitext(os.path.basename(abs))[0]
    model_package = __import__(package, globals(), locals(), package.split("."))
    instance = getattr(model_package, class_name)
    return instance

...@@ -189,8 +189,7 @@ def get_platform():

def find_free_port():
    def __free_port():
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(('', 0))
            return s.getsockname()[1]
......
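For readers unfamiliar with `lazy_instance_by_fliename`, this is a simplified, self-contained sketch of the same dynamic-import idea shown above; `lazy_instance_by_filename_sketch` is an illustrative name, not the PaddleRec API.

```python
import os
import sys

def lazy_instance_by_filename_sketch(abs_path, class_name):
    """Simplified version of the helper above: put the file's directory on
    sys.path, import the module by its base name, and return the requested
    class object (not an instance)."""
    dirname = os.path.dirname(abs_path)
    sys.path.append(dirname)
    package = os.path.splitext(os.path.basename(abs_path))[0]
    module = __import__(package, globals(), locals(), package.split("."))
    return getattr(module, class_name)

# e.g. TrainReader = lazy_instance_by_filename_sketch("/path/to/my_reader.py",
#                                                     "TrainReader")
```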
...@@ -18,7 +18,7 @@ from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient


def is_afs_path(path):
    """is_afs_path
    """
    if path.startswith("afs") or path.startswith("hdfs"):
        return True
...@@ -133,8 +133,9 @@ class FileHandler(object):
            if mode.find('a') >= 0:
                org_content = self._hdfs_client.cat(dest_path)
                content = content + org_content
            self._local_fs_client.write(content, temp_local_file, mode)  # fleet hdfs_client only support upload, so write tmp file
            self._hdfs_client.delete(dest_path + ".tmp")
            self._hdfs_client.upload(dest_path + ".tmp", temp_local_file)
            self._hdfs_client.delete(dest_path + ".bak")
...@@ -158,7 +159,8 @@ class FileHandler(object):
        files = []
        if is_afs_path(path):
            files = self._hdfs_client.ls(path)
            files = [path + '/' + self.get_file_name(fi) for fi in files]  # absulte path
        else:
            files = self._local_fs_client.ls(path)
            files = [path + '/' + fi for fi in files]  # absulte path
......
...@@ -22,6 +22,7 @@ from paddlerec.core.utils import fs as fs


def save_program_proto(path, program=None):
    if program is None:
        _program = fluid.default_main_program()
    else:
...@@ -175,7 +176,8 @@ class PathGenerator(object):
        """
        if template_name in self._templates:
            if 'time_format' in param:
                str = param['time_format'].strftime(self._templates[template_name])
                return str.format(**param)
            return self._templates[template_name].format(**param)
        else:
...@@ -198,31 +200,39 @@ class TimeTrainPass(object):
            self._begin_day = make_datetime(day_fields[0].strip())
            if len(day_fields) == 1 or len(day_fields[1]) == 0:
                # 100 years, meaning to continuous running
                self._end_day = self._begin_day + datetime.timedelta(days=36500)
            else:
                # example: 2020212+10
                run_day = int(day_fields[1].strip())
                self._end_day = self._begin_day + datetime.timedelta(days=run_day)
        else:
            # example: {20191001..20191031}
            days = os.popen("echo -n " + self._config['days']).read().split(" ")
            self._begin_day = make_datetime(days[0])
            self._end_day = make_datetime(days[len(days) - 1])

        self._checkpoint_interval = self._config['checkpoint_interval']
        self._dump_inference_interval = self._config['dump_inference_interval']
        self._interval_per_pass = self._config['train_time_interval']  # train N min data per pass

        self._pass_id = 0
        self._inference_pass_id = 0
        self._pass_donefile_handler = None
        if 'pass_donefile_name' in self._config:
            self._train_pass_donefile = global_config['output_path'] + '/' + self._config['pass_donefile_name']
            if fs.is_afs_path(self._train_pass_donefile):
                self._pass_donefile_handler = fs.FileHandler(global_config['io']['afs'])
            else:
                self._pass_donefile_handler = fs.FileHandler(global_config['io']['local_fs'])

            last_done = self._pass_donefile_handler.cat(self._train_pass_donefile).strip().split('\n')[-1]
            done_fileds = last_done.split('\t')
            if len(done_fileds) > 4:
                self._base_key = done_fileds[1]
...@@ -236,15 +246,18 @@ class TimeTrainPass(object):
        """
        return 24 * 60 / self._interval_per_pass

    def save_train_progress(self, day, pass_id, base_key, model_path, is_checkpoint):
        """R
        """
        if is_checkpoint:
            self._checkpoint_pass_id = pass_id
            self._checkpoint_model_path = model_path
        done_content = "%s\t%s\t%s\t%s\t%d\n" % (day, base_key, self._checkpoint_model_path,
                                                 self._checkpoint_pass_id, pass_id)
        self._pass_donefile_handler.write(done_content, self._train_pass_donefile, 'a')
        pass

    def init_pass_by_id(self, date_str, pass_id):
...@@ -286,12 +299,14 @@ class TimeTrainPass(object):
        if self._pass_id < 1:
            self.init_pass_by_time(self._begin_day.strftime("%Y%m%d%H%M"))
        else:
            next_time = self._current_train_time + datetime.timedelta(minutes=self._interval_per_pass)
            if (next_time - self._end_day).total_seconds() > 0:
                has_next = False
            else:
                self.init_pass_by_time(next_time.strftime("%Y%m%d%H%M"))
        if has_next and (self._inference_pass_id < self._pass_id or self._pass_id < old_pass_id):
            self._inference_pass_id = self._pass_id - 1
        return has_next
...@@ -319,9 +334,11 @@ class TimeTrainPass(object):
        Return:
            date(current_train_time + delta_day)
        """
        return (self._current_train_time + datetime.timedelta(days=delta_day)).strftime("%Y%m%d")

    def timestamp(self, delta_day=0):
        """R
        """
        return (self._current_train_time + datetime.timedelta(days=delta_day)).timestamp()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Contributing Code to PaddleRec
> Placeholder
\ No newline at end of file
# PaddleRec Recommended Dataset Format

When your dataset follows the [slot:feasign]* pattern, or can be preprocessed into it, you can use PaddleRec's built-in Reader directly.

The benefit is that you no longer need to write your own Reader, and every model can share the same unified data format.

## Data Format

Suppose your raw data looks like

```bash
<label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>
```

where ```<label>``` indicates whether the ad was clicked: 1 for clicked, 0 for not clicked. ```<integer feature>``` are the numerical (dense) features, 13 in total, each with a single value.
```<categorical feature>``` are the categorical (sparse) features, 26 in total. Adjacent features are separated by ```\t```.

Assume the 13 dense slots are named

```
D1 D2 D3 D4 D5 D6 D7 D8 D9 D10 D11 D12 D13
```

and the 26 sparse slots are named

```
S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 S11 S12 S13 S14 S15 S16 S17 S18 S19 S20 S21 S22 S23 S24 S25 S26
```

Then the following sample (1 label + 13 dense values + 26 feasigns)

```
1 0.1 0.4 0.2 0.3 0.5 0.8 0.3 0.2 0.1 0.5 0.6 0.3 0.9 60 16 91 50 52 52 28 69 63 33 87 69 48 59 27 12 95 36 37 41 17 3 86 19 88 60
```

can be converted into:

```
label:1 D1:0.1 D2:0.4 D3:0.2 D4:0.3 D5:0.5 D6:0.8 D7:0.3 D8:0.2 D9:0.1 D10:0.5 D11:0.6 D12:0.3 D13:0.9 S1:60 S2:16 S3:91 S4:50 S5:52 S6:52 S7:28 S8:69 S9:63 S10:33 S11:87 S12:69 S13:48 S14:59 S15:27 S16:12 S17:95 S18:36 S19:37 S20:41 S21:17 S22:3 S23:86 S24:19 S25:88 S26:60
```

Note: the order of the slot:feasign fields does not matter, e.g. ```D1:0.1 D2:0.4``` and ```D2:0.4 D1:0.1``` are equivalent.

## Configuration

The reader section needs ```sparse_slots``` and ```dense_slots```, for example:

```
workspace: xxxx

reader:
  batch_size: 2
  train_data_path: "{workspace}/data/train_data"
  sparse_slots: "label S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 S11 S12 S13 S14 S15 S16 S17 S18 S19 S20 S21 S22 S23 S24 S25 S26"
  dense_slots: "D1:1 D2:1 D3:1 D4:1 D5:1 D6:1 D7:1 D8:1 D9:1 D10:1 D11:1 D12:1 D13:1"

model:
  xxxxx
```

```sparse_slots``` lists the sparse slots, separated by spaces.
```dense_slots``` lists the dense slots, separated by spaces. Each field has the form ```[dense_slot_name]:[dim1,dim2,dim3...]```, where ```dim1,dim2,dim3...``` is the shape.

Once configured, the variables for these slots can be accessed in the model through:

```
self._sparse_data_var
self._dense_data_var
```
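As a concrete illustration of the conversion described above, here is a short sketch that turns one raw sample into the slot:feasign form; the slot names follow the example lists and are only illustrative.

```python
# Minimal sketch: one raw sample (label, 13 dense values, 26 sparse feasigns)
# becomes a slot:feasign line. D1..D13 / S1..S26 match the example above.
DENSE_NAMES = ["D{}".format(i) for i in range(1, 14)]
SPARSE_NAMES = ["S{}".format(i) for i in range(1, 27)]

def to_slot_format(raw_line):
    fields = raw_line.split()
    label, dense, sparse = fields[0], fields[1:14], fields[14:40]
    out = ["label:{}".format(label)]
    out += ["{}:{}".format(n, v) for n, v in zip(DENSE_NAMES, dense)]
    out += ["{}:{}".format(n, v) for n, v in zip(SPARSE_NAMES, sparse)]
    return " ".join(out)

# to_slot_format("1 0.1 0.4 ... 88 60") -> "label:1 D1:0.1 ... S26:60"
```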
# PaddleRec Custom Datasets and Readers
To use a custom dataset and configure the asynchronous Reader, pay attention to the following steps:
......
...@@ -279,4 +279,4 @@ class Metric(object):
        pass
```
The computation and output of global metrics require subclassing Metric and implementing the four member functions above. For a concrete example, see [auc_metric.py](../core/metrics/auc_metrics.py)
\ No newline at end of file
...@@ -7,5 +7,3 @@
### Distributed Training on a K8S Cluster
> Placeholder
# FAQ
> Placeholder
\ No newline at end of file
# PaddleRec Single-Machine Training
> Placeholder
\ No newline at end of file
...@@ -12,4 +12,3 @@
| Multi-task | [ESMM]() | ✓ | x | ✓ | x | ✓ | ✓ |
| Matching | [DSSM]() | ✓ | x | ✓ | x | ✓ | ✓ |
| Matching | [Multiview-Simnet]() | ✓ | x | ✓ | x | ✓ | ✓ |
# PaddleRec Model Tuning
> Placeholder
\ No newline at end of file
# PaddleRec Offline Inference
\ No newline at end of file
...@@ -5,4 +5,3 @@
## [Parameter Server Training](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -37,4 +37,3 @@ train:
      dirname: "inference"
      epoch_interval: 100
      save_last: True
...@@ -31,7 +31,8 @@ class Model(ModelBase):
    def train_net(self):
        """ network definition """
        data = fluid.data(name="input", shape=[None, self.max_len], dtype='int64')
        label = fluid.data(name="label", shape=[None, 1], dtype='int64')
        seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
...@@ -51,7 +52,9 @@ class Model(ModelBase):
        # full connect layer
        fc_1 = fluid.layers.fc(input=[conv], size=self.hid_dim)
        # softmax layer
        prediction = fluid.layers.fc(input=[fc_1], size=self.class_dim, act="softmax")
        cost = fluid.layers.cross_entropy(input=prediction, label=label)
        avg_cost = fluid.layers.mean(x=cost)
        acc = fluid.layers.accuracy(input=prediction, label=label)
......
...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys

from paddlerec.core.reader import Reader
...@@ -38,7 +37,8 @@ class TrainReader(Reader):
            data = [int(i) for i in data]
            label = [int(i) for i in label]
            seq_len = [int(i) for i in seq_len]
            print >> sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)])
            yield [('data', data), ('label', label), ('seq_len', seq_len)]

        return data_iter
...@@ -71,13 +71,13 @@ python text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test
### Training
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
```

### Prediction
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
```

## Benchmark Comparison
...@@ -87,19 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
| :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: |
| ag news dataset | TagSpace | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- |
## Distributed

### Training Throughput (samples/s)

| Dataset | Model | Single Machine | Sync (4 nodes) | Sync (8 nodes) | Sync (16 nodes) | Sync (32 nodes) |
| :------------------: | :--------------------: | :---------: |:---------: |:---------: |:---------: |:---------: |
| -- | TagSpace | -- | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- | -- |

----

| Dataset | Model | Single Machine | Async (4 nodes) | Async (8 nodes) | Async (16 nodes) | Async (32 nodes) |
| :------------------: | :--------------------: | :---------: |:---------: |:---------: |:---------: |:---------: |
| -- | TagSpace | -- | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- | -- |
...@@ -47,4 +47,3 @@ train:
      dirname: "inference"
      epoch_interval: 100
      save_last: True
...@@ -26,8 +26,10 @@ class Model(ModelBase):
        ModelBase.__init__(self, config)
        self.cost = None
        self.metrics = {}
        self.vocab_text_size = envs.get_global_env("vocab_text_size", None, self._namespace)
        self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, self._namespace)
        self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace)
        self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace)
        self.win_size = envs.get_global_env("win_size", None, self._namespace)
...@@ -35,8 +37,9 @@ class Model(ModelBase):
        self.neg_size = envs.get_global_env("neg_size", None, self._namespace)

    def train_net(self):
        """ network"""
        text = fluid.data(name="text", shape=[None, 1], lod_level=1, dtype='int64')
        pos_tag = fluid.data(
            name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64')
        neg_tag = fluid.data(
...@@ -45,13 +48,19 @@ class Model(ModelBase):
        self._data_var = [text, pos_tag, neg_tag]

        text_emb = fluid.embedding(
            input=text, size=[self.vocab_text_size, self.emb_dim], param_attr="text_emb")
        text_emb = fluid.layers.squeeze(input=text_emb, axes=[1])
        pos_tag_emb = fluid.embedding(
            input=pos_tag, size=[self.vocab_tag_size, self.emb_dim], param_attr="tag_emb")
        pos_tag_emb = fluid.layers.squeeze(input=pos_tag_emb, axes=[1])
        neg_tag_emb = fluid.embedding(
            input=neg_tag, size=[self.vocab_tag_size, self.emb_dim], param_attr="tag_emb")
        neg_tag_emb = fluid.layers.squeeze(input=neg_tag_emb, axes=[1])

        conv_1d = fluid.nets.sequence_conv_pool(
...@@ -65,7 +74,8 @@ class Model(ModelBase):
            size=self.emb_dim,
            param_attr="text_hid")
        cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
        mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb)
        mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid)
        cos_neg_all = fluid.layers.sequence_reshape(
            input=mul_cos_neg, new_dim=self.neg_size)
...@@ -74,7 +84,10 @@ class Model(ModelBase):
        #calculate hinge loss
        loss_part1 = nn.elementwise_sub(
            tensor.fill_constant_batch_size_like(
                input=cos_pos, shape=[-1, 1], value=self.margin, dtype='float32'),
            cos_pos)
        loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
        loss_part3 = nn.elementwise_max(
...@@ -85,7 +98,7 @@ class Model(ModelBase):
        less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
        correct = nn.reduce_sum(less)
        self.cost = avg_cost
        self.metrics["correct"] = correct
        self.metrics["cos_pos"] = cos_pos
...@@ -96,7 +109,8 @@ class Model(ModelBase):
        return self.metrics

    def optimizer(self):
        learning_rate = envs.get_global_env("hyper_parameters.base_lr", None, self._namespace)
        sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
        return sgd_optimizer
......
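The hinge loss assembled above can be written compactly in numpy. This is a sketch under the assumption that `loss_part3` clamps at zero and that `cos_neg` is the per-sample negative score; the margin value is illustrative.

```python
import numpy as np

def tagspace_hinge_loss(cos_pos, cos_neg, margin=0.1):
    """Numpy sketch of the loss built above:
    loss = mean(max(0, margin - cos_pos + cos_neg)), with `correct` counting
    samples where the positive tag scores higher than the negative one.
    cos_pos / cos_neg are (batch, 1) arrays; margin=0.1 is an assumed value."""
    loss = np.maximum(0.0, margin - cos_pos + cos_neg)
    correct = int(np.sum(cos_neg < cos_pos))
    return loss.mean(), correct
```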
...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys

import numpy as np
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -23,13 +23,26 @@ class Model(ModelBase):
        ModelBase.__init__(self, config)

    def input(self):
        TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace)
        Neg = envs.get_global_env("hyper_parameters.NEG", None, self._namespace)

        self.query = fluid.data(
            name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
        self.doc_pos = fluid.data(
            name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
        self.doc_negs = [
            fluid.data(
                name="doc_neg_" + str(i),
                shape=[-1, TRIGRAM_D],
                dtype="float32",
                lod_level=0) for i in range(Neg)
        ]
        self._data_var.append(self.query)
        self._data_var.append(self.doc_pos)
        for input in self.doc_negs:
...@@ -37,16 +50,24 @@ class Model(ModelBase):
        if self._platform != "LINUX":
            self._data_loader = fluid.io.DataLoader.from_generator(
                feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)

    def net(self, is_infer=False):
        hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
        hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None, self._namespace)

        def fc(data, hidden_layers, hidden_acts, names):
            fc_inputs = [data]
            for i in range(len(hidden_layers)):
                xavier = fluid.initializer.Xavier(
                    uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i])
                out = fluid.layers.fc(input=fc_inputs[-1],
                                      size=hidden_layers[i],
                                      act=hidden_acts[i],
...@@ -56,8 +77,10 @@ class Model(ModelBase):
                fc_inputs.append(out)
            return fc_inputs[-1]

        query_fc = fc(self.query, hidden_layers, hidden_acts,
                      ['query_l1', 'query_l2', 'query_l3'])
        doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts,
                        ['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
        self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)

        if is_infer:
...@@ -65,13 +88,17 @@ class Model(ModelBase):
        R_Q_D_ns = []
        for i, doc_neg in enumerate(self.doc_negs):
            doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, [
                'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
                'doc_neg_l3_' + str(i)
            ])
            R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
        concat_Rs = fluid.layers.concat(input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
        prob = fluid.layers.softmax(concat_Rs, axis=1)

        hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
        loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
        self.avg_cost = fluid.layers.mean(x=loss)
...@@ -91,18 +118,28 @@ class Model(ModelBase):
        self.metrics()

    def optimizer(self):
        learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
        optimizer = fluid.optimizer.SGD(learning_rate)
        return optimizer

    def infer_input(self):
        TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace)
        self.query = fluid.data(
            name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
        self.doc_pos = fluid.data(
            name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
        self._infer_data_var = [self.query, self.doc_pos]

        self._infer_data_loader = fluid.io.DataLoader.from_generator(
            feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)

    def infer_net(self):
        self.infer_input()
......
...@@ -22,4 +22,3 @@ mkdir -p data/train
mkdir -p data/test

python generate_synthetic_data.py
...@@ -18,8 +18,10 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
    def init(self):
        self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model")
        self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model")
        self.all_slots = []
        for i in range(self.query_slots):
...
...@@ -21,7 +21,11 @@ class Dataset:
class SyntheticDataset(Dataset):
    def __init__(self,
                 sparse_feature_dim,
                 query_slot_num,
                 title_slot_num,
                 dataset_size=10000):
        # ids are randomly generated
        self.ids_per_slot = 10
        self.sparse_feature_dim = sparse_feature_dim
...@@ -46,14 +50,20 @@ class SyntheticDataset(Dataset):
            for i in range(self.title_slot_num):
                pt_slot = generate_ids(self.ids_per_slot,
                                       self.sparse_feature_dim)
                pt_slot = [
                    str(fea) + ':' + str(i + self.query_slot_num)
                    for fea in pt_slot
                ]
                pos_title_slots += pt_slot
            if is_train:
                for i in range(self.title_slot_num):
                    nt_slot = generate_ids(self.ids_per_slot,
                                           self.sparse_feature_dim)
                    nt_slot = [
                        str(fea) + ':' +
                        str(i + self.query_slot_num + self.title_slot_num)
                        for fea in nt_slot
                    ]
                    neg_title_slots += nt_slot
                yield query_slots + pos_title_slots + neg_title_slots
            else:
...@@ -76,7 +86,8 @@ if __name__ == '__main__':
    query_slots = 1
    title_slots = 1
    dataset_size = 10
    dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots,
                               dataset_size)
    train_reader = dataset.train()
    test_reader = dataset.test()
...
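Each generated sample is simply a list of `feature_id:slot_index` tokens, with the query slots first and the title slots after them. A small self-contained sketch of what one sample looks like; the slot counts and ids below are invented for illustration:

```python
import random

def generate_ids(num, dim):
    # stand-in for the generator's helper; the ids are illustrative
    return [random.randint(0, dim - 1) for _ in range(num)]

query_slot_num, title_slot_num = 1, 1
ids_per_slot, sparse_feature_dim = 3, 10

query_slots, pos_title_slots = [], []
for i in range(query_slot_num):
    query_slots += [str(fea) + ':' + str(i)
                    for fea in generate_ids(ids_per_slot, sparse_feature_dim)]
for i in range(title_slot_num):
    pos_title_slots += [str(fea) + ':' + str(i + query_slot_num)
                        for fea in generate_ids(ids_per_slot, sparse_feature_dim)]

print(query_slots + pos_title_slots)  # e.g. ['7:0', '2:0', '9:0', '4:1', '1:1', '8:1']
```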
...@@ -103,12 +103,18 @@ class Model(ModelBase):
    def init_config(self):
        self._fetch_interval = 1
        query_encoder = envs.get_global_env("hyper_parameters.query_encoder", None, self._namespace)
        title_encoder = envs.get_global_env("hyper_parameters.title_encoder", None, self._namespace)
        query_encode_dim = envs.get_global_env("hyper_parameters.query_encode_dim", None, self._namespace)
        title_encode_dim = envs.get_global_env("hyper_parameters.title_encode_dim", None, self._namespace)
        query_slots = envs.get_global_env("hyper_parameters.query_slots", None, self._namespace)
        title_slots = envs.get_global_env("hyper_parameters.title_slots", None, self._namespace)
        factory = SimpleEncoderFactory()
        self.query_encoders = [
            factory.create(query_encoder, query_encode_dim)
...@@ -119,10 +125,13 @@ class Model(ModelBase):
            for i in range(title_slots)
        ]
        self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
        self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim", None, self._namespace)
        self.emb_shape = [self.emb_size, self.emb_dim]
        self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace)
        self.margin = 0.1

    def input(self, is_train=True):
...@@ -133,8 +142,10 @@ class Model(ModelBase):
        ]
        self.pt_slots = [
            fluid.data(
                name="%d" % (i + len(self.query_encoders)),
                shape=[None, 1],
                lod_level=1,
                dtype='int64') for i in range(len(self.title_encoders))
        ]

        if is_train == False:
...@@ -142,9 +153,11 @@ class Model(ModelBase):
        self.nt_slots = [
            fluid.data(
                name="%d" %
                (i + len(self.query_encoders) + len(self.title_encoders)),
                shape=[None, 1],
                lod_level=1,
                dtype='int64') for i in range(len(self.title_encoders))
        ]

        return self.q_slots + self.pt_slots + self.nt_slots
...@@ -153,11 +166,15 @@ class Model(ModelBase):
        res = self.input()
        self._data_var = res
        use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace)

        if self._platform != "LINUX" or use_dataloader:
            self._data_loader = fluid.io.DataLoader.from_generator(
                feed_list=self._data_var,
                capacity=256,
                use_double_buffer=False,
                iterable=False)

    def get_acc(self, x, y):
        less = tensor.cast(cf.less_than(x, y), dtype='float32')
...@@ -190,10 +207,12 @@ class Model(ModelBase):
            self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
        ]
        pt_encodes = [
            self.title_encoders[i].forward(emb)
            for i, emb in enumerate(pt_embs)
        ]
        nt_encodes = [
            self.title_encoders[i].forward(emb)
            for i, emb in enumerate(nt_embs)
        ]

        # concat multi view for query, pos_title, neg_title
...@@ -252,7 +271,8 @@ class Model(ModelBase):
        self.metrics()

    def optimizer(self):
        learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
        return optimizer
...@@ -261,7 +281,10 @@ class Model(ModelBase):
        self._infer_data_var = res
        self._infer_data_loader = fluid.io.DataLoader.from_generator(
            feed_list=self._infer_data_var,
            capacity=64,
            use_double_buffer=False,
            iterable=False)

    def infer_net(self):
        self.infer_input()
...@@ -281,7 +304,8 @@ class Model(ModelBase):
            self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
        ]
        pt_encodes = [
            self.title_encoders[i].forward(emb)
            for i, emb in enumerate(pt_embs)
        ]
        # concat multi view for query, pos_title, neg_title
        q_concat = fluid.layers.concat(q_encodes)
...
...@@ -18,8 +18,10 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
    def init(self):
        self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model")
        self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model")
        self.all_slots = []
        for i in range(self.query_slots):
...
...@@ -37,4 +37,3 @@
python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -20,9 +20,11 @@ from paddlerec.core.reader import Reader
class EvaluateReader(Reader):
    def init(self):
        all_field_id = [
            '101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
            '125', '126', '127', '128', '129', '205', '206', '207', '210',
            '216', '508', '509', '702', '853', '301'
        ]
        self.all_field_id_dict = defaultdict(int)
        for i, field_id in enumerate(all_field_id):
            self.all_field_id_dict[field_id] = [False, i]
...
...@@ -21,9 +21,11 @@ from paddlerec.core.reader import Reader
class TrainReader(Reader):
    def init(self):
        all_field_id = [
            '101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
            '125', '126', '127', '128', '129', '205', '206', '207', '210',
            '216', '508', '509', '702', '853', '301'
        ]
        self.all_field_id_dict = defaultdict(int)
        for i, field_id in enumerate(all_field_id):
            self.all_field_id_dict[field_id] = [False, i]
...
...@@ -28,11 +28,13 @@ class Model(ModelBase):
        init_stddev = 1.0
        scales = 1.0 / np.sqrt(data.shape[1])

        p_attr = fluid.param_attr.ParamAttr(
            name='%s_weight' % tag,
            initializer=fluid.initializer.NormalInitializer(
                loc=0.0, scale=init_stddev * scales))

        b_attr = fluid.ParamAttr(
            name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))

        out = fluid.layers.fc(input=data,
                              size=out_dim,
...@@ -44,7 +46,11 @@ class Model(ModelBase):

    def input_data(self):
        sparse_input_ids = [
            fluid.data(
                name="field_" + str(i),
                shape=[-1, 1],
                dtype="int64",
                lod_level=1) for i in range(0, 23)
        ]
        label_ctr = fluid.data(name="ctr", shape=[-1, 1], dtype="int64")
        label_cvr = fluid.data(name="cvr", shape=[-1, 1], dtype="int64")
...@@ -55,19 +61,23 @@ class Model(ModelBase):

    def net(self, inputs, is_infer=False):
        vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
        embed_size = envs.get_global_env("hyper_parameters.embed_size", None, self._namespace)
        emb = []
        for data in inputs[0:-2]:
            feat_emb = fluid.embedding(
                input=data,
                size=[vocab_size, embed_size],
                param_attr=fluid.ParamAttr(
                    name='dis_emb',
                    learning_rate=5,
                    initializer=fluid.initializer.Xavier(
                        fan_in=embed_size, fan_out=embed_size)),
                is_sparse=True)
            field_emb = fluid.layers.sequence_pool(input=feat_emb, pool_type='sum')
            emb.append(field_emb)
        concat_emb = fluid.layers.concat(emb, axis=1)
...@@ -85,14 +95,20 @@ class Model(ModelBase):
        ctr_clk = inputs[-2]
        ctcvr_buy = inputs[-1]

        ctr_prop_one = fluid.layers.slice(ctr_out, axes=[1], starts=[1], ends=[2])
        cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2])

        ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one)
        ctcvr_prop = fluid.layers.concat(input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)

        auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk)
        auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy)

        if is_infer:
            self._infer_results["AUC_ctr"] = auc_ctr
...@@ -100,7 +116,8 @@ class Model(ModelBase):
            return

        loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk)
        loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
        cost = loss_ctr + loss_ctcvr
        avg_cost = fluid.layers.mean(cost)
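The two heads follow the ESMM factorization: the CVR network output is only ever used through its product with the CTR output, so the supervised targets are pCTR (click over impressions) and pCTCVR = pCTR * pCVR (click-and-convert over impressions). A quick numeric illustration with made-up probabilities:

```python
# Made-up probabilities, only to show the decomposition the graph above implements.
ctr_prop_one = 0.10                               # P(click | impression)
cvr_prop_one = 0.20                               # P(conversion | click)
ctcvr_prop_one = ctr_prop_one * cvr_prop_one      # P(click and conversion | impression)
ctcvr_prop = [1 - ctcvr_prop_one, ctcvr_prop_one]
print(ctcvr_prop)                                 # [0.98, 0.02]
```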
...@@ -117,5 +134,8 @@ class Model(ModelBase):
    def infer_net(self):
        self._infer_data_var = self.input_data()
        self._infer_data_loader = fluid.io.DataLoader.from_generator(
            feed_list=self._infer_data_var,
            capacity=64,
            use_double_buffer=False,
            iterable=False)
        self.net(self._infer_data_var, is_infer=True)
...@@ -19,6 +19,7 @@ from paddlerec.core.reader import Reader
class EvaluateReader(Reader):
    def init(self):
        pass

    def generate_sample(self, line):
...
...@@ -24,6 +24,7 @@ class TrainReader(Reader):
    def generate_sample(self, line):
        """
        Read the data line by line and process it as a dictionary
        """

        def reader():
...
...@@ -23,44 +23,58 @@ class Model(ModelBase):
        ModelBase.__init__(self, config)

    def MMOE(self, is_infer=False):
        feature_size = envs.get_global_env("hyper_parameters.feature_size", None, self._namespace)
        expert_num = envs.get_global_env("hyper_parameters.expert_num", None, self._namespace)
        gate_num = envs.get_global_env("hyper_parameters.gate_num", None, self._namespace)
        expert_size = envs.get_global_env("hyper_parameters.expert_size", None, self._namespace)
        tower_size = envs.get_global_env("hyper_parameters.tower_size", None, self._namespace)

        input_data = fluid.data(
            name="input", shape=[-1, feature_size], dtype="float32")
        label_income = fluid.data(
            name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
        label_marital = fluid.data(
            name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)

        if is_infer:
            self._infer_data_var = [input_data, label_income, label_marital]
            self._infer_data_loader = fluid.io.DataLoader.from_generator(
                feed_list=self._infer_data_var,
                capacity=64,
                use_double_buffer=False,
                iterable=False)

        self._data_var.extend([input_data, label_income, label_marital])
        # f_{i}(x) = activation(W_{i} * x + b), where activation is ReLU according to the paper
        expert_outputs = []
        for i in range(0, expert_num):
            expert_output = fluid.layers.fc(
                input=input_data,
                size=expert_size,
                act='relu',
                bias_attr=fluid.ParamAttr(learning_rate=1.0),
                name='expert_' + str(i))
            expert_outputs.append(expert_output)
        expert_concat = fluid.layers.concat(expert_outputs, axis=1)
        expert_concat = fluid.layers.reshape(expert_concat,
                                             [-1, expert_num, expert_size])

        # g^{k}(x) = activation(W_{gk} * x + b), where activation is softmax according to the paper
        output_layers = []
        for i in range(0, gate_num):
            cur_gate = fluid.layers.fc(
                input=input_data,
                size=expert_num,
                act='softmax',
                bias_attr=fluid.ParamAttr(learning_rate=1.0),
                name='gate_' + str(i))
            # f^{k}(x) = sum_{i=1}^{n}(g^{k}(x)_{i} * f_{i}(x))
            cur_gate_expert = fluid.layers.elementwise_mul(
                expert_concat, cur_gate, axis=0)
            cur_gate_expert = fluid.layers.reduce_sum(cur_gate_expert, dim=1)
            # Build tower layer
            cur_tower = fluid.layers.fc(input=cur_gate_expert,
...@@ -74,25 +88,33 @@ class Model(ModelBase):
            output_layers.append(out)

        pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
        pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)

        label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
        label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])

        auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
            input=pred_income,
            label=fluid.layers.cast(x=label_income_1, dtype='int64'))
        auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
            input=pred_marital,
            label=fluid.layers.cast(x=label_marital_1, dtype='int64'))

        if is_infer:
            self._infer_results["AUC_income"] = auc_income
            self._infer_results["AUC_marital"] = auc_marital
            return

        cost_income = fluid.layers.cross_entropy(
            input=pred_income, label=label_income, soft_label=True)
        cost_marital = fluid.layers.cross_entropy(
            input=pred_marital, label=label_marital, soft_label=True)

        avg_cost_income = fluid.layers.mean(x=cost_income)
        avg_cost_marital = fluid.layers.mean(x=cost_marital)
...
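For reference, the expert/gate combination above reduces to a per-task softmax over experts followed by a weighted sum of the expert outputs. A small NumPy sketch with illustrative dimensions (not the repository code):

```python
import numpy as np

batch, expert_num, expert_size, gate_num = 2, 3, 4, 2
expert_outputs = np.random.rand(batch, expert_num, expert_size)       # f_i(x)

gate_logits = np.random.rand(batch, gate_num, expert_num)
gates = np.exp(gate_logits) / np.exp(gate_logits).sum(axis=-1, keepdims=True)  # g^k(x)

# f^k(x) = sum_i g^k(x)_i * f_i(x): one expert mixture per task
task_inputs = np.einsum('bke,bef->bkf', gates, expert_outputs)
print(task_inputs.shape)  # (2, 2, 4): each task gets its own mixture of the shared experts
```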
...@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
| Census-income Data | Share-Bottom | -- | 0.93120/0.99256 |
| Census-income Data | MMoE | -- | 0.94465/0.99324 |
| Ali-CCP | ESMM | -- | 0.97181/0.49967 |
...@@ -24,27 +24,38 @@ class Model(ModelBase):
    def model(self, is_infer=False):
        feature_size = envs.get_global_env("hyper_parameters.feature_size", None, self._namespace)
        bottom_size = envs.get_global_env("hyper_parameters.bottom_size", None, self._namespace)
        tower_size = envs.get_global_env("hyper_parameters.tower_size", None, self._namespace)
        tower_nums = envs.get_global_env("hyper_parameters.tower_nums", None, self._namespace)

        input_data = fluid.data(
            name="input", shape=[-1, feature_size], dtype="float32")
        label_income = fluid.data(
            name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
        label_marital = fluid.data(
            name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)

        if is_infer:
            self._infer_data_var = [input_data, label_income, label_marital]
            self._infer_data_loader = fluid.io.DataLoader.from_generator(
                feed_list=self._infer_data_var,
                capacity=64,
                use_double_buffer=False,
                iterable=False)

        self._data_var.extend([input_data, label_income, label_marital])

        bottom_output = fluid.layers.fc(
            input=input_data,
            size=bottom_size,
            act='relu',
            bias_attr=fluid.ParamAttr(learning_rate=1.0),
            name='bottom_output')

        # Build tower layer from bottom layer
        output_layers = []
...@@ -59,26 +70,34 @@ class Model(ModelBase):
                                             name='output_layer_' + str(index))
            output_layers.append(output_layer)

        pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
        pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)

        label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
        label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])

        auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
            input=pred_income,
            label=fluid.layers.cast(x=label_income_1, dtype='int64'))
        auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
            input=pred_marital,
            label=fluid.layers.cast(x=label_marital_1, dtype='int64'))

        if is_infer:
            self._infer_results["AUC_income"] = auc_income
            self._infer_results["AUC_marital"] = auc_marital
            return

        cost_income = fluid.layers.cross_entropy(
            input=pred_income, label=label_income, soft_label=True)
        cost_marital = fluid.layers.cross_entropy(
            input=pred_marital, label=label_marital, soft_label=True)
        cost = fluid.layers.elementwise_add(cost_income, cost_marital, axis=1)

        avg_cost = fluid.layers.mean(x=cost)
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -22,9 +22,10 @@ train:
  reader:
    batch_size: 2
    train_data_path: "{workspace}/data/slot_train"
    feat_dict_name: "{workspace}/data/vocab"
    sparse_slots: "label C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26"
    dense_slots: "I1:1 I2:1 I3:1 I4:1 I5:1 I6:1 I7:1 I8:1 I9:1 I10:1 I11:1 I12:1 I13:1"

  model:
    models: "{workspace}/model.py"
...
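With `sparse_slots`/`dense_slots` declared in the config, the built-in reader consumes whitespace-separated `slot:value` tokens instead of going through a custom `criteo_reader.py`. Roughly, one training line in that layout would look like the following; the values are invented and most of the 13 dense and 26 sparse slots are omitted for brevity:

```
label:0 I1:0.05 I2:1.2 I3:0.0 C1:412 C2:77 C3:1093
```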
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import io
...
...@@ -11,21 +11,32 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import os
import sys
import yaml
try:
    import cPickle as pickle
except ImportError:
    import pickle
from collections import Counter
import paddle.fluid.incubate.data_generator as dg
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs


class TrainReader(dg.MultiSlotDataGenerator):
    def __init__(self, config):
        dg.MultiSlotDataGenerator.__init__(self)

        if os.path.isfile(config):
            with open(config, 'r') as rb:
                _config = yaml.load(rb.read(), Loader=yaml.FullLoader)
        else:
            raise ValueError("reader config only support yaml")

    def init(self):
        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.cont_max_ = [
...@@ -48,7 +59,7 @@ class TrainReader(Reader):
        self.cat_feat_idx_dict_list = [{} for _ in range(26)]

        # TODO: set vocabulary dictionary
        vocab_dir = "./vocab/"
        for i in range(26):
            lookup_idx = 1  # remain 0 for default value
            for line in open(
...@@ -72,11 +83,11 @@ class TrainReader(Reader):
                if idx == 2 else math.log(1 + float(features[idx])))
        for idx in self.cat_idx_:
            if features[idx] == '' or features[
                    idx] not in self.cat_feat_idx_dict_list[idx - 14]:
                label_feat_list[idx].append(0)
            else:
                label_feat_list[idx].append(self.cat_feat_idx_dict_list[
                    idx - 14][features[idx]])
        label_feat_list[0].append(int(features[0]))
        return label_feat_list
...@@ -87,6 +98,18 @@ class TrainReader(Reader):
        def data_iter():
            label_feat_list = self._process_line(line)
            s = ""
            for i in list(zip(self.label_feat_names, label_feat_list)):
                k = i[0]
                v = i[1]
                for j in v:
                    s += " " + k + ":" + str(j)
            print(s.strip())
            yield None
        return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
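The generator above no longer yields Python tuples; it prints one `slot:value` line per record to stdout so the output can be redirected straight into the slot-format dataset files. A tiny sketch of that formatting step; the field names and values are made up:

```python
# Field names and values are invented; only the string formatting is shown.
pairs = [('label', [0]), ('I1', [0.5]), ('C1', [17]), ('C2', [3])]
s = ""
for k, v in pairs:
    for j in v:
        s += " " + k + ":" + str(j)
print(s.strip())  # -> label:0 I1:0.5 C1:17 C2:3
```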
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, absolute_import, division
import os
...
python download.py
python preprocess.py
mkdir slot_train
for i in `ls ./train`
do
cat train/$i | python get_slot_data.py > slot_train/$i
done
mkdir slot_test_valid
for i in `ls ./test_valid`
do
cat test_valid/$i | python get_slot_data.py > slot_test_valid/$i
done
...@@ -25,12 +25,23 @@ class Model(ModelBase):
        ModelBase.__init__(self, config)

    def init_network(self):
        self.cross_num = envs.get_global_env("hyper_parameters.cross_num", None, self._namespace)
        self.dnn_hidden_units = envs.get_global_env("hyper_parameters.dnn_hidden_units", None, self._namespace)
        self.l2_reg_cross = envs.get_global_env("hyper_parameters.l2_reg_cross", None, self._namespace)
        self.dnn_use_bn = envs.get_global_env("hyper_parameters.dnn_use_bn", None, self._namespace)
        self.clip_by_norm = envs.get_global_env("hyper_parameters.clip_by_norm", None, self._namespace)
        cat_feat_num = envs.get_global_env("hyper_parameters.cat_feat_num", None, self._namespace)

        self.sparse_inputs = self._sparse_data_var[1:]
        self.dense_inputs = self._dense_data_var
        self.target_input = self._sparse_data_var[0]

        cat_feat_dims_dict = OrderedDict()
        for line in open(cat_feat_num):
            spls = line.strip().split()
...@@ -38,10 +49,11 @@ class Model(ModelBase):
            cat_feat_dims_dict[spls[0]] = int(spls[1])
        self.cat_feat_dims_dict = cat_feat_dims_dict if cat_feat_dims_dict else OrderedDict()
        self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", None, self._namespace)

        self.dense_feat_names = [i.name for i in self.dense_inputs]
        self.sparse_feat_names = [i.name for i in self.sparse_inputs]

        # {feat_name: dims}
        self.feat_dims_dict = OrderedDict(
...@@ -51,21 +63,20 @@ class Model(ModelBase):
        self.net_input = None
        self.loss = None

    def _create_embedding_input(self):
        # sparse embedding
        sparse_emb_dict = OrderedDict()
        for var in self.sparse_inputs:
            sparse_emb_dict[var.name] = fluid.embedding(
                input=var,
                size=[
                    self.feat_dims_dict[var.name] + 1,
                    6 * int(pow(self.feat_dims_dict[var.name], 0.25))
                ],
                is_sparse=self.is_sparse)

        # combine dense and sparse_emb
        dense_input_list = self.dense_inputs
        sparse_emb_list = list(sparse_emb_dict.values())

        sparse_input = fluid.layers.concat(sparse_emb_list, axis=-1)
...@@ -111,17 +122,13 @@ class Model(ModelBase):
        return fluid.layers.reduce_sum(fluid.layers.square(w))

    def train_net(self):
        self.model._init_slots()
        self.init_network()

        self.net_input = self._create_embedding_input()
        deep_out = self._deep_net(self.net_input, self.dnn_hidden_units, self.dnn_use_bn, False)
        cross_out, l2_reg_cross_loss = self._cross_net(self.net_input,
                                                       self.cross_num)
...@@ -130,9 +137,6 @@ class Model(ModelBase):
        logit = fluid.layers.fc(last_out, 1)
        self.prob = fluid.layers.sigmoid(logit)

        # auc
        prob_2d = fluid.layers.concat([1 - self.prob, self.prob], 1)
...@@ -143,7 +147,9 @@ class Model(ModelBase):
        self._metrics["BATCH_AUC"] = batch_auc_var

        # logloss
        logloss = fluid.layers.log_loss(
            self.prob, fluid.layers.cast(self.target_input, dtype='float32'))
        self.avg_logloss = fluid.layers.reduce_mean(logloss)

        # reg_coeff * l2_reg_cross
...@@ -152,9 +158,11 @@ class Model(ModelBase):
        self._cost = self.loss

    def optimizer(self):
        learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
        optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
        return optimizer

    def infer_net(self, parameter_list):
        self.model._init_slots()
        self.deepfm_net()
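The `_cross_net` body is collapsed in this diff; as a reference, a single DCN cross layer computes x_{l+1} = x_0 * (x_l^T w) + b + x_l. A NumPy sketch with illustrative shapes (not the repository implementation):

```python
import numpy as np

def cross_layer(x0, xl, w, b):
    # x0, xl: [batch, dim]; w, b: [dim]
    xl_w = xl @ w                         # [batch]
    return x0 * xl_w[:, None] + b + xl    # [batch, dim]

x0 = np.random.rand(2, 4)
w, b = np.random.rand(4), np.zeros(4)
x1 = cross_layer(x0, x0, w, b)            # first cross layer
x2 = cross_layer(x0, x1, w, b)            # second cross layer
print(x2.shape)                           # (2, 4)
```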
...@@ -22,9 +22,10 @@ train:
  reader:
    batch_size: 2
    train_data_path: "{workspace}/data/slot_train_data"
    feat_dict_name: "{workspace}/data/feat_dict_10.pkl2"
    sparse_slots: "label feat_idx"
    dense_slots: "feat_value:39"

  model:
    models: "{workspace}/model.py"
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import sys
...
...@@ -12,18 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import yaml
try:
    import cPickle as pickle
except ImportError:
    import pickle
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs


class TrainReader(dg.MultiSlotDataGenerator):
    def __init__(self, config):
        dg.MultiSlotDataGenerator.__init__(self)

        if os.path.isfile(config):
            with open(config, 'r') as rb:
                _config = yaml.load(rb.read(), Loader=yaml.FullLoader)
        else:
            raise ValueError("reader config only support yaml")

    def init(self):
        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.cont_max_ = [
...@@ -37,7 +44,7 @@ class TrainReader(Reader):
        self.continuous_range_ = range(1, 14)
        self.categorical_range_ = range(14, 40)
        # load preprocessed feature dict
        self.feat_dict_name = "aid_data/feat_dict_10.pkl2"
        self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))

    def _process_line(self, line):
...@@ -70,6 +77,19 @@ class TrainReader(Reader):
        def data_iter():
            feat_idx, feat_value, label = self._process_line(line)
            s = ""
            for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
                      ('label', label)]:
                k = i[0]
                v = i[1]
                for j in v:
                    s += " " + k + ":" + str(j)
            print(s.strip())
            yield None

        return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy
from collections import Counter
...
python download_preprocess.py
mkdir slot_train_data
for i in `ls ./train_data`
do
cat train_data/$i | python get_slot_data.py > slot_train_data/$i
done
mkdir slot_test_data
for i in `ls ./test_data`
do
cat test_data/$i | python get_slot_data.py > slot_test_data/$i
done
...@@ -27,31 +27,26 @@ class Model(ModelBase):
    def deepfm_net(self):
        init_value_ = 0.1
        is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
        sparse_feature_number = envs.get_global_env(
            "hyper_parameters.sparse_feature_number", None, self._namespace)
        sparse_feature_dim = envs.get_global_env(
            "hyper_parameters.sparse_feature_dim", None, self._namespace)

        # ------------------------- network input --------------------------

        num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace)

        raw_feat_idx = self._sparse_data_var[1]
        raw_feat_value = self._dense_data_var[0]
        self.label = self._sparse_data_var[0]

        feat_idx = raw_feat_idx
        feat_value = fluid.layers.reshape(
            raw_feat_value, [-1, num_field, 1])  # None * num_field * 1

        # ------------------------- first order term --------------------------

        reg = envs.get_global_env("hyper_parameters.reg", 1e-4, self._namespace)
        first_weights_re = fluid.embedding(
            input=feat_idx,
            is_sparse=True,
...@@ -65,7 +60,8 @@ class Model(ModelBase):
                regularizer=fluid.regularizer.L1DecayRegularizer(reg)))
        first_weights = fluid.layers.reshape(
            first_weights_re, shape=[-1, num_field, 1])  # None * num_field * 1
        y_first_order = fluid.layers.reduce_sum((first_weights * feat_value), 1)

        # ------------------------- second order term --------------------------
...@@ -78,7 +74,8 @@ class Model(ModelBase):
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.TruncatedNormalInitializer(
                    loc=0.0,
                    scale=init_value_ / math.sqrt(float(sparse_feature_dim)))))
        feat_embeddings = fluid.layers.reshape(
            feat_embeddings_re,
            shape=[-1, num_field,
...@@ -86,8 +83,8 @@ class Model(ModelBase):
        feat_embeddings = feat_embeddings * feat_value  # None * num_field * embedding_size

        # sum_square part
        summed_features_emb = fluid.layers.reduce_sum(
            feat_embeddings, 1)  # None * embedding_size
        summed_features_emb_square = fluid.layers.square(
            summed_features_emb)  # None * embedding_size
...@@ -98,13 +95,16 @@ class Model(ModelBase):
            squared_features_emb, 1)  # None * embedding_size

        y_second_order = 0.5 * fluid.layers.reduce_sum(
            summed_features_emb_square - squared_sum_features_emb,
            1,
            keep_dim=True)  # None * 1
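The `y_second_order` expression relies on the standard FM identity: 0.5 * ((sum_i e_i)^2 - sum_i e_i^2), summed over the embedding dimension, equals the sum of pairwise inner products of the field embeddings. A quick NumPy check with random embeddings:

```python
import numpy as np

emb = np.random.rand(5, 8)   # num_field x embedding_size for a single sample
trick = 0.5 * np.sum(np.square(emb.sum(axis=0)) - np.square(emb).sum(axis=0))
pairwise = sum(emb[i] @ emb[j] for i in range(5) for j in range(i + 1, 5))
print(np.allclose(trick, pairwise))   # True: both compute the FM second-order interaction
```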
        # ------------------------- DNN --------------------------

        layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
        act = envs.get_global_env("hyper_parameters.act", None, self._namespace)
        y_dnn = fluid.layers.reshape(feat_embeddings,
                                     [-1, num_field * sparse_feature_dim])
        for s in layer_sizes:
...@@ -131,14 +131,17 @@ class Model(ModelBase):

        # ------------------------- DeepFM --------------------------

        self.predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn)

    def train_net(self):
        self.model._init_slots()
        self.deepfm_net()

        # ------------------------- Cost(logloss) --------------------------

        cost = fluid.layers.log_loss(
            input=self.predict, label=fluid.layers.cast(self.label, "float32"))
        avg_cost = fluid.layers.reduce_sum(cost)
        self._cost = avg_cost
...@@ -154,9 +157,11 @@ class Model(ModelBase):
        self._metrics["BATCH_AUC"] = batch_auc_var

    def optimizer(self):
        learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
        optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
        return optimizer

    def infer_net(self, parameter_list):
        self.model._init_slots()
        self.deepfm_net()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import random
import pickle
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import pickle
import pandas as pd
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import random
import pickle
...
...@@ -21,14 +21,14 @@ from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)

    def config_read(self, config_path):
        with open(config_path, "r") as fin:
            user_count = int(fin.readline().strip())
            item_count = int(fin.readline().strip())
            cat_count = int(fin.readline().strip())
        return user_count, item_count, cat_count

    def din_attention(self, hist, target_expand, mask):
        """activation weight"""
...@@ -58,56 +58,66 @@ class Model(ModelBase):
        out = fluid.layers.matmul(weight, hist)
        out = fluid.layers.reshape(x=out, shape=[0, hidden_size])
        return out
    def train_net(self):
        seq_len = -1
        self.item_emb_size = envs.get_global_env(
            "hyper_parameters.item_emb_size", 64, self._namespace)
        self.cat_emb_size = envs.get_global_env(
            "hyper_parameters.cat_emb_size", 64, self._namespace)
        self.act = envs.get_global_env("hyper_parameters.act", "sigmoid",
                                       self._namespace)
        #item_emb_size = 64
        #cat_emb_size = 64
        self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
                                             False, self._namespace)
        #significant for speeding up the training process
        self.config_path = envs.get_global_env(
            "hyper_parameters.config_path", "data/config.txt", self._namespace)
        self.use_DataLoader = envs.get_global_env(
            "hyper_parameters.use_DataLoader", False, self._namespace)

        user_count, item_count, cat_count = self.config_read(self.config_path)

        item_emb_attr = fluid.ParamAttr(name="item_emb")
        cat_emb_attr = fluid.ParamAttr(name="cat_emb")

        hist_item_seq = fluid.data(
            name="hist_item_seq", shape=[None, seq_len], dtype="int64")
        self._data_var.append(hist_item_seq)

        hist_cat_seq = fluid.data(
            name="hist_cat_seq", shape=[None, seq_len], dtype="int64")
        self._data_var.append(hist_cat_seq)

        target_item = fluid.data(
            name="target_item", shape=[None], dtype="int64")
        self._data_var.append(target_item)

        target_cat = fluid.data(name="target_cat", shape=[None], dtype="int64")
        self._data_var.append(target_cat)

        label = fluid.data(name="label", shape=[None, 1], dtype="float32")
        self._data_var.append(label)

        mask = fluid.data(
            name="mask", shape=[None, seq_len, 1], dtype="float32")
        self._data_var.append(mask)

        target_item_seq = fluid.data(
            name="target_item_seq", shape=[None, seq_len], dtype="int64")
        self._data_var.append(target_item_seq)

        target_cat_seq = fluid.data(
            name="target_cat_seq", shape=[None, seq_len], dtype="int64")
        self._data_var.append(target_cat_seq)

        if self.use_DataLoader:
            self._data_loader = fluid.io.DataLoader.from_generator(
                feed_list=self._data_var,
                capacity=10000,
                use_double_buffer=False,
                iterable=False)

        hist_item_emb = fluid.embedding(
            input=hist_item_seq,
            size=[item_count, self.item_emb_size],
...@@ -149,7 +159,8 @@ class Model(ModelBase):
            size=[item_count, 1],
            param_attr=fluid.initializer.Constant(value=0.0))

        hist_seq_concat = fluid.layers.concat(
            [hist_item_emb, hist_cat_emb], axis=2)
        target_seq_concat = fluid.layers.concat(
            [target_item_seq_emb, target_cat_seq_emb], axis=2)
        target_concat = fluid.layers.concat(
...@@ -157,21 +168,22 @@ class Model(ModelBase):
        out = self.din_attention(hist_seq_concat, target_seq_concat, mask)
        out_fc = fluid.layers.fc(name="out_fc",
                                 input=out,
                                 size=self.item_emb_size + self.cat_emb_size,
                                 num_flatten_dims=1)
        embedding_concat = fluid.layers.concat([out_fc, target_concat], axis=1)

        fc1 = fluid.layers.fc(name="fc1",
                              input=embedding_concat,
                              size=80,
                              act=self.act)
        fc2 = fluid.layers.fc(name="fc2", input=fc1, size=40, act=self.act)
        fc3 = fluid.layers.fc(name="fc3", input=fc2, size=1)
        logit = fc3 + item_b

        loss = fluid.layers.sigmoid_cross_entropy_with_logits(
            x=logit, label=label)
        avg_loss = fluid.layers.mean(loss)
        self._cost = avg_loss
...@@ -179,14 +191,14 @@ class Model(ModelBase):
        predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
        label_int = fluid.layers.cast(label, 'int64')
        auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d,
                                                     label=label_int,
                                                     slide_steps=0)
        self._metrics["AUC"] = auc_var
        self._metrics["BATCH_AUC"] = batch_auc_var

    def optimizer(self):
        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
                                            None, self._namespace)
        optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
        return optimizer
...
...@@ -29,13 +29,15 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
    def init(self):
        self.train_data_path = envs.get_global_env("train_data_path", None,
                                                   "train.reader")
        self.res = []
        self.max_len = 0

        data_file_list = os.listdir(self.train_data_path)
        for i in range(0, len(data_file_list)):
            train_data_file = os.path.join(self.train_data_path,
                                           data_file_list[i])
            with open(train_data_file, "r") as fin:
                for line in fin:
                    line = line.strip().split(';')
...@@ -78,11 +80,13 @@ class TrainReader(Reader):
        len_array = [len(x[0]) for x in b]
        mask = np.array(
            [[0] * x + [-1e9] * (max_len - x) for x in len_array]).reshape(
                [-1, max_len, 1])
        target_item_seq = np.array(
            [[x[2]] * max_len for x in b]).astype("int64").reshape(
                [-1, max_len])
        target_cat_seq = np.array(
            [[x[3]] * max_len for x in b]).astype("int64").reshape(
                [-1, max_len])
        res = []
        for i in range(len(b)):
            res.append([
...@@ -127,4 +131,5 @@ class TrainReader(Reader):
    def generate_batch_from_trainfiles(self, files):
        data_set = self.base_read(files)
        random.shuffle(data_set)
        return self.batch_reader(data_set, self.batch_size,
                                 self.batch_size * 20)
...@@ -23,9 +23,10 @@ train:
  reader:
    batch_size: 2
    train_data_path: "{workspace}/data/slot_train_data"
    reader_debug_mode: False
    sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
    dense_slots: "dense_var:13"
  model:
    models: "{workspace}/model.py"
...
wget --no-check-certificate https://fleet.bj.bcebos.com/ctr_data.tar.gz
tar -zxvf ctr_data.tar.gz
mv ./raw_data ./train_data_full
mkdir train_data && cd train_data
cp ../train_data_full/part-0 ../train_data_full/part-1 ./ && cd ..
mv ./test_data ./test_data_full
mkdir test_data && cd test_data
cp ../test_data_full/part-220 ./ && cd ..
echo "Complete data download."
echo "Full Train data stored in ./train_data_full "
echo "Full Test data stored in ./test_data_full "
echo "Rapid Verification train data stored in ./train_data "
echo "Rapid Verification test data stored in ./test_data "
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...@@ -12,20 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.incubate.data_generator as dg

cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
cont_max_ = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
cont_diff_ = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
hash_dim_ = 1000001
continuous_range_ = range(1, 14)
categorical_range_ = range(14, 40)


class CriteoDataset(dg.MultiSlotDataGenerator):
    """
    DacDataset: inherits MultiSlotDataGenerator and implements data reading.
    Help document: http://wiki.baidu.com/pages/viewpage.action?pageId=728820675
    """

    def generate_sample(self, line):
        """
...@@ -37,25 +38,34 @@ class TrainReader(Reader):
        This function needs to be implemented by the user, based on data format
        """
        features = line.rstrip('\n').split('\t')

        dense_feature = []
        sparse_feature = []
        for idx in continuous_range_:
            if features[idx] == "":
                dense_feature.append(0.0)
            else:
                dense_feature.append(
                    (float(features[idx]) - cont_min_[idx - 1]) /
                    cont_diff_[idx - 1])
        for idx in categorical_range_:
            sparse_feature.append(
                [hash(str(idx) + features[idx]) % hash_dim_])

        label = [int(features[0])]
        process_line = dense_feature, sparse_feature, label
        feature_name = ["dense_feature"]
        for idx in categorical_range_:
            feature_name.append("C" + str(idx - 13))
        feature_name.append("label")

        s = "click:" + str(label[0])
        for i in dense_feature:
            s += " dense_feature:" + str(i)
        for i in range(1, 1 + len(categorical_range_)):
            s += " " + str(i) + ":" + str(sparse_feature[i - 1][0])
        print(s.strip())
        yield None

        return reader


d = CriteoDataset()
d.run_from_stdin()
sh download.sh
mkdir slot_train_data_full
for i in `ls ./train_data_full`
do
cat train_data_full/$i | python get_slot_data.py > slot_train_data_full/$i
done
mkdir slot_test_data_full
for i in `ls ./test_data_full`
do
cat test_data_full/$i | python get_slot_data.py > slot_test_data_full/$i
done
mkdir slot_train_data
for i in `ls ./train_data`
do
cat train_data/$i | python get_slot_data.py > slot_train_data/$i
done
mkdir slot_test_data
for i in `ls ./test_data`
do
cat test_data/$i | python get_slot_data.py > slot_test_data/$i
done
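For orientation, here is a minimal sketch (with made-up toy values, not real Criteo records) of the slot-format line that the conversion above produces: one "click" label slot, 13 "dense_feature" slots for the normalized continuous features, and 26 sparse slots keyed "1" through "26" holding hashed categorical ids.

```
# Toy sketch of the slot-format line emitted by get_slot_data.py.
# Values here are illustrative only.
label = 0
dense_feature = [0.05] * 13        # 13 normalized continuous features
sparse_feature = [[123457]] * 26   # 26 hashed categorical ids

s = "click:" + str(label)
for v in dense_feature:
    s += " dense_feature:" + str(v)
for i in range(1, 1 + len(sparse_feature)):
    s += " " + str(i) + ":" + str(sparse_feature[i - 1][0])
print(s.strip())
```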
...@@ -25,48 +25,16 @@ class Model(ModelBase):
        ModelBase.__init__(self, config)

    def input(self):
        self.sparse_inputs = self._sparse_data_var[1:]
        self.dense_input = self._dense_data_var[0]
        self.label_input = self._sparse_data_var[0]

    def net(self):
        is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
        sparse_feature_number = envs.get_global_env(
            "hyper_parameters.sparse_feature_number", None, self._namespace)
        sparse_feature_dim = envs.get_global_env(
            "hyper_parameters.sparse_feature_dim", None, self._namespace)

        def embedding_layer(input):
            emb = fluid.layers.embedding(
...@@ -76,25 +44,27 @@ class Model(ModelBase):
                size=[sparse_feature_number, sparse_feature_dim],
                param_attr=fluid.ParamAttr(
                    name="SparseFeatFactors",
                    initializer=fluid.initializer.Uniform()), )
            emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
            return emb_sum

        def fc(input, output_size):
            output = fluid.layers.fc(
                input=input,
                size=output_size,
                act='relu',
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Normal(
                        scale=1.0 / math.sqrt(input.shape[1]))))
            return output

        sparse_embed_seq = list(map(embedding_layer, self.sparse_inputs))
        concated = fluid.layers.concat(
            sparse_embed_seq + [self.dense_input], axis=1)

        fcs = [concated]
        hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
                                            self._namespace)

        for size in hidden_layers:
            fcs.append(fc(fcs[-1], size))
...@@ -109,29 +79,33 @@ class Model(ModelBase):
        self.predict = predict

    def avg_loss(self):
        cost = fluid.layers.cross_entropy(
            input=self.predict, label=self.label_input)
        avg_cost = fluid.layers.reduce_mean(cost)
        self._cost = avg_cost

    def metrics(self):
        auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
                                             label=self.label_input,
                                             num_thresholds=2**12,
                                             slide_steps=20)
        self._metrics["AUC"] = auc
        self._metrics["BATCH_AUC"] = batch_auc

    def train_net(self):
        self.model._init_slots()
        self.input()
        self.net()
        self.avg_loss()
        self.metrics()

    def optimizer(self):
        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
                                            None, self._namespace)
        optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
        return optimizer

    def infer_net(self):
        self.model._init_slots()
        self.input()
        self.net()
...@@ -59,6 +59,13 @@
## Usage Tutorial

### Data Processing
Refer to the data download & preprocessing script under each model's data directory:
```
sh run.sh
```
Data reading uses core/reader.py by default; see the sketch below.
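As a rough illustration only (not the documented API): the slot keys come from the config fragments in this change, and the lookup mirrors the `envs.get_global_env(key, default, namespace)` calls used by the readers in this repo.

```
# Sketch: fetch slot-based reader settings from the "train.reader" namespace,
# assuming a global config has already been loaded by the framework.
from paddlerec.core.utils import envs

train_data_path = envs.get_global_env("train_data_path", None, "train.reader")
sparse_slots = envs.get_global_env("sparse_slots", None, "train.reader")
dense_slots = envs.get_global_env("dense_slots", None, "train.reader")
print(train_data_path, sparse_slots, dense_slots)
```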
### Training
```
python -m paddlerec.run -m paddlerec.models.rank.dnn # take DNN as an example
...
...@@ -22,8 +22,9 @@ train:
  reader:
    batch_size: 2
    train_data_path: "{workspace}/data/slot_train_data"
    sparse_slots: "label"
    dense_slots: "wide_input:8 deep_input:58"
  model:
    models: "{workspace}/model.py"
...
mkdir train_data
mkdir test_data
train_path="adult.data"
test_path="adult.test"
train_data_path="./train_data/train_data.csv"
test_data_path="./test_data/test_data.csv"
pip install -r requirements.txt
wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
python data_preparation.py --train_path ${train_path} \
                           --test_path ${test_path} \
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import io
import args
import pandas as pd
from sklearn import preprocessing
def _clean_file(source_path, target_path):
"""makes changes to match the CSV format."""
with io.open(source_path, 'r') as temp_eval_file:
with io.open(target_path, 'w') as eval_file:
for line in temp_eval_file:
line = line.strip()
line = line.replace(', ', ',')
if not line or ',' not in line:
continue
if line[-1] == '.':
line = line[:-1]
line += '\n'
eval_file.write(line)
def build_model_columns(train_data_path, test_data_path):
# The column names are from
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
column_names = [
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_bracket'
]
# Load the dataset in Pandas
train_df = pd.read_csv(
train_data_path,
delimiter=',',
header=None,
index_col=None,
names=column_names)
test_df = pd.read_csv(
test_data_path,
delimiter=',',
header=None,
index_col=None,
names=column_names)
# First group of tasks according to the paper
#label_columns = ['income_50k', 'marital_stat']
categorical_columns = [
'education', 'marital_status', 'relationship', 'workclass',
'occupation'
]
for col in categorical_columns:
label_train = preprocessing.LabelEncoder()
train_df[col] = label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder()
test_df[col] = label_test.fit_transform(test_df[col])
bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(),
bins,
labels=False)
test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(),
bins,
labels=False)
base_columns = [
'education', 'marital_status', 'relationship', 'workclass',
'occupation', 'age_buckets'
]
train_df['education_occupation'] = train_df['education'].astype(
str) + '_' + train_df['occupation'].astype(str)
test_df['education_occupation'] = test_df['education'].astype(
str) + '_' + test_df['occupation'].astype(str)
train_df['age_buckets_education_occupation'] = train_df[
'age_buckets'].astype(str) + '_' + train_df['education'].astype(
str) + '_' + train_df['occupation'].astype(str)
test_df['age_buckets_education_occupation'] = test_df[
'age_buckets'].astype(str) + '_' + test_df['education'].astype(
str) + '_' + test_df['occupation'].astype(str)
crossed_columns = [
'education_occupation', 'age_buckets_education_occupation'
]
for col in crossed_columns:
label_train = preprocessing.LabelEncoder()
train_df[col] = label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder()
test_df[col] = label_test.fit_transform(test_df[col])
wide_columns = base_columns + crossed_columns
train_df_temp = pd.get_dummies(
train_df[categorical_columns], columns=categorical_columns)
test_df_temp = pd.get_dummies(
test_df[categorical_columns], columns=categorical_columns)
train_df = train_df.join(train_df_temp)
test_df = test_df.join(test_df_temp)
deep_columns = list(train_df_temp.columns) + [
'age', 'education_num', 'capital_gain', 'capital_loss',
'hours_per_week'
]
train_df['label'] = train_df['income_bracket'].apply(
lambda x: 1 if x == '>50K' else 0)
test_df['label'] = test_df['income_bracket'].apply(
lambda x: 1 if x == '>50K' else 0)
with io.open('train_data/columns.txt', 'w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(
deep_columns)) + '\n'
f.write(write_str)
f.close()
with io.open('test_data/columns.txt', 'w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(
deep_columns)) + '\n'
f.write(write_str)
f.close()
train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
train_data_path, index=False)
test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
test_data_path, index=False)
def clean_file(train_path, test_path, train_data_path, test_data_path):
_clean_file(train_path, train_data_path)
_clean_file(test_path, test_data_path)
if __name__ == '__main__':
args = args.parse_args()
clean_file(args.train_path, args.test_path, args.train_data_path,
args.test_data_path)
build_model_columns(args.train_data_path, args.test_data_path)
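A hedged usage sketch of the preparation steps above, driven directly from Python instead of the shell wrapper: it assumes the script is saved as data_preparation.py next to its args.py helper, the paths are illustrative, and the train_data/ and test_data/ directories already exist (see the mkdir calls in the accompanying shell script).

```
# Sketch only: call the two preparation steps directly.
# Assumptions: data_preparation.py (the script above) and args.py are
# importable, adult.data/adult.test have been downloaded, and the
# train_data/ and test_data/ output directories already exist.
from data_preparation import build_model_columns, clean_file

clean_file("adult.data", "adult.test",
           "./train_data/train_data.csv", "./test_data/test_data.csv")
build_model_columns("./train_data/train_data.csv", "./test_data/test_data.csv")
```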