未验证 提交 633ecc2c 编写于 作者: D Dong Daxiang 提交者: GitHub

Merge pull request #9 from seiriosPlus/travis

Travis
# pre-commit configuration: style and hygiene hooks run before each commit.
repos:
  - repo: https://github.com/Lucas-C/pre-commit-hooks.git
    sha: v1.0.1
    hooks:
      - id: remove-crlf
        # Run on every path that contains neither "third_party" nor "book".
        # The previous pattern had literal spaces around `|`, which made it
        # impossible to match any path, so this hook never ran.
        files: '^(?!.*(third_party|book)).*$'
  - repo: https://github.com/PaddlePaddle/mirrors-yapf.git
    sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
    hooks:
      - id: yapf
        # Python and Bazel sources only.
        files: '(.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$'
  - repo: https://github.com/pre-commit/pre-commit-hooks
    sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
    hooks:
      - id: check-added-large-files
      - id: check-merge-conflict
      - id: check-symlinks
      - id: detect-private-key
        # Same path filter as remove-crlf above (and the same fix).
        files: '^(?!.*(third_party|book)).*$'
      - id: end-of-file-fixer
  - repo: local
    hooks:
      - id: copyright_checker
        name: copyright_checker
        entry: python ./tools/codestyle/copyright.hook
        language: system
        files: '\.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$'
        # NOTE(review): the literal spaces around `|` mean this pattern cannot
        # match any path, so nothing is excluded today. It is left unchanged on
        # purpose: a "working" form of this negative lookahead would exclude
        # almost every file. Confirm the intent (probably
        # '(third_party|book)') before changing it.
        exclude: '(?!.*third_party)^.*$ | (?!.*book)^.*$'
# Travis CI configuration: runs the style-check job on a Linux worker with
# the Docker service enabled.
language: generic
sudo: required
dist: trusty

services:
  - docker

os:
  - linux

env:
  - JOB=check_style

before_install:
  # For pylint docstring checker
  - sudo pip install pylint pytest astroid isort pre-commit
  # Shell helper: run a command and let SIGALRM end it after N seconds.
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }

script:
  # Background travis_wait; presumably keeps the job from being stopped for
  # lack of log output while the build script runs -- TODO confirm.
  - "travis_wait 30 sleep 1800 &"
  - |
    # 43min timeout
    tools/build_script.sh "${JOB}"
    # Capture the status once: `$?` is overwritten by every command, so the
    # second `[ $? ... ]` in the old one-liner tested the result of the first
    # `[ ... ]` instead of the build script, and the 142 case never matched.
    rc=$?
    # 142 = 128 + 14 (SIGALRM); presumably the status seen when the
    # alarm-based timeout helper fires, which is tolerated as success.
    if [ "$rc" -eq 0 ] || [ "$rc" -eq 142 ]; then true; else exit 1; fi;

notifications:
  email:
    on_success: change
    on_failure: always
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -27,6 +27,7 @@ from paddlerec.core.utils import envs ...@@ -27,6 +27,7 @@ from paddlerec.core.utils import envs
class ClusterEngine(Engine): class ClusterEngine(Engine):
def __init_impl__(self): def __init_impl__(self):
abs_dir = os.path.dirname(os.path.abspath(__file__)) abs_dir = os.path.dirname(os.path.abspath(__file__))
backend = envs.get_runtime_environ("engine_backend") backend = envs.get_runtime_environ("engine_backend")
if backend == "PaddleCloud": if backend == "PaddleCloud":
self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh") self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
...@@ -57,4 +58,5 @@ class ClusterEngine(Engine): ...@@ -57,4 +58,5 @@ class ClusterEngine(Engine):
self.start_worker_procs() self.start_worker_procs()
else: else:
raise ValueError("role {} error, must in MASTER/WORKER".format(role)) raise ValueError("role {} error, must in MASTER/WORKER".format(
role))
...@@ -46,10 +46,13 @@ class LocalClusterEngine(Engine): ...@@ -46,10 +46,13 @@ class LocalClusterEngine(Engine):
ports.append(new_port) ports.append(new_port)
break break
user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports]) user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
user_endpoints_ips = [x.split(":")[0]
for x in user_endpoints.split(",")] user_endpoints_ips = [
user_endpoints_port = [x.split(":")[1] x.split(":")[0] for x in user_endpoints.split(",")
for x in user_endpoints.split(",")] ]
user_endpoints_port = [
x.split(":")[1] for x in user_endpoints.split(",")
]
factory = "paddlerec.core.factory" factory = "paddlerec.core.factory"
cmd = [sys.executable, "-u", "-m", factory, self.trainer] cmd = [sys.executable, "-u", "-m", factory, self.trainer]
...@@ -97,8 +100,10 @@ class LocalClusterEngine(Engine): ...@@ -97,8 +100,10 @@ class LocalClusterEngine(Engine):
if len(log_fns) > 0: if len(log_fns) > 0:
log_fns[i].close() log_fns[i].close()
procs[i].terminate() procs[i].terminate()
print("all workers already completed, you can view logs under the `{}` directory".format(logs_dir), print(
file=sys.stderr) "all workers already completed, you can view logs under the `{}` directory".
format(logs_dir),
file=sys.stderr)
def run(self): def run(self):
self.start_procs() self.start_procs()
...@@ -26,7 +26,6 @@ from paddlerec.core.engine.engine import Engine ...@@ -26,7 +26,6 @@ from paddlerec.core.engine.engine import Engine
class LocalMPIEngine(Engine): class LocalMPIEngine(Engine):
def start_procs(self): def start_procs(self):
logs_dir = self.envs["log_dir"] logs_dir = self.envs["log_dir"]
default_env = os.environ.copy() default_env = os.environ.copy()
current_env = copy.copy(default_env) current_env = copy.copy(default_env)
current_env.pop("http_proxy", None) current_env.pop("http_proxy", None)
...@@ -42,7 +41,8 @@ class LocalMPIEngine(Engine): ...@@ -42,7 +41,8 @@ class LocalMPIEngine(Engine):
os.system("mkdir -p {}".format(logs_dir)) os.system("mkdir -p {}".format(logs_dir))
fn = open("%s/job.log" % logs_dir, "w") fn = open("%s/job.log" % logs_dir, "w")
log_fns.append(fn) log_fns.append(fn)
proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd()) proc = subprocess.Popen(
cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd())
else: else:
proc = subprocess.Popen(cmd, env=current_env, cwd=os.getcwd()) proc = subprocess.Popen(cmd, env=current_env, cwd=os.getcwd())
procs.append(proc) procs.append(proc)
...@@ -51,7 +51,9 @@ class LocalMPIEngine(Engine): ...@@ -51,7 +51,9 @@ class LocalMPIEngine(Engine):
if len(log_fns) > 0: if len(log_fns) > 0:
log_fns[i].close() log_fns[i].close()
procs[i].wait() procs[i].wait()
print("all workers and parameter servers already completed", file=sys.stderr) print(
"all workers and parameter servers already completed",
file=sys.stderr)
def run(self): def run(self):
self.start_procs() self.start_procs()
...@@ -19,24 +19,23 @@ import yaml ...@@ -19,24 +19,23 @@ import yaml
from paddlerec.core.utils import envs from paddlerec.core.utils import envs
trainer_abs = os.path.join(os.path.dirname( trainer_abs = os.path.join(
os.path.abspath(__file__)), "trainers") os.path.dirname(os.path.abspath(__file__)), "trainers")
trainers = {} trainers = {}
def trainer_registry(): def trainer_registry():
trainers["SingleTrainer"] = os.path.join( trainers["SingleTrainer"] = os.path.join(trainer_abs, "single_trainer.py")
trainer_abs, "single_trainer.py") trainers["ClusterTrainer"] = os.path.join(trainer_abs,
trainers["ClusterTrainer"] = os.path.join( "cluster_trainer.py")
trainer_abs, "cluster_trainer.py") trainers["CtrCodingTrainer"] = os.path.join(trainer_abs,
trainers["CtrCodingTrainer"] = os.path.join( "ctr_coding_trainer.py")
trainer_abs, "ctr_coding_trainer.py") trainers["CtrModulTrainer"] = os.path.join(trainer_abs,
trainers["CtrModulTrainer"] = os.path.join( "ctr_modul_trainer.py")
trainer_abs, "ctr_modul_trainer.py") trainers["TDMSingleTrainer"] = os.path.join(trainer_abs,
trainers["TDMSingleTrainer"] = os.path.join( "tdm_single_trainer.py")
trainer_abs, "tdm_single_trainer.py") trainers["TDMClusterTrainer"] = os.path.join(trainer_abs,
trainers["TDMClusterTrainer"] = os.path.join( "tdm_cluster_trainer.py")
trainer_abs, "tdm_cluster_trainer.py")
trainer_registry() trainer_registry()
...@@ -55,8 +54,8 @@ class TrainerFactory(object): ...@@ -55,8 +54,8 @@ class TrainerFactory(object):
if trainer_abs is None: if trainer_abs is None:
if not os.path.isfile(train_mode): if not os.path.isfile(train_mode):
raise IOError( raise IOError("trainer {} can not be recognized".format(
"trainer {} can not be recognized".format(train_mode)) train_mode))
trainer_abs = train_mode trainer_abs = train_mode
train_mode = "UserDefineTrainer" train_mode = "UserDefineTrainer"
......
...@@ -22,7 +22,7 @@ from paddlerec.core.metric import Metric ...@@ -22,7 +22,7 @@ from paddlerec.core.metric import Metric
class AUCMetric(Metric): class AUCMetric(Metric):
""" """
Metric For Paddle Model Metric For Fluid Model
""" """
def __init__(self, config, fleet): def __init__(self, config, fleet):
...@@ -83,7 +83,8 @@ class AUCMetric(Metric): ...@@ -83,7 +83,8 @@ class AUCMetric(Metric):
if scope.find_var(metric_item['var'].name) is None: if scope.find_var(metric_item['var'].name) is None:
result[metric_name] = None result[metric_name] = None
continue continue
result[metric_name] = self.get_metric(scope, metric_item['var'].name) result[metric_name] = self.get_metric(scope,
metric_item['var'].name)
return result return result
def calculate_auc(self, global_pos, global_neg): def calculate_auc(self, global_pos, global_neg):
...@@ -178,14 +179,18 @@ class AUCMetric(Metric): ...@@ -178,14 +179,18 @@ class AUCMetric(Metric):
self._result['mean_q'] = 0 self._result['mean_q'] = 0
return self._result return self._result
if 'stat_pos' in result and 'stat_neg' in result: if 'stat_pos' in result and 'stat_neg' in result:
result['auc'] = self.calculate_auc(result['stat_pos'], result['stat_neg']) result['auc'] = self.calculate_auc(result['stat_pos'],
result['bucket_error'] = self.calculate_auc(result['stat_pos'], result['stat_neg']) result['stat_neg'])
result['bucket_error'] = self.calculate_auc(result['stat_pos'],
result['stat_neg'])
if 'pos_ins_num' in result: if 'pos_ins_num' in result:
result['actual_ctr'] = result['pos_ins_num'] / result['total_ins_num'] result['actual_ctr'] = result['pos_ins_num'] / result[
'total_ins_num']
if 'abserr' in result: if 'abserr' in result:
result['mae'] = result['abserr'] / result['total_ins_num'] result['mae'] = result['abserr'] / result['total_ins_num']
if 'sqrerr' in result: if 'sqrerr' in result:
result['rmse'] = math.sqrt(result['sqrerr'] / result['total_ins_num']) result['rmse'] = math.sqrt(result['sqrerr'] /
result['total_ins_num'])
if 'prob' in result: if 'prob' in result:
result['predict_ctr'] = result['prob'] / result['total_ins_num'] result['predict_ctr'] = result['prob'] / result['total_ins_num']
if abs(result['predict_ctr']) > 1e-6: if abs(result['predict_ctr']) > 1e-6:
......
...@@ -20,7 +20,7 @@ from paddlerec.core.utils import envs ...@@ -20,7 +20,7 @@ from paddlerec.core.utils import envs
class Model(object): class Model(object):
"""R """Base Model
""" """
__metaclass__ = abc.ABCMeta __metaclass__ = abc.ABCMeta
...@@ -39,32 +39,43 @@ class Model(object): ...@@ -39,32 +39,43 @@ class Model(object):
self._platform = envs.get_platform() self._platform = envs.get_platform()
def _init_slots(self): def _init_slots(self):
sparse_slots = envs.get_global_env("sparse_slots", None, "train.reader") sparse_slots = envs.get_global_env("sparse_slots", None,
"train.reader")
dense_slots = envs.get_global_env("dense_slots", None, "train.reader") dense_slots = envs.get_global_env("dense_slots", None, "train.reader")
if sparse_slots is not None or dense_slots is not None: if sparse_slots is not None or dense_slots is not None:
sparse_slots = sparse_slots.strip().split(" ") sparse_slots = sparse_slots.strip().split(" ")
dense_slots = dense_slots.strip().split(" ") dense_slots = dense_slots.strip().split(" ")
dense_slots_shape = [[int(j) for j in i.split(":")[1].strip("[]").split(",")] for i in dense_slots] dense_slots_shape = [[
int(j) for j in i.split(":")[1].strip("[]").split(",")
] for i in dense_slots]
dense_slots = [i.split(":")[0] for i in dense_slots] dense_slots = [i.split(":")[0] for i in dense_slots]
self._dense_data_var = [] self._dense_data_var = []
for i in range(len(dense_slots)): for i in range(len(dense_slots)):
l = fluid.layers.data(name=dense_slots[i], shape=dense_slots_shape[i], dtype="float32") l = fluid.layers.data(
name=dense_slots[i],
shape=dense_slots_shape[i],
dtype="float32")
self._data_var.append(l) self._data_var.append(l)
self._dense_data_var.append(l) self._dense_data_var.append(l)
self._sparse_data_var = [] self._sparse_data_var = []
for name in sparse_slots: for name in sparse_slots:
l = fluid.layers.data(name=name, shape=[1], lod_level=1, dtype="int64") l = fluid.layers.data(
name=name, shape=[1], lod_level=1, dtype="int64")
self._data_var.append(l) self._data_var.append(l)
self._sparse_data_var.append(l) self._sparse_data_var.append(l)
dataset_class = envs.get_global_env("dataset_class", None, "train.reader") dataset_class = envs.get_global_env("dataset_class", None,
"train.reader")
if dataset_class == "DataLoader": if dataset_class == "DataLoader":
self._init_dataloader() self._init_dataloader()
def _init_dataloader(self): def _init_dataloader(self):
self._data_loader = fluid.io.DataLoader.from_generator( self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def get_inputs(self): def get_inputs(self):
return self._data_var return self._data_var
...@@ -96,8 +107,8 @@ class Model(object): ...@@ -96,8 +107,8 @@ class Model(object):
"configured optimizer can only supported SGD/Adam/Adagrad") "configured optimizer can only supported SGD/Adam/Adagrad")
if name == "SGD": if name == "SGD":
reg = envs.get_global_env( reg = envs.get_global_env("hyper_parameters.reg", 0.0001,
"hyper_parameters.reg", 0.0001, self._namespace) self._namespace)
optimizer_i = fluid.optimizer.SGD( optimizer_i = fluid.optimizer.SGD(
lr, regularization=fluid.regularizer.L2DecayRegularizer(reg)) lr, regularization=fluid.regularizer.L2DecayRegularizer(reg))
elif name == "ADAM": elif name == "ADAM":
...@@ -111,10 +122,10 @@ class Model(object): ...@@ -111,10 +122,10 @@ class Model(object):
return optimizer_i return optimizer_i
def optimizer(self): def optimizer(self):
learning_rate = envs.get_global_env( learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
"hyper_parameters.learning_rate", None, self._namespace) None, self._namespace)
optimizer = envs.get_global_env( optimizer = envs.get_global_env("hyper_parameters.optimizer", None,
"hyper_parameters.optimizer", None, self._namespace) self._namespace)
print(">>>>>>>>>>>.learnig rate: %s" % learning_rate) print(">>>>>>>>>>>.learnig rate: %s" % learning_rate)
return self._build_optimizer(optimizer, learning_rate) return self._build_optimizer(optimizer, learning_rate)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -31,6 +31,7 @@ def create(config): ...@@ -31,6 +31,7 @@ def create(config):
Model Instance Model Instance
""" """
model = None model = None
if config['mode'] == 'fluid': if config['mode'] == 'fluid':
model = YamlModel(config) model = YamlModel(config)
model.train_net() model.train_net()
...@@ -50,7 +51,12 @@ class YamlModel(Model): ...@@ -50,7 +51,12 @@ class YamlModel(Model):
f = open(config['layer_file'], 'r') f = open(config['layer_file'], 'r')
self._build_nodes = yaml.safe_load(f.read()) self._build_nodes = yaml.safe_load(f.read())
self._build_phase = ['input', 'param', 'summary', 'layer'] self._build_phase = ['input', 'param', 'summary', 'layer']
self._build_param = {'layer': {}, 'inner_layer': {}, 'layer_extend': {}, 'model': {}} self._build_param = {
'layer': {},
'inner_layer': {},
'layer_extend': {},
'model': {}
}
self._inference_meta = {'dependency': {}, 'params': {}} self._inference_meta = {'dependency': {}, 'params': {}}
def train_net(self): def train_net(self):
...@@ -76,10 +82,12 @@ class YamlModel(Model): ...@@ -76,10 +82,12 @@ class YamlModel(Model):
if self._build_nodes[phase] is None: if self._build_nodes[phase] is None:
continue continue
for node in self._build_nodes[phase]: for node in self._build_nodes[phase]:
exec("""layer=layer.{}(node)""".format(node['class'])) exec ("""layer=layer.{}(node)""".format(node['class']))
layer_output, extend_output = layer.generate(self._config['mode'], self._build_param) layer_output, extend_output = layer.generate(
self._config['mode'], self._build_param)
self._build_param['layer'][node['name']] = layer_output self._build_param['layer'][node['name']] = layer_output
self._build_param['layer_extend'][node['name']] = extend_output self._build_param['layer_extend'][node[
'name']] = extend_output
if extend_output is None: if extend_output is None:
continue continue
if 'loss' in extend_output: if 'loss' in extend_output:
...@@ -89,17 +97,24 @@ class YamlModel(Model): ...@@ -89,17 +97,24 @@ class YamlModel(Model):
self._cost += extend_output['loss'] self._cost += extend_output['loss']
if 'data_var' in extend_output: if 'data_var' in extend_output:
self._data_var += extend_output['data_var'] self._data_var += extend_output['data_var']
if 'metric_label' in extend_output and extend_output['metric_label'] is not None: if 'metric_label' in extend_output and extend_output[
self._metrics[extend_output['metric_label']] = extend_output['metric_dict'] 'metric_label'] is not None:
self._metrics[extend_output[
'metric_label']] = extend_output['metric_dict']
if 'inference_param' in extend_output: if 'inference_param' in extend_output:
inference_param = extend_output['inference_param'] inference_param = extend_output['inference_param']
param_name = inference_param['name'] param_name = inference_param['name']
if param_name not in self._build_param['table']: if param_name not in self._build_param['table']:
self._build_param['table'][param_name] = {'params': []} self._build_param['table'][param_name] = {
table_meta = table.TableMeta.alloc_new_table(inference_param['table_id']) 'params': []
self._build_param['table'][param_name]['_meta'] = table_meta }
self._build_param['table'][param_name]['params'] += inference_param['params'] table_meta = table.TableMeta.alloc_new_table(
inference_param['table_id'])
self._build_param['table'][param_name][
'_meta'] = table_meta
self._build_param['table'][param_name][
'params'] += inference_param['params']
pass pass
@classmethod @classmethod
...@@ -114,20 +129,25 @@ class YamlModel(Model): ...@@ -114,20 +129,25 @@ class YamlModel(Model):
metrics = params['metrics'] metrics = params['metrics']
for name in metrics: for name in metrics:
model_metrics = metrics[name] model_metrics = metrics[name]
stat_var_names += [model_metrics[metric]['var'].name for metric in model_metrics] stat_var_names += [
model_metrics[metric]['var'].name
for metric in model_metrics
]
strategy['stat_var_names'] = list(set(stat_var_names)) strategy['stat_var_names'] = list(set(stat_var_names))
optimizer_generator = 'optimizer = fluid.optimizer.' + optimizer_conf['class'] + \ optimizer_generator = 'optimizer = fluid.optimizer.' + optimizer_conf['class'] + \
'(learning_rate=' + str(optimizer_conf['learning_rate']) + ')' '(learning_rate=' + str(optimizer_conf['learning_rate']) + ')'
exec(optimizer_generator) exec (optimizer_generator)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
return optimizer return optimizer
def dump_model_program(self, path): def dump_model_program(self, path):
"""R """R
""" """
with open(path + '/' + self._name + '_main_program.pbtxt', "w") as fout: with open(path + '/' + self._name + '_main_program.pbtxt',
"w") as fout:
print >> fout, self._build_param['model']['train_program'] print >> fout, self._build_param['model']['train_program']
with open(path + '/' + self._name + '_startup_program.pbtxt', "w") as fout: with open(path + '/' + self._name + '_startup_program.pbtxt',
"w") as fout:
print >> fout, self._build_param['model']['startup_program'] print >> fout, self._build_param['model']['startup_program']
pass pass
...@@ -137,7 +157,8 @@ class YamlModel(Model): ...@@ -137,7 +157,8 @@ class YamlModel(Model):
scope = params['scope'] scope = params['scope']
decay = params['decay'] decay = params['decay']
for param_table in self._build_param['table']: for param_table in self._build_param['table']:
table_id = self._build_param['table'][param_table]['_meta']._table_id table_id = self._build_param['table'][param_table][
'_meta']._table_id
fleet.shrink_dense_table(decay, scope=scope, table_id=table_id) fleet.shrink_dense_table(decay, scope=scope, table_id=table_id)
def dump_inference_program(self, inference_layer, path): def dump_inference_program(self, inference_layer, path):
...@@ -152,17 +173,25 @@ class YamlModel(Model): ...@@ -152,17 +173,25 @@ class YamlModel(Model):
executor = params['executor'] executor = params['executor']
program = self._build_param['model']['train_program'] program = self._build_param['model']['train_program']
for table_name, table in self._build_param['table'].items(): for table_name, table in self._build_param['table'].items():
fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id, table['params']) fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id,
table['params'])
for infernce_item in params['inference_list']: for infernce_item in params['inference_list']:
params_name_list = self.inference_params(infernce_item['layer_name']) params_name_list = self.inference_params(infernce_item[
params_var_list = [program.global_block().var(i) for i in params_name_list] 'layer_name'])
params_var_list = [
program.global_block().var(i) for i in params_name_list
]
params_file_name = infernce_item['save_file_name'] params_file_name = infernce_item['save_file_name']
with fluid.scope_guard(scope): with fluid.scope_guard(scope):
if params['save_combine']: if params['save_combine']:
fluid.io.save_vars(executor, "./", \ fluid.io.save_vars(executor, "./", \
program, vars=params_var_list, filename=params_file_name) program, vars=params_var_list, filename=params_file_name)
else: else:
fluid.io.save_vars(executor, params_file_name, program, vars=params_var_list) fluid.io.save_vars(
executor,
params_file_name,
program,
vars=params_var_list)
def inference_params(self, inference_layer): def inference_params(self, inference_layer):
""" """
...@@ -177,11 +206,13 @@ class YamlModel(Model): ...@@ -177,11 +206,13 @@ class YamlModel(Model):
return self._inference_meta['params'][layer] return self._inference_meta['params'][layer]
self._inference_meta['params'][layer] = [] self._inference_meta['params'][layer] = []
self._inference_meta['dependency'][layer] = self.get_dependency(self._build_param['inner_layer'], layer) self._inference_meta['dependency'][layer] = self.get_dependency(
self._build_param['inner_layer'], layer)
for node in self._build_nodes['layer']: for node in self._build_nodes['layer']:
if node['name'] not in self._inference_meta['dependency'][layer]: if node['name'] not in self._inference_meta['dependency'][layer]:
continue continue
if 'inference_param' in self._build_param['layer_extend'][node['name']]: if 'inference_param' in self._build_param['layer_extend'][node[
'name']]:
self._inference_meta['params'][layer] += \ self._inference_meta['params'][layer] += \
self._build_param['layer_extend'][node['name']]['inference_param']['params'] self._build_param['layer_extend'][node['name']]['inference_param']['params']
return self._inference_meta['params'][layer] return self._inference_meta['params'][layer]
...@@ -199,5 +230,6 @@ class YamlModel(Model): ...@@ -199,5 +230,6 @@ class YamlModel(Model):
dependencys = copy.deepcopy(layer_graph[dest_layer]['input']) dependencys = copy.deepcopy(layer_graph[dest_layer]['input'])
dependency_list = copy.deepcopy(dependencys) dependency_list = copy.deepcopy(dependencys)
for dependency in dependencys: for dependency in dependencys:
dependency_list = dependency_list + self.get_dependency(layer_graph, dependency) dependency_list = dependency_list + self.get_dependency(
layer_graph, dependency)
return list(set(dependency_list)) return list(set(dependency_list))
...@@ -18,7 +18,7 @@ from paddlerec.core.layer import Layer ...@@ -18,7 +18,7 @@ from paddlerec.core.layer import Layer
class EmbeddingFuseLayer(Layer): class EmbeddingFuseLayer(Layer):
"""R """embedding + sequence + concat
""" """
def __init__(self, config): def __init__(self, config):
...@@ -40,7 +40,8 @@ class EmbeddingFuseLayer(Layer): ...@@ -40,7 +40,8 @@ class EmbeddingFuseLayer(Layer):
show_clk.stop_gradient = True show_clk.stop_gradient = True
data_var = [] data_var = []
for slot in self._slots: for slot in self._slots:
l = fluid.layers.data(name=slot, shape=[1], dtype="int64", lod_level=1) l = fluid.layers.data(
name=slot, shape=[1], dtype="int64", lod_level=1)
data_var.append(l) data_var.append(l)
emb = fluid.layers.embedding(input=l, size=[10, self._emb_dim], \ emb = fluid.layers.embedding(input=l, size=[10, self._emb_dim], \
is_sparse=True, is_distributed=True, is_sparse=True, is_distributed=True,
...@@ -48,7 +49,8 @@ class EmbeddingFuseLayer(Layer): ...@@ -48,7 +49,8 @@ class EmbeddingFuseLayer(Layer):
emb = fluid.layers.sequence_pool(input=emb, pool_type='sum') emb = fluid.layers.sequence_pool(input=emb, pool_type='sum')
emb = fluid.layers.continuous_value_model(emb, show_clk, self._cvm) emb = fluid.layers.continuous_value_model(emb, show_clk, self._cvm)
self._emb_layers.append(emb) self._emb_layers.append(emb)
output = fluid.layers.concat(input=self._emb_layers, axis=1, name=self._name) output = fluid.layers.concat(
input=self._emb_layers, axis=1, name=self._name)
return output, {'data_var': data_var} return output, {'data_var': data_var}
...@@ -111,7 +113,13 @@ class ParamLayer(Layer): ...@@ -111,7 +113,13 @@ class ParamLayer(Layer):
def generate(self, param): def generate(self, param):
"""R """R
""" """
return self._config, {'inference_param': {'name': 'param', 'params': [], 'table_id': self._table_id}} return self._config, {
'inference_param': {
'name': 'param',
'params': [],
'table_id': self._table_id
}
}
class SummaryLayer(Layer): class SummaryLayer(Layer):
...@@ -129,7 +137,13 @@ class SummaryLayer(Layer): ...@@ -129,7 +137,13 @@ class SummaryLayer(Layer):
def generate(self, param): def generate(self, param):
"""R """R
""" """
return self._config, {'inference_param': {'name': 'summary', 'params': [], 'table_id': self._table_id}} return self._config, {
'inference_param': {
'name': 'summary',
'params': [],
'table_id': self._table_id
}
}
class NormalizationLayer(Layer): class NormalizationLayer(Layer):
...@@ -152,9 +166,19 @@ class NormalizationLayer(Layer): ...@@ -152,9 +166,19 @@ class NormalizationLayer(Layer):
if len(self._input) > 0: if len(self._input) > 0:
input_list = [param['layer'][i] for i in self._input] input_list = [param['layer'][i] for i in self._input]
input_layer = fluid.layers.concat(input=input_list, axis=1) input_layer = fluid.layers.concat(input=input_list, axis=1)
bn = fluid.layers.data_norm(input=input_layer, name=self._name, epsilon=1e-4, param_attr={ bn = fluid.layers.data_norm(
"batch_size": 1e4, "batch_sum_default": 0.0, "batch_square": 1e4}) input=input_layer,
inference_param = [self._name + '.batch_size', self._name + '.batch_sum', self._name + '.batch_square_sum'] name=self._name,
epsilon=1e-4,
param_attr={
"batch_size": 1e4,
"batch_sum_default": 0.0,
"batch_square": 1e4
})
inference_param = [
self._name + '.batch_size', self._name + '.batch_sum',
self._name + '.batch_square_sum'
]
return bn, {'inference_param': {'name': 'summary', \ return bn, {'inference_param': {'name': 'summary', \
'params': inference_param, 'table_id': summary_layer.get('table_id', -1)}} 'params': inference_param, 'table_id': summary_layer.get('table_id', -1)}}
...@@ -181,11 +205,13 @@ class FCLayer(Layer): ...@@ -181,11 +205,13 @@ class FCLayer(Layer):
input_list = [param['layer'][i] for i in self._input] input_list = [param['layer'][i] for i in self._input]
input_layer = fluid.layers.concat(input=input_list, axis=1) input_layer = fluid.layers.concat(input=input_list, axis=1)
input_coln = input_layer.shape[1] input_coln = input_layer.shape[1]
scale = param_layer['init_range'] / (input_coln ** 0.5) scale = param_layer['init_range'] / (input_coln**0.5)
bias = None bias = None
if self._bias: if self._bias:
bias = fluid.ParamAttr(learning_rate=1.0, bias = fluid.ParamAttr(
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=scale)) learning_rate=1.0,
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale))
fc = fluid.layers.fc( fc = fluid.layers.fc(
name=self._name, name=self._name,
input=input_layer, input=input_layer,
...@@ -216,18 +242,46 @@ class LogLossLayer(Layer): ...@@ -216,18 +242,46 @@ class LogLossLayer(Layer):
self._extend_output = { self._extend_output = {
'metric_label': self._metric_label, 'metric_label': self._metric_label,
'metric_dict': { 'metric_dict': {
'auc': {'var': None}, 'auc': {
'batch_auc': {'var': None}, 'var': None
'stat_pos': {'var': None, 'data_type': 'int64'}, },
'stat_neg': {'var': None, 'data_type': 'int64'}, 'batch_auc': {
'batch_stat_pos': {'var': None, 'data_type': 'int64'}, 'var': None
'batch_stat_neg': {'var': None, 'data_type': 'int64'}, },
'pos_ins_num': {'var': None}, 'stat_pos': {
'abserr': {'var': None}, 'var': None,
'sqrerr': {'var': None}, 'data_type': 'int64'
'prob': {'var': None}, },
'total_ins_num': {'var': None}, 'stat_neg': {
'q': {'var': None} 'var': None,
'data_type': 'int64'
},
'batch_stat_pos': {
'var': None,
'data_type': 'int64'
},
'batch_stat_neg': {
'var': None,
'data_type': 'int64'
},
'pos_ins_num': {
'var': None
},
'abserr': {
'var': None
},
'sqrerr': {
'var': None
},
'prob': {
'var': None
},
'total_ins_num': {
'var': None
},
'q': {
'var': None
}
} }
} }
...@@ -236,9 +290,12 @@ class LogLossLayer(Layer): ...@@ -236,9 +290,12 @@ class LogLossLayer(Layer):
""" """
input_layer = param['layer'][self._input[0]] input_layer = param['layer'][self._input[0]]
label_layer = param['layer'][self._label] label_layer = param['layer'][self._label]
output = fluid.layers.clip(input_layer, self._bound[0], self._bound[1], name=self._name) output = fluid.layers.clip(
input_layer, self._bound[0], self._bound[1], name=self._name)
norm = fluid.layers.sigmoid(output, name=self._name) norm = fluid.layers.sigmoid(output, name=self._name)
output = fluid.layers.log_loss(norm, fluid.layers.cast(x=label_layer, dtype='float32')) output = fluid.layers.log_loss(
norm, fluid.layers.cast(
x=label_layer, dtype='float32'))
if self._weight: if self._weight:
weight_layer = param['layer'][self._weight] weight_layer = param['layer'][self._weight]
output = fluid.layers.elementwise_mul(output, weight_layer) output = fluid.layers.elementwise_mul(output, weight_layer)
...@@ -248,7 +305,11 @@ class LogLossLayer(Layer): ...@@ -248,7 +305,11 @@ class LogLossLayer(Layer):
# For AUC Metric # For AUC Metric
metric = self._extend_output['metric_dict'] metric = self._extend_output['metric_dict']
binary_predict = fluid.layers.concat( binary_predict = fluid.layers.concat(
input=[fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm), norm], axis=1) input=[
fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm),
norm
],
axis=1)
metric['auc']['var'], metric['batch_auc']['var'], [metric['batch_stat_pos']['var'], \ metric['auc']['var'], metric['batch_auc']['var'], [metric['batch_stat_pos']['var'], \
metric['batch_stat_neg']['var'], metric['stat_pos']['var'], metric['batch_stat_neg']['var'], metric['stat_pos']['var'],
metric['stat_neg']['var']] = \ metric['stat_neg']['var']] = \
......
...@@ -11,9 +11,9 @@ ...@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import sys
import abc import abc
import os import os
...@@ -64,7 +64,11 @@ class SlotReader(dg.MultiSlotDataGenerator): ...@@ -64,7 +64,11 @@ class SlotReader(dg.MultiSlotDataGenerator):
from operator import mul from operator import mul
self.sparse_slots = sparse_slots.strip().split(" ") self.sparse_slots = sparse_slots.strip().split(" ")
self.dense_slots = dense_slots.strip().split(" ") self.dense_slots = dense_slots.strip().split(" ")
self.dense_slots_shape = [reduce(mul, [int(j) for j in i.split(":")[1].strip("[]").split(",")]) for i in self.dense_slots] self.dense_slots_shape = [
reduce(mul,
[int(j) for j in i.split(":")[1].strip("[]").split(",")])
for i in self.dense_slots
]
self.dense_slots = [i.split(":")[0] for i in self.dense_slots] self.dense_slots = [i.split(":")[0] for i in self.dense_slots]
self.slots = self.dense_slots + self.sparse_slots self.slots = self.dense_slots + self.sparse_slots
self.slot2index = {} self.slot2index = {}
...@@ -93,10 +97,13 @@ class SlotReader(dg.MultiSlotDataGenerator): ...@@ -93,10 +97,13 @@ class SlotReader(dg.MultiSlotDataGenerator):
slot = i slot = i
if not self.visit[slot]: if not self.visit[slot]:
if i in self.dense_slots: if i in self.dense_slots:
output[self.slot2index[i]][1].extend([self.padding] * self.dense_slots_shape[self.slot2index[i]]) output[self.slot2index[i]][1].extend(
[self.padding] *
self.dense_slots_shape[self.slot2index[i]])
else: else:
output[self.slot2index[i]][1].extend([self.padding]) output[self.slot2index[i]][1].extend([self.padding])
else: else:
self.visit[slot] = False self.visit[slot] = False
yield output yield output
return reader return reader
...@@ -30,8 +30,10 @@ class Trainer(object): ...@@ -30,8 +30,10 @@ class Trainer(object):
def __init__(self, config=None): def __init__(self, config=None):
self._status_processor = {} self._status_processor = {}
self._place = fluid.CPUPlace() self._place = fluid.CPUPlace()
self._exe = fluid.Executor(self._place) self._exe = fluid.Executor(self._place)
self._exector_context = {} self._exector_context = {}
self._context = {'status': 'uninit', 'is_exit': False} self._context = {'status': 'uninit', 'is_exit': False}
self._config_yaml = config self._config_yaml = config
...@@ -95,6 +97,6 @@ def user_define_engine(engine_yaml): ...@@ -95,6 +97,6 @@ def user_define_engine(engine_yaml):
train_dirname = os.path.dirname(train_location) train_dirname = os.path.dirname(train_location)
base_name = os.path.splitext(os.path.basename(train_location))[0] base_name = os.path.splitext(os.path.basename(train_location))[0]
sys.path.append(train_dirname) sys.path.append(train_dirname)
trainer_class = envs.lazy_instance_by_fliename( trainer_class = envs.lazy_instance_by_fliename(base_name,
base_name, "UserDefineTraining") "UserDefineTraining")
return trainer_class return trainer_class
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
trainer implement. trainer implement.
...@@ -22,5 +21,3 @@ Trainer ...@@ -22,5 +21,3 @@ Trainer
↘ (for online learning training) OnlineLearningTrainer ↘ (for online learning training) OnlineLearningTrainer
""" """
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Training use fluid with one node only. Training use fluid with one node only.
""" """
...@@ -43,11 +42,14 @@ class ClusterTrainer(TranspileTrainer): ...@@ -43,11 +42,14 @@ class ClusterTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance) self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init) self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup) self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
if envs.get_platform() == "LINUX" and envs.get_global_env(
"dataset_class", None, "train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train) self.regist_context_processor('train_pass', self.dataset_train)
else: else:
self.regist_context_processor( self.regist_context_processor('train_pass',
'train_pass', self.dataloader_train) self.dataloader_train)
self.regist_context_processor('infer_pass', self.infer) self.regist_context_processor('infer_pass', self.infer)
self.regist_context_processor('terminal_pass', self.terminal) self.regist_context_processor('terminal_pass', self.terminal)
...@@ -75,8 +77,8 @@ class ClusterTrainer(TranspileTrainer): ...@@ -75,8 +77,8 @@ class ClusterTrainer(TranspileTrainer):
def init(self, context): def init(self, context):
self.model.train_net() self.model.train_net()
optimizer = self.model.optimizer() optimizer = self.model.optimizer()
optimizer_name = envs.get_global_env( optimizer_name = envs.get_global_env("hyper_parameters.optimizer",
"hyper_parameters.optimizer", None, "train.model") None, "train.model")
if optimizer_name not in ["", "sgd", "SGD", "Sgd"]: if optimizer_name not in ["", "sgd", "SGD", "Sgd"]:
os.environ["FLAGS_communicator_is_sgd_optimizer"] = '0' os.environ["FLAGS_communicator_is_sgd_optimizer"] = '0'
...@@ -114,9 +116,9 @@ class ClusterTrainer(TranspileTrainer): ...@@ -114,9 +116,9 @@ class ClusterTrainer(TranspileTrainer):
program = fluid.compiler.CompiledProgram( program = fluid.compiler.CompiledProgram(
fleet.main_program).with_data_parallel( fleet.main_program).with_data_parallel(
loss_name=self.model.get_avg_cost().name, loss_name=self.model.get_avg_cost().name,
build_strategy=self.strategy.get_build_strategy(), build_strategy=self.strategy.get_build_strategy(),
exec_strategy=self.strategy.get_execute_strategy()) exec_strategy=self.strategy.get_execute_strategy())
metrics_varnames = [] metrics_varnames = []
metrics_format = [] metrics_format = []
...@@ -135,9 +137,8 @@ class ClusterTrainer(TranspileTrainer): ...@@ -135,9 +137,8 @@ class ClusterTrainer(TranspileTrainer):
batch_id = 0 batch_id = 0
try: try:
while True: while True:
metrics_rets = self._exe.run( metrics_rets = self._exe.run(program=program,
program=program, fetch_list=metrics_varnames)
fetch_list=metrics_varnames)
metrics = [epoch, batch_id] metrics = [epoch, batch_id]
metrics.extend(metrics_rets) metrics.extend(metrics_rets)
...@@ -162,14 +163,16 @@ class ClusterTrainer(TranspileTrainer): ...@@ -162,14 +163,16 @@ class ClusterTrainer(TranspileTrainer):
for i in range(epochs): for i in range(epochs):
begin_time = time.time() begin_time = time.time()
self._exe.train_from_dataset(program=fluid.default_main_program(), self._exe.train_from_dataset(
dataset=dataset, program=fluid.default_main_program(),
fetch_list=self.fetch_vars, dataset=dataset,
fetch_info=self.fetch_alias, fetch_list=self.fetch_vars,
print_period=self.fetch_period) fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time() end_time = time.time()
times = end_time-begin_time times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times)) print("epoch {} using time {}, speed {:.2f} lines/s".format(
i, times, ins / times))
self.save(i, "train", is_fleet=True) self.save(i, "train", is_fleet=True)
fleet.stop_worker() fleet.stop_worker()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
from paddlerec.core.utils import envs
from paddlerec.core.trainer import Trainer
class CtrTrainer(Trainer):
    """Trainer that runs a model through pslib fleet as a server or a worker.

    The fleet role comes from ``MPISymetricRoleMaker``. The processors
    registered per status drive the flow
    ``uninit -> init_pass -> server_pass`` on servers and
    ``uninit -> init_pass -> train_pass -> terminal_pass`` on workers.
    """

    def __init__(self, config):
        """Set up the base trainer and register the status processors.

        Args:
            config: trainer configuration; forwarded to ``Trainer.__init__``
                and kept on ``self.global_config``.
        """
        Trainer.__init__(self, config)

        self.global_config = config
        self._metrics = {}
        self.processor_register()

    def processor_register(self):
        """Initialise fleet and register the processors for this node's role."""
        role = MPISymetricRoleMaker()
        fleet.init(role)

        if fleet.is_server():
            self.regist_context_processor('uninit', self.instance)
            self.regist_context_processor('init_pass', self.init)
            self.regist_context_processor('server_pass', self.server)
        else:
            self.regist_context_processor('uninit', self.instance)
            self.regist_context_processor('init_pass', self.init)
            self.regist_context_processor('train_pass', self.train)
            self.regist_context_processor('terminal_pass', self.terminal)

    def _get_dataset(self):
        """Build the fluid dataset that feeds ``train_from_dataset``.

        Reads ``train.threads`` plus ``batch_size``, ``class`` and
        ``train_data_path`` from the ``train.reader`` namespace. Every entry
        of ``train_data_path`` is added to the file list.

        Returns:
            The configured dataset created by ``fluid.DatasetFactory``.
        """
        namespace = "train.reader"

        inputs = self.model.get_inputs()
        threads = envs.get_global_env("train.threads", None)
        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_class = envs.get_global_env("class", None, namespace)

        # The pipe command runs utils/dataset_instance.py (relative to this
        # file) with the reader class, the literal "TRAIN" and the yaml path.
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
        pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN",
                                               self._config_yaml)
        train_data_path = envs.get_global_env("train_data_path", None,
                                              namespace)

        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(inputs)
        dataset.set_pipe_command(pipe_cmd)
        dataset.set_batch_size(batch_size)
        dataset.set_thread(threads)
        file_list = [
            os.path.join(train_data_path, x)
            for x in os.listdir(train_data_path)
        ]
        dataset.set_filelist(file_list)
        return dataset

    def instance(self, context):
        """Instantiate the configured ``Model`` class and move to 'init_pass'."""
        models = envs.get_global_env("train.model.models")
        model_class = envs.lazy_instance_by_fliename(models, "Model")
        self.model = model_class(None)
        context['status'] = 'init_pass'

    def init(self, context):
        """Build the network, wrap the optimizer with fleet and pick the next status.

        Servers go to 'server_pass'. Workers collect the metric variables to
        fetch during training and go to 'train_pass'.
        """
        self.model.train_net()
        optimizer = self.model.optimizer()

        optimizer = fleet.distributed_optimizer(
            optimizer, strategy={"use_cvm": False})
        optimizer.minimize(self.model.get_avg_cost())

        if fleet.is_server():
            context['status'] = 'server_pass'
        else:
            self.fetch_vars = []
            self.fetch_alias = []
            self.fetch_period = self.model.get_fetch_period()

            metrics = self.model.get_metrics()
            if metrics:
                # Materialise real lists: on Python 3 dict.values()/keys()
                # are views, which cannot be indexed like the lists that
                # fetch_list / fetch_info are expected to be.
                self.fetch_vars = list(metrics.values())
                self.fetch_alias = list(metrics.keys())
            context['status'] = 'train_pass'

    def server(self, context):
        """Run the parameter server, then flag the trainer loop to exit."""
        fleet.run_server()
        # NOTE(review): stop_worker() is called on the server role here;
        # confirm this is intentional.
        fleet.stop_worker()
        context['is_exit'] = True

    def train(self, context):
        """Run the startup program, then train ``train.epochs`` epochs."""
        self._exe.run(fluid.default_startup_program())
        fleet.init_worker()

        dataset = self._get_dataset()

        # All-reduce the worker indices into gs; the result is only printed.
        # NOTE(review): presumably doubles as a start-up synchronisation
        # point across workers - confirm.
        shuf = np.array([fleet.worker_index()])
        gs = shuf * 0
        fleet._role_maker._node_type_comm.Allreduce(shuf, gs)

        print("trainer id: {}, trainers: {}, gs: {}".format(
            fleet.worker_index(), fleet.worker_num(), gs))

        epochs = envs.get_global_env("train.epochs")

        for i in range(epochs):
            self._exe.train_from_dataset(
                program=fluid.default_main_program(),
                dataset=dataset,
                fetch_list=self.fetch_vars,
                fetch_info=self.fetch_alias,
                print_period=self.fetch_period)

        context['status'] = 'terminal_pass'
        fleet.stop_worker()

    def terminal(self, context):
        """Print the end-of-run message and flag the trainer loop to exit."""
        print("terminal ended.")
        context['is_exit'] = True
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import json
import sys
import time
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
from paddlerec.core.utils import fs as fs
from paddlerec.core.utils import util as util
from paddlerec.core.metrics.auc_metrics import AUCMetric
from paddlerec.core.modules.modul import build as model_basic
from paddlerec.core.utils import dataset
from paddlerec.core.trainer import Trainer
def wroker_numric_opt(value, env, opt):
    """
    Reduce one scalar across all workers.

    Args:
        value: the local value contributed by this worker
        env: mpi/gloo (accepted for the callers' sake; not read here)
        opt: reduction operator name forwarded to ``all_reduce_worker``;
            callers in this file pass "sum", "min" and "max"
    Return:
        first element of the reduced array
    """
    send_buf = np.array([value])
    # Same shape and dtype as the input, multiplied by zero to serve as the
    # receive buffer.
    recv_buf = send_buf * 0
    fleet._role_maker.all_reduce_worker(send_buf, recv_buf, opt)
    return recv_buf[0]
def worker_numric_sum(value, env="mpi"):
    """Return ``value`` reduced over all workers with the "sum" operator."""
    total = wroker_numric_opt(value, env, opt="sum")
    return total
def worker_numric_avg(value, env="mpi"):
    """Return the cross-worker sum of ``value`` divided by the worker count."""
    total = worker_numric_sum(value, env)
    worker_count = fleet.worker_num()
    return total / worker_count
def worker_numric_min(value, env="mpi"):
    """Return ``value`` reduced over all workers with the "min" operator."""
    smallest = wroker_numric_opt(value, env, opt="min")
    return smallest
def worker_numric_max(value, env="mpi"):
    """Return ``value`` reduced over all workers with the "max" operator."""
    largest = wroker_numric_opt(value, env, opt="max")
    return largest
class CtrTrainer(Trainer):
"""R
"""
def __init__(self, config):
    """Resolve the output path, build the path generator, register processors.

    Args:
        config: trainer configuration dict. ``config['output_path']`` is
            overwritten in place with the result of
            ``util.get_absolute_path``; an optional ``'path_generator'``
            entry is added to the default path templates.
    """
    Trainer.__init__(self, config)
    config['output_path'] = util.get_absolute_path(config['output_path'],
                                                   config['io']['afs'])
    self.global_config = config
    self._metrics = {}

    # (template name, suffix appended to the resolved output path)
    output_root = config['output_path']
    default_templates = (
        ('xbox_base_done', '/xbox_base_done.txt'),
        ('xbox_delta_done', '/xbox_patch_done.txt'),
        ('xbox_base', '/xbox/{day}/base/'),
        ('xbox_delta', '/xbox/{day}/delta-{pass_id}/'),
        ('batch_model', '/batch_model/{day}/{pass_id}/'),
    )
    self._path_generator = util.PathGenerator({
        'templates': [{
            'name': template_name,
            'template': output_root + suffix
        } for template_name, suffix in default_templates]
    })
    if 'path_generator' in config:
        self._path_generator.add_path_template(config['path_generator'])

    # status name -> bound method that handles it
    for status, handler in (('uninit', self.init),
                            ('startup', self.startup),
                            ('begin_day', self.begin_day),
                            ('train_pass', self.train_pass),
                            ('end_day', self.end_day)):
        self.regist_context_processor(status, handler)
def init(self, context):
    """Initialise fleet, build every executor's model and create the datasets.

    Sets ``context['status']`` to 'startup'. On a server node the method
    returns 0 right after the startup programs have run; on a worker it goes
    on to create one ``FluidTimeSplitDataset`` per configured data-list entry
    and calls ``fleet.init_worker()``.
    """
    role_maker = None
    if self.global_config.get('process_mode', 'mpi') == 'brilliant_cpu':
        afs_config = self.global_config['io']['afs']
        role_maker = GeneralRoleMaker(
            hdfs_name=afs_config['fs_name'],
            hdfs_ugi=afs_config['fs_ugi'],
            path=self.global_config['output_path'] + "/gloo",
            init_timeout_seconds=1200,
            run_timeout_seconds=1200)
    fleet.init(role_maker)

    input_vars = []
    seen_var_names = set()
    scopes = []
    cost_ops = []
    context['status'] = 'startup'

    # Pass 1: one scope + model per executor; gather metrics, cost ops and
    # the de-duplicated (by name) input variables of all models.
    for executor in self.global_config['executor']:
        scope = fluid.Scope()
        entry = {}
        self._exector_context[executor['name']] = entry
        entry['scope'] = scope
        entry['model'] = model_basic.create(executor)
        model = entry['model']
        self._metrics.update(model.get_metrics())
        scopes.append(scope)
        cost_ops.append(model.get_avg_cost())
        for var in model._data_var:
            if var.name not in seen_var_names:
                input_vars.append(var)
                seen_var_names.add(var.name)

    optimizer = model_basic.YamlModel.build_optimizer({
        'metrics': self._metrics,
        'optimizer_conf': self.global_config['optimizer']
    })
    optimizer.minimize(cost_ops, scopes)

    # Pass 2: per-executor program tweaks, then run its startup program.
    for executor in self.global_config['executor']:
        entry = self._exector_context[executor['name']]
        scope = entry['scope']
        model = entry['model']
        program = model._build_param['model']['train_program']
        if not executor['is_update_sparse']:
            # Clear the push_sparse list of this model's program config.
            program_key = str(id(model.get_avg_cost().block.program))
            program._fleet_opt["program_configs"][program_key][
                "push_sparse"] = []
        if 'train_thread_num' not in executor:
            executor['train_thread_num'] = self.global_config[
                'train_thread_num']
        with fluid.scope_guard(scope):
            self._exe.run(model._build_param['model']['startup_program'])
        model.dump_model_program('./')

    # server init done
    if fleet.is_server():
        return 0

    self._dataset = {}
    for dataset_item in self.global_config['dataset']['data_list']:
        dataset_item['data_vars'] = input_vars
        dataset_item.update(self.global_config['io']['afs'])
        dataset_item["batch_size"] = self.global_config['batch_size']
        self._dataset[dataset_item['name']] = dataset.FluidTimeSplitDataset(
            dataset_item)
    # if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= last_day and config.reqi_dnn_plugin_pass >= last_pass:
    #    util.reqi_changeslot(config.hdfs_dnn_plugin_path, join_save_params, common_save_params, update_save_params, scope2, scope3)
    fleet.init_worker()
def print_log(self, log_str, params):
    """Print ``log_str``, optionally on worker 0 only.

    Args:
        log_str: message to print.
        params: dict that must contain 'master'. When truthy, only the
            worker with index 0 prints (and flushes stdout); otherwise every
            worker prints. ``params['index']`` is set to this worker's
            index. If 'stdout' is present, the current timestamp followed by
            ``log_str`` is appended to ``params['stdout']``.
    """
    params['index'] = fleet.worker_index()
    if not params['master']:
        print(log_str)
    elif fleet.worker_index() == 0:
        print(log_str)
        sys.stdout.flush()
    if 'stdout' in params:
        stamped = str(datetime.datetime.now()) + log_str
        params['stdout'] += stamped
def print_global_metrics(self, scope, model, monitor_data, stdout_str):
    """Calculate, log and clear every metric of ``model``.

    Args:
        scope: scope handed to the metric calculator.
        model: object whose ``get_metrics()`` mapping is iterated.
        monitor_data: string the metric result strings are appended to.
        stdout_str: string placed under 'stdout' in the ``print_log`` params.

    Returns:
        ``monitor_data`` with each metric's result string appended. The
        argument is an immutable string, so the caller's variable is not
        updated by the in-function ``+=``; the accumulated text is returned
        so it is not lost (earlier versions returned None).
    """
    metrics = model.get_metrics()
    metric_calculator = AUCMetric(None)
    for metric in metrics:
        metric_param = {'label': metric, 'metric_dict': metrics[metric]}
        metric_calculator.calculate(scope, metric_param)
        metric_result = metric_calculator.get_result_to_string()
        self.print_log(metric_result,
                       {'master': True,
                        'stdout': stdout_str})
        monitor_data += metric_result
        # Reset the calculator state before the next metric.
        metric_calculator.clear(scope, metric_param)
    return monitor_data
def save_model(self, day, pass_index, base_key):
    """Save the batch model for ``day``/``pass_index`` and record progress.

    Args:
        day: day value substituted into the 'batch_model' path template.
        pass_index: pass id; values below 1 select save mode 3, all others
            save mode 0.
        base_key: key recorded with the train progress.

    Returns:
        The generated model path.
    """
    cost_printer = util.CostPrinter(util.print_cost, {
        'master': True,
        'log_format': 'save model cost %s sec'
    })
    path_params = {'day': day, 'pass_id': pass_index}
    model_path = self._path_generator.generate_path('batch_model',
                                                    path_params)
    # mode 0: just save all; mode 3 (batch_model): unseen_day++, save all
    save_mode = 3 if pass_index < 1 else 0
    util.rank0_print("going to save_model %s" % model_path)
    fleet.save_persistables(None, model_path, mode=save_mode)
    if fleet._role_maker.is_first_worker():
        self._train_pass.save_train_progress(
            day, pass_index, base_key, model_path, is_checkpoint=True)
    cost_printer.done()
    return model_path
def save_xbox_model(self, day, pass_index, xbox_base_key, monitor_data):
    """Save the xbox base/delta model and append an entry to its done file.

    Steps: save persistables, optionally save the cache model, dump the
    dense inference params (first worker only), then append a JSON line to
    the done file (first worker only). Workers are synchronised with
    barriers after the dense dump and at the end.

    Args:
        day: day value substituted into the path templates.
        pass_index: values below 1 save a base model (save mode 2, id taken
            from ``xbox_base_key``); other values save a delta model (save
            mode 1, id taken from the current time).
        xbox_base_key: base key written to the done file.
        monitor_data: string written to the done file's 'monitor_data'.

    Returns:
        The local ``stdout_str``. It is only placed inside the cost-printer
        param dicts and never reassigned here, so the empty string it was
        initialised with is returned.
    """
    stdout_str = ""
    xbox_patch_id = str(int(time.time()))
    util.rank0_print("begin save delta model")

    cost_printer = util.CostPrinter(util.print_cost, {
        'master': True,
        'log_format': 'save xbox model cost %s sec',
        'stdout': stdout_str
    })
    if pass_index < 1:
        save_mode = 2
        xbox_patch_id = xbox_base_key
        model_path = self._path_generator.generate_path('xbox_base',
                                                        {'day': day})
        xbox_model_donefile = self._path_generator.generate_path(
            'xbox_base_done', {'day': day})
    else:
        save_mode = 1
        model_path = self._path_generator.generate_path(
            'xbox_delta', {'day': day,
                           'pass_id': pass_index})
        xbox_model_donefile = self._path_generator.generate_path(
            'xbox_delta_done', {'day': day})
    fleet.save_persistables(None, model_path, mode=save_mode)
    cost_printer.done()

    cost_printer = util.CostPrinter(util.print_cost, {
        'master': True,
        'log_format': 'save cache model cost %s sec',
        'stdout': stdout_str
    })
    model_file_handler = fs.FileHandler(self.global_config['io']['afs'])
    # Stays None when the cache model is disabled, so the log line below
    # cannot read an unbound name.
    cache_save_num = None
    if self.global_config['save_cache_model']:
        cache_save_num = fleet.save_cache_model(
            None, model_path, mode=save_mode)
        model_file_handler.write(
            "file_prefix:part\npart_num:16\nkey_num:%d\n" % cache_save_num,
            model_path + '/000_cache/sparse_cache.meta', 'w')
    cost_printer.done()
    if cache_save_num is not None:
        util.rank0_print("save xbox cache model done, key_num=%s" %
                         cache_save_num)

    save_env_param = {'executor': self._exe, 'save_combine': True}
    cost_printer = util.CostPrinter(util.print_cost, {
        'master': True,
        'log_format': 'save dense model cost %s sec',
        'stdout': stdout_str
    })
    if fleet._role_maker.is_first_worker():
        for executor in self.global_config['executor']:
            if 'layer_for_inference' not in executor:
                continue
            executor_name = executor['name']
            model = self._exector_context[executor_name]['model']
            save_env_param['inference_list'] = executor[
                'layer_for_inference']
            save_env_param['scope'] = self._exector_context[executor_name][
                'scope']
            model.dump_inference_param(save_env_param)
            # Copy each dumped file under <model_path>/dnn_plugin/.
            for dnn_layer in executor['layer_for_inference']:
                model_file_handler.cp(dnn_layer['save_file_name'],
                                      model_path + '/dnn_plugin/' +
                                      dnn_layer['save_file_name'])
    # Every worker waits here, not only the first one.
    fleet._role_maker._barrier_worker()
    cost_printer.done()

    xbox_done_info = {
        "id": xbox_patch_id,
        "key": xbox_base_key,
        "ins_path": "",
        "ins_tag": "feasign",
        "partition_type": "2",
        "record_count": "111111",
        "monitor_data": monitor_data,
        "mpi_size": str(fleet.worker_num()),
        "input": model_path.rstrip("/") + "/000",
        "job_id": util.get_env_value("JOB_ID"),
        "job_name": util.get_env_value("JOB_NAME")
    }
    if fleet._role_maker.is_first_worker():
        model_file_handler.write(
            json.dumps(xbox_done_info) + "\n", xbox_model_donefile, 'a')
        if pass_index > 0:
            self._train_pass.save_train_progress(
                day,
                pass_index,
                xbox_base_key,
                model_path,
                is_checkpoint=False)
    fleet._role_maker._barrier_worker()
    return stdout_str
def run_executor(self, executor_config, dataset, stdout_str):
    """Train one executor on ``dataset`` and optionally dump its xbox model.

    Runs ``train_from_dataset`` inside the executor's scope, reports the
    avg/min/max wall time in minutes across workers, stores the max under
    the executor context's 'cost', prints the global metrics and, when
    ``need_dump_inference(pass_id)`` and the executor's
    'dump_inference_model' flag both hold, saves the xbox model.

    Args:
        executor_config: executor entry from the global config; 'name',
            'train_thread_num' and 'dump_inference_model' are read.
        dataset: dataset passed to ``train_from_dataset``.
        stdout_str: string extended locally with the save output; it is
            immutable, so the caller's variable is not affected.
    """
    day = self._train_pass.date()
    pass_id = self._train_pass._pass_id
    xbox_base_key = self._train_pass._base_key
    executor_name = executor_config['name']
    scope = self._exector_context[executor_name]['scope']
    model = self._exector_context[executor_name]['model']
    with fluid.scope_guard(scope):
        util.rank0_print("Begin " + executor_name + " pass")
        begin = time.time()
        program = model._build_param['model']['train_program']
        self._exe.train_from_dataset(
            program,
            dataset,
            scope,
            thread=executor_config['train_thread_num'],
            debug=self.global_config['debug'])
        end = time.time()
        local_cost = (end - begin) / 60.0
        avg_cost = worker_numric_avg(local_cost)
        min_cost = worker_numric_min(local_cost)
        max_cost = worker_numric_max(local_cost)
        util.rank0_print("avg train time %s mins, min %s mins, max %s mins"
                         % (avg_cost, min_cost, max_cost))
        self._exector_context[executor_name]['cost'] = max_cost

        monitor_data = ""
        # A string argument cannot be updated in place by the callee, so
        # take the accumulated metric text from the return value when one
        # is provided; a None return leaves monitor_data as it was.
        collected = self.print_global_metrics(scope, model, monitor_data,
                                              stdout_str)
        if collected is not None:
            monitor_data = collected
        util.rank0_print("End " + executor_name + " pass")
        if self._train_pass.need_dump_inference(
                pass_id) and executor_config['dump_inference_model']:
            stdout_str += self.save_xbox_model(day, pass_id, xbox_base_key,
                                               monitor_data)
        fleet._role_maker._barrier_worker()
def startup(self, context):
    """Run the server, or prepare a worker for the first training day.

    Servers call ``fleet.run_server()`` and set the status to 'wait'.
    Workers create the ``TimeTrainPass``, load the checkpoint model unless
    'cold_start' is set, optionally save a first xbox base model, and set
    the status to 'begin_day'.
    """
    if fleet.is_server():
        fleet.run_server()
        context['status'] = 'wait'
        return

    stdout_str = ""
    self._train_pass = util.TimeTrainPass(self.global_config)
    train_pass = self._train_pass

    cold_start = self.global_config['cold_start']
    if not cold_start:
        cost_printer = util.CostPrinter(util.print_cost, {
            'master': True,
            'log_format': 'load model cost %s sec',
            'stdout': stdout_str
        })
        load_msg = "going to load model %s" % train_pass._checkpoint_model_path
        self.print_log(load_msg, {'master': True})
        # if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= self._train_pass.date()
        #    and config.reqi_dnn_plugin_pass >= self._pass_id:
        #    fleet.load_one_table(0, self._train_pass._checkpoint_model_path)
        # else:
        fleet.init_server(train_pass._checkpoint_model_path, mode=0)
        cost_printer.done()

    if self.global_config['save_first_base']:
        self.print_log("save_first_base=True", {'master': True})
        self.print_log("going to save xbox base model",
                       {'master': True,
                        'stdout': stdout_str})
        # The base key is the current Unix time in whole seconds.
        train_pass._base_key = int(time.time())
        stdout_str += self.save_xbox_model(train_pass.date(), 0,
                                           train_pass._base_key, "")
    context['status'] = 'begin_day'
def begin_day(self, context):
    """Advance the train pass and choose between 'end_day' and 'train_pass'.

    ``context['is_exit']`` is set when ``self._train_pass.next()`` is falsy.
    The rest of the method still runs in that case.
    """
    stdout_str = ""
    has_next = self._train_pass.next()
    if not has_next:
        context['is_exit'] = True
    day = self._train_pass.date()
    pass_id = self._train_pass._pass_id
    banner = "======== BEGIN DAY:%s ========" % day
    self.print_log(banner, {'master': True, 'stdout': stdout_str})
    # The day is over once the pass id reaches the per-day maximum.
    day_finished = pass_id == self._train_pass.max_pass_num_day()
    context['status'] = 'end_day' if day_finished else 'train_pass'
def end_day(self, context):
    """Shrink the tables, then save next day's xbox base and batch models.

    Sets the status back to 'begin_day', shrinks the sparse table and every
    executor's model, saves both models under the next date with pass 0,
    stores the new base key on the train pass and waits at a worker barrier.
    """
    current_day = self._train_pass.date()
    current_pass = self._train_pass._pass_id
    # The new base key is the current Unix time in whole seconds.
    xbox_base_key = int(time.time())
    context['status'] = 'begin_day'

    util.rank0_print("shrink table")
    cost_printer = util.CostPrinter(util.print_cost, {
        'master': True,
        'log_format': 'shrink table done, cost %s sec'
    })
    fleet.shrink_sparse_table()
    for executor in self._exector_context:
        executor_ctx = self._exector_context[executor]
        shrink_param = {
            'scope': executor_ctx['scope'],
            'decay': self.global_config['optimizer']['dense_decay_rate']
        }
        executor_ctx['model'].shrink(shrink_param)
    cost_printer.done()

    next_date = self._train_pass.date(delta_day=1)
    util.rank0_print("going to save xbox base model")
    self.save_xbox_model(next_date, 0, xbox_base_key, "")
    util.rank0_print("going to save batch model")
    self.save_model(next_date, 0, xbox_base_key)
    self._train_pass._base_key = xbox_base_key
    fleet._role_maker._barrier_worker()
def train_pass(self, context):
    """Run one training pass: load, shuffle, train, release, checkpoint.

    Sequence performed:
      1. Every dataset in ``self._dataset`` is loaded for the window that
         starts at the current train time, followed by a worker barrier.
      2. Every loaded dataset is globally shuffled, followed by a barrier.
      3. If ``global_config['prefetch_data']`` is truthy, the window of the
         next pass is preloaded.
      4. After another barrier each configured executor is run on its
         dataset, then all datasets release their memory.
      5. If ``is_checkpoint_pass(pass_id)`` is truthy the model is saved.
      6. A timing summary is logged and written to stdout.

    Args:
        context (dict): mutable state shared between the trainer's stages.
            ``'status'`` is set to ``'end_day'`` when the pass id equals
            ``max_pass_num_day()``; otherwise ``self._train_pass.next()``
            is called and ``'is_exit'`` is set to True if it is falsy.
    """
    stdout_str = ""
    day = self._train_pass.date()
    pass_id = self._train_pass._pass_id
    base_key = self._train_pass._base_key
    pass_time = self._train_pass._current_train_time.strftime("%Y%m%d%H%M")
    self.print_log(" ==== begin delta:%s ========" % pass_id,
                   {'master': True,
                    'stdout': stdout_str})
    train_begin_time = time.time()

    # --- load this pass's data into memory ---
    cost_printer = util.CostPrinter(util.print_cost, {
        'master': True,
        'log_format': 'load into memory done, cost %s sec',
        'stdout': stdout_str
    })
    current_dataset = {}
    for name in self._dataset:
        current_dataset[name] = self._dataset[name].load_dataset({
            'node_num': fleet.worker_num(),
            'node_idx': fleet.worker_index(),
            'begin_time': pass_time,
            'time_window_min': self._train_pass._interval_per_pass
        })
    fleet._role_maker._barrier_worker()
    cost_printer.done()

    # --- global shuffle ---
    util.rank0_print("going to global shuffle")
    cost_printer = util.CostPrinter(util.print_cost, {
        'master': True,
        'stdout': stdout_str,
        'log_format': 'global shuffle done, cost %s sec'
    })
    for name in current_dataset:
        current_dataset[name].global_shuffle(
            fleet, self.global_config['dataset']['shuffle_thread'])
    cost_printer.done()
    fleet._role_maker._barrier_worker()

    # --- optionally start loading the following pass's window ---
    if self.global_config['prefetch_data']:
        next_pass_time = (
            self._train_pass._current_train_time + datetime.timedelta(
                minutes=self._train_pass._interval_per_pass)
        ).strftime("%Y%m%d%H%M")
        for name in self._dataset:
            self._dataset[name].preload_dataset({
                'node_num': fleet.worker_num(),
                'node_idx': fleet.worker_index(),
                'begin_time': next_pass_time,
                'time_window_min': self._train_pass._interval_per_pass
            })

    # --- train, then free the in-memory data ---
    fleet._role_maker._barrier_worker()
    pure_train_begin = time.time()
    for executor in self.global_config['executor']:
        self.run_executor(executor,
                          current_dataset[executor['dataset_name']],
                          stdout_str)
    cost_printer = util.CostPrinter(util.print_cost, {
        'master': True,
        'log_format': 'release_memory cost %s sec'
    })
    for name in current_dataset:
        current_dataset[name].release_memory()
    # Stop the release timer explicitly, like the two timers above, so the
    # reported cost covers only the release loop.
    # NOTE(review): CostPrinter is defined outside this view; this assumes
    # done() ends the measurement and emits the log line - confirm.
    cost_printer.done()
    pure_train_cost = time.time() - pure_train_begin

    if self._train_pass.is_checkpoint_pass(pass_id):
        self.save_model(day, pass_id, base_key)

    # --- timing summary ---
    train_end_time = time.time()
    train_cost = train_end_time - train_begin_time
    other_cost = train_cost - pure_train_cost
    log_str = "finished train day %s pass %s time cost:%s sec job time cost:" % (
        day, pass_id, train_cost)
    log_str += ''.join(
        '[' + name + ':' + str(self._exector_context[name]['cost']) + ']'
        for name in self._exector_context)
    log_str += '[other_cost:' + str(other_cost) + ']'
    util.rank0_print(log_str)
    stdout_str += util.now_time_str() + log_str
    sys.stdout.write(stdout_str)
    fleet._role_maker._barrier_worker()

    # --- choose the next state ---
    if pass_id == self._train_pass.max_pass_num_day():
        context['status'] = 'end_day'
        return
    elif not self._train_pass.next():
        context['is_exit'] = True
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Training use fluid with one node only. Training use fluid with one node only.
""" """
...@@ -44,11 +43,14 @@ class OnlineLearningTrainer(TranspileTrainer): ...@@ -44,11 +43,14 @@ class OnlineLearningTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance) self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init) self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup) self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
if envs.get_platform() == "LINUX" and envs.get_global_env(
"dataset_class", None, "train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train) self.regist_context_processor('train_pass', self.dataset_train)
else: else:
self.regist_context_processor( self.regist_context_processor('train_pass',
'train_pass', self.dataloader_train) self.dataloader_train)
self.regist_context_processor('infer_pass', self.infer) self.regist_context_processor('infer_pass', self.infer)
self.regist_context_processor('terminal_pass', self.terminal) self.regist_context_processor('terminal_pass', self.terminal)
...@@ -110,27 +112,27 @@ class OnlineLearningTrainer(TranspileTrainer): ...@@ -110,27 +112,27 @@ class OnlineLearningTrainer(TranspileTrainer):
if state == "TRAIN": if state == "TRAIN":
inputs = self.model.get_inputs() inputs = self.model.get_inputs()
namespace = "train.reader" namespace = "train.reader"
train_data_path = envs.get_global_env( train_data_path = envs.get_global_env("train_data_path", None,
"train_data_path", None, namespace) namespace)
else: else:
inputs = self.model.get_infer_inputs() inputs = self.model.get_infer_inputs()
namespace = "evaluate.reader" namespace = "evaluate.reader"
train_data_path = envs.get_global_env( train_data_path = envs.get_global_env("test_data_path", None,
"test_data_path", None, namespace) namespace)
threads = int(envs.get_runtime_environ("train.trainer.threads")) threads = int(envs.get_runtime_environ("train.trainer.threads"))
batch_size = envs.get_global_env("batch_size", None, namespace) batch_size = envs.get_global_env("batch_size", None, namespace)
reader_class = envs.get_global_env("class", None, namespace) reader_class = envs.get_global_env("class", None, namespace)
abs_dir = os.path.dirname(os.path.abspath(__file__)) abs_dir = os.path.dirname(os.path.abspath(__file__))
reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
pipe_cmd = "python {} {} {} {}".format( pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state,
reader, reader_class, state, self._config_yaml) self._config_yaml)
if train_data_path.startswith("paddlerec::"): if train_data_path.startswith("paddlerec::"):
package_base = envs.get_runtime_environ("PACKAGE_BASE") package_base = envs.get_runtime_environ("PACKAGE_BASE")
assert package_base is not None assert package_base is not None
train_data_path = os.path.join( train_data_path = os.path.join(package_base,
package_base, train_data_path.split("::")[1]) train_data_path.split("::")[1])
dataset = fluid.DatasetFactory().create_dataset() dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(inputs) dataset.set_use_var(inputs)
...@@ -166,14 +168,16 @@ class OnlineLearningTrainer(TranspileTrainer): ...@@ -166,14 +168,16 @@ class OnlineLearningTrainer(TranspileTrainer):
ins = self._get_dataset_ins() ins = self._get_dataset_ins()
begin_time = time.time() begin_time = time.time()
self._exe.train_from_dataset(program=fluid.default_main_program(), self._exe.train_from_dataset(
dataset=dataset, program=fluid.default_main_program(),
fetch_list=self.fetch_vars, dataset=dataset,
fetch_info=self.fetch_alias, fetch_list=self.fetch_vars,
print_period=self.fetch_period) fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time() end_time = time.time()
times = end_time-begin_time times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times)) print("epoch {} using time {}, speed {:.2f} lines/s".format(
i, times, ins / times))
self.save(i, "train", is_fleet=True) self.save(i, "train", is_fleet=True)
fleet.stop_worker() fleet.stop_worker()
......
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Training use fluid with one node only. Training use fluid with one node only.
""" """
...@@ -36,8 +35,9 @@ class SingleTrainer(TranspileTrainer): ...@@ -36,8 +35,9 @@ class SingleTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance) self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init) self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup) self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None,
"train.reader") != "DataLoader": if envs.get_platform() == "LINUX" and envs.get_global_env(
"dataset_class", None, "train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train) self.regist_context_processor('train_pass', self.dataset_train)
else: else:
self.regist_context_processor('train_pass', self.dataloader_train) self.regist_context_processor('train_pass', self.dataloader_train)
...@@ -73,9 +73,8 @@ class SingleTrainer(TranspileTrainer): ...@@ -73,9 +73,8 @@ class SingleTrainer(TranspileTrainer):
reader = self._get_dataloader("TRAIN") reader = self._get_dataloader("TRAIN")
epochs = envs.get_global_env("train.epochs") epochs = envs.get_global_env("train.epochs")
program = fluid.compiler.CompiledProgram( program = fluid.compiler.CompiledProgram(fluid.default_main_program(
fluid.default_main_program()).with_data_parallel( )).with_data_parallel(loss_name=self.model.get_avg_cost().name)
loss_name=self.model.get_avg_cost().name)
metrics_varnames = [] metrics_varnames = []
metrics_format = [] metrics_format = []
...@@ -94,9 +93,8 @@ class SingleTrainer(TranspileTrainer): ...@@ -94,9 +93,8 @@ class SingleTrainer(TranspileTrainer):
batch_id = 0 batch_id = 0
try: try:
while True: while True:
metrics_rets = self._exe.run( metrics_rets = self._exe.run(program=program,
program=program, fetch_list=metrics_varnames)
fetch_list=metrics_varnames)
metrics = [epoch, batch_id] metrics = [epoch, batch_id]
metrics.extend(metrics_rets) metrics.extend(metrics_rets)
...@@ -117,14 +115,16 @@ class SingleTrainer(TranspileTrainer): ...@@ -117,14 +115,16 @@ class SingleTrainer(TranspileTrainer):
epochs = envs.get_global_env("train.epochs") epochs = envs.get_global_env("train.epochs")
for i in range(epochs): for i in range(epochs):
begin_time = time.time() begin_time = time.time()
self._exe.train_from_dataset(program=fluid.default_main_program(), self._exe.train_from_dataset(
dataset=dataset, program=fluid.default_main_program(),
fetch_list=self.fetch_vars, dataset=dataset,
fetch_info=self.fetch_alias, fetch_list=self.fetch_vars,
print_period=self.fetch_period) fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time() end_time = time.time()
times = end_time - begin_time times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times)) print("epoch {} using time {}, speed {:.2f} lines/s".format(
i, times, ins / times))
self.save(i, "train", is_fleet=False) self.save(i, "train", is_fleet=False)
context['status'] = 'infer_pass' context['status'] = 'infer_pass'
......
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Training use fluid with one node only. Training use fluid with one node only.
""" """
...@@ -36,8 +35,8 @@ special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info"] ...@@ -36,8 +35,8 @@ special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info"]
class TDMClusterTrainer(ClusterTrainer): class TDMClusterTrainer(ClusterTrainer):
def server(self, context): def server(self, context):
namespace = "train.startup" namespace = "train.startup"
init_model_path = envs.get_global_env( init_model_path = envs.get_global_env("cluster.init_model_path", "",
"cluster.init_model_path", "", namespace) namespace)
assert init_model_path != "", "Cluster train must has init_model for TDM" assert init_model_path != "", "Cluster train must has init_model for TDM"
fleet.init_server(init_model_path) fleet.init_server(init_model_path)
logger.info("TDM: load model from {}".format(init_model_path)) logger.info("TDM: load model from {}".format(init_model_path))
...@@ -48,24 +47,27 @@ class TDMClusterTrainer(ClusterTrainer): ...@@ -48,24 +47,27 @@ class TDMClusterTrainer(ClusterTrainer):
self._exe.run(fleet.startup_program) self._exe.run(fleet.startup_program)
namespace = "train.startup" namespace = "train.startup"
load_tree = envs.get_global_env( load_tree = envs.get_global_env("tree.load_tree", True, namespace)
"tree.load_tree", True, namespace)
self.tree_layer_path = envs.get_global_env( self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "",
"tree.tree_layer_path", "", namespace) namespace)
self.tree_travel_path = envs.get_global_env(
"tree.tree_travel_path", "", namespace) self.tree_travel_path = envs.get_global_env("tree.tree_travel_path",
self.tree_info_path = envs.get_global_env( "", namespace)
"tree.tree_info_path", "", namespace)
self.tree_info_path = envs.get_global_env("tree.tree_info_path", "",
save_init_model = envs.get_global_env( namespace)
"cluster.save_init_model", False, namespace)
init_model_path = envs.get_global_env( save_init_model = envs.get_global_env("cluster.save_init_model", False,
"cluster.init_model_path", "", namespace) namespace)
init_model_path = envs.get_global_env("cluster.init_model_path", "",
namespace)
if load_tree: if load_tree:
# covert tree to tensor, set it into Fluid's variable. # covert tree to tensor, set it into Fluid's variable.
for param_name in special_param: for param_name in special_param:
param_t = fluid.global_scope().find_var(param_name).get_tensor() param_t = fluid.global_scope().find_var(param_name).get_tensor(
)
param_array = self._tdm_prepare(param_name) param_array = self._tdm_prepare(param_name)
param_t.set(param_array.astype('int32'), self._place) param_t.set(param_array.astype('int32'), self._place)
...@@ -93,8 +95,8 @@ class TDMClusterTrainer(ClusterTrainer): ...@@ -93,8 +95,8 @@ class TDMClusterTrainer(ClusterTrainer):
def _tdm_travel_prepare(self): def _tdm_travel_prepare(self):
"""load tdm tree param from npy/list file""" """load tdm tree param from npy/list file"""
travel_array = np.load(self.tree_travel_path) travel_array = np.load(self.tree_travel_path)
logger.info("TDM Tree leaf node nums: {}".format( logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[
travel_array.shape[0])) 0]))
return travel_array return travel_array
def _tdm_layer_prepare(self): def _tdm_layer_prepare(self):
......
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Training use fluid with one node only. Training use fluid with one node only.
""" """
...@@ -27,33 +26,38 @@ from paddlerec.core.utils import envs ...@@ -27,33 +26,38 @@ from paddlerec.core.utils import envs
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid") logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer", special_param = [
"TDM_Tree_Info", "TDM_Tree_Emb"] "TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info", "TDM_Tree_Emb"
]
class TDMSingleTrainer(SingleTrainer): class TDMSingleTrainer(SingleTrainer):
def startup(self, context): def startup(self, context):
namespace = "train.startup" namespace = "train.startup"
load_persistables = envs.get_global_env( load_persistables = envs.get_global_env("single.load_persistables",
"single.load_persistables", False, namespace) False, namespace)
persistables_model_path = envs.get_global_env( persistables_model_path = envs.get_global_env(
"single.persistables_model_path", "", namespace) "single.persistables_model_path", "", namespace)
load_tree = envs.get_global_env( load_tree = envs.get_global_env("tree.load_tree", False, namespace)
"tree.load_tree", False, namespace)
self.tree_layer_path = envs.get_global_env( self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "",
"tree.tree_layer_path", "", namespace) namespace)
self.tree_travel_path = envs.get_global_env(
"tree.tree_travel_path", "", namespace) self.tree_travel_path = envs.get_global_env("tree.tree_travel_path",
self.tree_info_path = envs.get_global_env( "", namespace)
"tree.tree_info_path", "", namespace)
self.tree_emb_path = envs.get_global_env( self.tree_info_path = envs.get_global_env("tree.tree_info_path", "",
"tree.tree_emb_path", "", namespace) namespace)
save_init_model = envs.get_global_env( self.tree_emb_path = envs.get_global_env("tree.tree_emb_path", "",
"single.save_init_model", False, namespace) namespace)
init_model_path = envs.get_global_env(
"single.init_model_path", "", namespace) save_init_model = envs.get_global_env("single.save_init_model", False,
namespace)
init_model_path = envs.get_global_env("single.init_model_path", "",
namespace)
self._exe.run(fluid.default_startup_program()) self._exe.run(fluid.default_startup_program())
if load_persistables: if load_persistables:
...@@ -68,7 +72,8 @@ class TDMSingleTrainer(SingleTrainer): ...@@ -68,7 +72,8 @@ class TDMSingleTrainer(SingleTrainer):
if load_tree: if load_tree:
# covert tree to tensor, set it into Fluid's variable. # covert tree to tensor, set it into Fluid's variable.
for param_name in special_param: for param_name in special_param:
param_t = fluid.global_scope().find_var(param_name).get_tensor() param_t = fluid.global_scope().find_var(param_name).get_tensor(
)
param_array = self._tdm_prepare(param_name) param_array = self._tdm_prepare(param_name)
if param_name == 'TDM_Tree_Emb': if param_name == 'TDM_Tree_Emb':
param_t.set(param_array.astype('float32'), self._place) param_t.set(param_array.astype('float32'), self._place)
...@@ -102,15 +107,15 @@ class TDMSingleTrainer(SingleTrainer): ...@@ -102,15 +107,15 @@ class TDMSingleTrainer(SingleTrainer):
def _tdm_travel_prepare(self): def _tdm_travel_prepare(self):
"""load tdm tree param from npy/list file""" """load tdm tree param from npy/list file"""
travel_array = np.load(self.tree_travel_path) travel_array = np.load(self.tree_travel_path)
logger.info("TDM Tree leaf node nums: {}".format( logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[
travel_array.shape[0])) 0]))
return travel_array return travel_array
def _tdm_emb_prepare(self): def _tdm_emb_prepare(self):
"""load tdm tree param from npy/list file""" """load tdm tree param from npy/list file"""
emb_array = np.load(self.tree_emb_path) emb_array = np.load(self.tree_emb_path)
logger.info("TDM Tree node nums from emb: {}".format( logger.info("TDM Tree node nums from emb: {}".format(emb_array.shape[
emb_array.shape[0])) 0]))
return emb_array return emb_array
def _tdm_layer_prepare(self): def _tdm_layer_prepare(self):
......
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Training use fluid with DistributeTranspiler Training use fluid with DistributeTranspiler
""" """
...@@ -39,9 +38,12 @@ class TranspileTrainer(Trainer): ...@@ -39,9 +38,12 @@ class TranspileTrainer(Trainer):
self.increment_models = [] self.increment_models = []
def processor_register(self): def processor_register(self):
print("Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first") print(
"Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first"
)
def _get_dataloader(self, state="TRAIN"): def _get_dataloader(self, state="TRAIN"):
if state == "TRAIN": if state == "TRAIN":
dataloader = self.model._data_loader dataloader = self.model._data_loader
namespace = "train.reader" namespace = "train.reader"
...@@ -59,12 +61,14 @@ class TranspileTrainer(Trainer): ...@@ -59,12 +61,14 @@ class TranspileTrainer(Trainer):
if sparse_slots is None and dense_slots is None: if sparse_slots is None and dense_slots is None:
reader_class = envs.get_global_env("class", None, namespace) reader_class = envs.get_global_env("class", None, namespace)
reader = dataloader_instance.dataloader( reader = dataloader_instance.dataloader(reader_class, state,
reader_class, state, self._config_yaml) self._config_yaml)
reader_class = envs.lazy_instance_by_fliename(reader_class, class_name) reader_class = envs.lazy_instance_by_fliename(reader_class,
class_name)
reader_ins = reader_class(self._config_yaml) reader_ins = reader_class(self._config_yaml)
else: else:
reader = dataloader_instance.slotdataloader("", state, self._config_yaml) reader = dataloader_instance.slotdataloader("", state,
self._config_yaml)
reader_ins = SlotReader(self._config_yaml) reader_ins = SlotReader(self._config_yaml)
if hasattr(reader_ins, 'generate_batch_from_trainfiles'): if hasattr(reader_ins, 'generate_batch_from_trainfiles'):
...@@ -94,13 +98,13 @@ class TranspileTrainer(Trainer): ...@@ -94,13 +98,13 @@ class TranspileTrainer(Trainer):
if state == "TRAIN": if state == "TRAIN":
inputs = self.model.get_inputs() inputs = self.model.get_inputs()
namespace = "train.reader" namespace = "train.reader"
train_data_path = envs.get_global_env( train_data_path = envs.get_global_env("train_data_path", None,
"train_data_path", None, namespace) namespace)
else: else:
inputs = self.model.get_infer_inputs() inputs = self.model.get_infer_inputs()
namespace = "evaluate.reader" namespace = "evaluate.reader"
train_data_path = envs.get_global_env( train_data_path = envs.get_global_env("test_data_path", None,
"test_data_path", None, namespace) namespace)
sparse_slots = envs.get_global_env("sparse_slots", None, namespace) sparse_slots = envs.get_global_env("sparse_slots", None, namespace)
dense_slots = envs.get_global_env("dense_slots", None, namespace) dense_slots = envs.get_global_env("dense_slots", None, namespace)
...@@ -112,8 +116,8 @@ class TranspileTrainer(Trainer): ...@@ -112,8 +116,8 @@ class TranspileTrainer(Trainer):
reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
if sparse_slots is None and dense_slots is None: if sparse_slots is None and dense_slots is None:
pipe_cmd = "python {} {} {} {}".format( pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state,
reader, reader_class, state, self._config_yaml) self._config_yaml)
else: else:
padding = envs.get_global_env("padding", 0, namespace) padding = envs.get_global_env("padding", 0, namespace)
pipe_cmd = "python {} {} {} {} {} {} {} {}".format( pipe_cmd = "python {} {} {} {} {} {} {} {}".format(
...@@ -123,8 +127,8 @@ class TranspileTrainer(Trainer): ...@@ -123,8 +127,8 @@ class TranspileTrainer(Trainer):
if train_data_path.startswith("paddlerec::"): if train_data_path.startswith("paddlerec::"):
package_base = envs.get_runtime_environ("PACKAGE_BASE") package_base = envs.get_runtime_environ("PACKAGE_BASE")
assert package_base is not None assert package_base is not None
train_data_path = os.path.join( train_data_path = os.path.join(package_base,
package_base, train_data_path.split("::")[1]) train_data_path.split("::")[1])
dataset = fluid.DatasetFactory().create_dataset() dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(inputs) dataset.set_use_var(inputs)
...@@ -140,11 +144,11 @@ class TranspileTrainer(Trainer): ...@@ -140,11 +144,11 @@ class TranspileTrainer(Trainer):
debug_mode = envs.get_global_env("reader_debug_mode", False, namespace) debug_mode = envs.get_global_env("reader_debug_mode", False, namespace)
if debug_mode: if debug_mode:
print( print("--- Dataset Debug Mode Begin , show pre 10 data of {}---".
"--- Dataset Debug Mode Begin , show pre 10 data of {}---".format(file_list[0])) format(file_list[0]))
os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd)) os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd))
print( print("--- Dataset Debug Mode End , show pre 10 data of {}---".
"--- Dataset Debug Mode End , show pre 10 data of {}---".format(file_list[0])) format(file_list[0]))
exit(0) exit(0)
return dataset return dataset
...@@ -166,27 +170,29 @@ class TranspileTrainer(Trainer): ...@@ -166,27 +170,29 @@ class TranspileTrainer(Trainer):
if not need_save(epoch_id, save_interval, False): if not need_save(epoch_id, save_interval, False):
return return
feed_varnames = envs.get_global_env( feed_varnames = envs.get_global_env("save.inference.feed_varnames",
"save.inference.feed_varnames", None, namespace) None, namespace)
fetch_varnames = envs.get_global_env( fetch_varnames = envs.get_global_env(
"save.inference.fetch_varnames", None, namespace) "save.inference.fetch_varnames", None, namespace)
if feed_varnames is None or fetch_varnames is None: if feed_varnames is None or fetch_varnames is None:
return return
fetch_vars = [fluid.default_main_program().global_block().vars[varname] fetch_vars = [
for varname in fetch_varnames] fluid.default_main_program().global_block().vars[varname]
dirname = envs.get_global_env( for varname in fetch_varnames
"save.inference.dirname", None, namespace) ]
dirname = envs.get_global_env("save.inference.dirname", None,
namespace)
assert dirname is not None assert dirname is not None
dirname = os.path.join(dirname, str(epoch_id)) dirname = os.path.join(dirname, str(epoch_id))
if is_fleet: if is_fleet:
fleet.save_inference_model( fleet.save_inference_model(self._exe, dirname, feed_varnames,
self._exe, dirname, feed_varnames, fetch_vars) fetch_vars)
else: else:
fluid.io.save_inference_model( fluid.io.save_inference_model(dirname, feed_varnames,
dirname, feed_varnames, fetch_vars, self._exe) fetch_vars, self._exe)
self.inference_models.append((epoch_id, dirname)) self.inference_models.append((epoch_id, dirname))
def save_persistables(): def save_persistables():
...@@ -196,8 +202,8 @@ class TranspileTrainer(Trainer): ...@@ -196,8 +202,8 @@ class TranspileTrainer(Trainer):
if not need_save(epoch_id, save_interval, False): if not need_save(epoch_id, save_interval, False):
return return
dirname = envs.get_global_env( dirname = envs.get_global_env("save.increment.dirname", None,
"save.increment.dirname", None, namespace) namespace)
assert dirname is not None assert dirname is not None
dirname = os.path.join(dirname, str(epoch_id)) dirname = os.path.join(dirname, str(epoch_id))
...@@ -275,10 +281,9 @@ class TranspileTrainer(Trainer): ...@@ -275,10 +281,9 @@ class TranspileTrainer(Trainer):
batch_id = 0 batch_id = 0
try: try:
while True: while True:
metrics_rets = self._exe.run( metrics_rets = self._exe.run(program=program,
program=program, fetch_list=metrics_varnames,
fetch_list=metrics_varnames, return_numpy=is_return_numpy)
return_numpy=is_return_numpy)
metrics = [epoch, batch_id] metrics = [epoch, batch_id]
metrics.extend(metrics_rets) metrics.extend(metrics_rets)
......
...@@ -24,7 +24,7 @@ from paddlerec.core.utils import util as util ...@@ -24,7 +24,7 @@ from paddlerec.core.utils import util as util
class DatasetHolder(object): class DatasetHolder(object):
""" """
Dataset Base Dataset Holder
""" """
__metaclass__ = abc.ABCMeta __metaclass__ = abc.ABCMeta
...@@ -74,11 +74,17 @@ class TimeSplitDatasetHolder(DatasetHolder): ...@@ -74,11 +74,17 @@ class TimeSplitDatasetHolder(DatasetHolder):
Dataset.__init__(self, config) Dataset.__init__(self, config)
if 'data_donefile' not in config or config['data_donefile'] is None: if 'data_donefile' not in config or config['data_donefile'] is None:
config['data_donefile'] = config['data_path'] + "/to.hadoop.done" config['data_donefile'] = config['data_path'] + "/to.hadoop.done"
self._path_generator = util.PathGenerator({'templates': [ self._path_generator = util.PathGenerator({
{'name': 'data_path', 'template': config['data_path']}, 'templates': [{
{'name': 'donefile_path', 'template': config['data_donefile']} 'name': 'data_path',
]}) 'template': config['data_path']
self._split_interval = config['split_interval'] # data split N mins per dir }, {
'name': 'donefile_path',
'template': config['data_donefile']
}]
})
self._split_interval = config[
'split_interval'] # data split N mins per dir
self._data_file_handler = fs.FileHandler(config) self._data_file_handler = fs.FileHandler(config)
def _format_data_time(self, daytime_str, time_window_mins): def _format_data_time(self, daytime_str, time_window_mins):
...@@ -91,7 +97,8 @@ class TimeSplitDatasetHolder(DatasetHolder): ...@@ -91,7 +97,8 @@ class TimeSplitDatasetHolder(DatasetHolder):
return None, 0 return None, 0
if mins_of_day % self._split_interval != 0: if mins_of_day % self._split_interval != 0:
skip_mins = self._split_interval - (mins_of_day % self._split_interval) skip_mins = self._split_interval - (mins_of_day %
self._split_interval)
data_time = data_time + datetime.timedelta(minutes=skip_mins) data_time = data_time + datetime.timedelta(minutes=skip_mins)
time_window_mins = time_window_mins - skip_mins time_window_mins = time_window_mins - skip_mins
return data_time, time_window_mins return data_time, time_window_mins
...@@ -106,17 +113,24 @@ class TimeSplitDatasetHolder(DatasetHolder): ...@@ -106,17 +113,24 @@ class TimeSplitDatasetHolder(DatasetHolder):
True/False True/False
""" """
is_ready = True is_ready = True
data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins) data_time, windows_mins = self._format_data_time(daytime_str,
time_window_mins)
while time_window_mins > 0: while time_window_mins > 0:
file_path = self._path_generator.generate_path('donefile_path', {'time_format': data_time}) file_path = self._path_generator.generate_path(
'donefile_path', {'time_format': data_time})
if not self._data_file_handler.is_exist(file_path): if not self._data_file_handler.is_exist(file_path):
is_ready = False is_ready = False
break break
time_window_mins = time_window_mins - self._split_interval time_window_mins = time_window_mins - self._split_interval
data_time = data_time + datetime.timedelta(minutes=self._split_interval) data_time = data_time + datetime.timedelta(
minutes=self._split_interval)
return is_ready return is_ready
def get_file_list(self, daytime_str, time_window_mins, node_num=1, node_idx=0): def get_file_list(self,
daytime_str,
time_window_mins,
node_num=1,
node_idx=0):
""" """
data in [daytime_str, daytime_str + time_window_mins], random shard to node_num, return shard[node_idx] data in [daytime_str, daytime_str + time_window_mins], random shard to node_num, return shard[node_idx]
Args: Args:
...@@ -128,26 +142,32 @@ class TimeSplitDatasetHolder(DatasetHolder): ...@@ -128,26 +142,32 @@ class TimeSplitDatasetHolder(DatasetHolder):
list, data_shard[node_idx] list, data_shard[node_idx]
""" """
data_file_list = [] data_file_list = []
data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins) data_time, windows_mins = self._format_data_time(daytime_str,
time_window_mins)
while time_window_mins > 0: while time_window_mins > 0:
file_path = self._path_generator.generate_path('data_path', {'time_format': data_time}) file_path = self._path_generator.generate_path(
'data_path', {'time_format': data_time})
sub_file_list = self._data_file_handler.ls(file_path) sub_file_list = self._data_file_handler.ls(file_path)
for sub_file in sub_file_list: for sub_file in sub_file_list:
sub_file_name = self._data_file_handler.get_file_name(sub_file) sub_file_name = self._data_file_handler.get_file_name(sub_file)
if not sub_file_name.startswith(self._config['filename_prefix']): if not sub_file_name.startswith(self._config[
'filename_prefix']):
continue continue
if hash(sub_file_name) % node_num == node_idx: if hash(sub_file_name) % node_num == node_idx:
data_file_list.append(sub_file) data_file_list.append(sub_file)
time_window_mins = time_window_mins - self._split_interval time_window_mins = time_window_mins - self._split_interval
data_time = data_time + datetime.timedelta(minutes=self._split_interval) data_time = data_time + datetime.timedelta(
minutes=self._split_interval)
return data_file_list return data_file_list
def _alloc_dataset(self, file_list): def _alloc_dataset(self, file_list):
""" """ """ """
dataset = fluid.DatasetFactory().create_dataset(self._config['dataset_type']) dataset = fluid.DatasetFactory().create_dataset(self._config[
'dataset_type'])
dataset.set_batch_size(self._config['batch_size']) dataset.set_batch_size(self._config['batch_size'])
dataset.set_thread(self._config['load_thread']) dataset.set_thread(self._config['load_thread'])
dataset.set_hdfs_config(self._config['fs_name'], self._config['fs_ugi']) dataset.set_hdfs_config(self._config['fs_name'],
self._config['fs_ugi'])
dataset.set_pipe_command(self._config['data_converter']) dataset.set_pipe_command(self._config['data_converter'])
dataset.set_filelist(file_list) dataset.set_filelist(file_list)
dataset.set_use_var(self._config['data_vars']) dataset.set_use_var(self._config['data_vars'])
...@@ -163,7 +183,9 @@ class TimeSplitDatasetHolder(DatasetHolder): ...@@ -163,7 +183,9 @@ class TimeSplitDatasetHolder(DatasetHolder):
while self.check_ready(begin_time, windown_min) == False: while self.check_ready(begin_time, windown_min) == False:
print("dataset not ready, time:" + begin_time) print("dataset not ready, time:" + begin_time)
time.sleep(30) time.sleep(30)
file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx']) file_list = self.get_file_list(begin_time, windown_min,
params['node_num'],
params['node_idx'])
self._datasets[begin_time] = self._alloc_dataset(file_list) self._datasets[begin_time] = self._alloc_dataset(file_list)
self._datasets[begin_time].load_into_memory() self._datasets[begin_time].load_into_memory()
else: else:
...@@ -176,9 +198,12 @@ class TimeSplitDatasetHolder(DatasetHolder): ...@@ -176,9 +198,12 @@ class TimeSplitDatasetHolder(DatasetHolder):
windown_min = params['time_window_min'] windown_min = params['time_window_min']
if begin_time not in self._datasets: if begin_time not in self._datasets:
if self.check_ready(begin_time, windown_min): if self.check_ready(begin_time, windown_min):
file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx']) file_list = self.get_file_list(begin_time, windown_min,
params['node_num'],
params['node_idx'])
self._datasets[begin_time] = self._alloc_dataset(file_list) self._datasets[begin_time] = self._alloc_dataset(file_list)
self._datasets[begin_time].preload_into_memory(self._config['preload_thread']) self._datasets[begin_time].preload_into_memory(self._config[
'preload_thread'])
return True return True
return False return False
......
...@@ -17,10 +17,11 @@ import sys ...@@ -17,10 +17,11 @@ import sys
from paddlerec.core.utils.envs import lazy_instance_by_fliename from paddlerec.core.utils.envs import lazy_instance_by_fliename
from paddlerec.core.reader import SlotReader from paddlerec.core.reader import SlotReader
from paddlerec.core.utils import envs
if len(sys.argv) < 4: if len(sys.argv) < 4:
raise ValueError("reader only accept 3 argument: 1. reader_class 2.train/evaluate/slotreader 3.yaml_abs_path") raise ValueError(
"reader only accept 3 argument: 1. reader_class 2.train/evaluate/slotreader 3.yaml_abs_path"
)
reader_package = sys.argv[1] reader_package = sys.argv[1]
......
...@@ -95,7 +95,7 @@ def path_adapter(path): ...@@ -95,7 +95,7 @@ def path_adapter(path):
l_p = path.split("paddlerec.")[1].replace(".", "/") l_p = path.split("paddlerec.")[1].replace(".", "/")
return os.path.join(package, l_p) return os.path.join(package, l_p)
else: else:
return path return path
def windows_path_converter(path): def windows_path_converter(path):
...@@ -159,8 +159,8 @@ def pretty_print_envs(envs, header=None): ...@@ -159,8 +159,8 @@ def pretty_print_envs(envs, header=None):
def lazy_instance_by_package(package, class_name): def lazy_instance_by_package(package, class_name):
models = get_global_env("train.model.models") models = get_global_env("train.model.models")
model_package = __import__( model_package = __import__(package,
package, globals(), locals(), package.split(".")) globals(), locals(), package.split("."))
instance = getattr(model_package, class_name) instance = getattr(model_package, class_name)
return instance return instance
...@@ -170,8 +170,8 @@ def lazy_instance_by_fliename(abs, class_name): ...@@ -170,8 +170,8 @@ def lazy_instance_by_fliename(abs, class_name):
sys.path.append(dirname) sys.path.append(dirname)
package = os.path.splitext(os.path.basename(abs))[0] package = os.path.splitext(os.path.basename(abs))[0]
model_package = __import__( model_package = __import__(package,
package, globals(), locals(), package.split(".")) globals(), locals(), package.split("."))
instance = getattr(model_package, class_name) instance = getattr(model_package, class_name)
return instance return instance
...@@ -189,8 +189,7 @@ def get_platform(): ...@@ -189,8 +189,7 @@ def get_platform():
def find_free_port(): def find_free_port():
def __free_port(): def __free_port():
with closing(socket.socket(socket.AF_INET, with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
socket.SOCK_STREAM)) as s:
s.bind(('', 0)) s.bind(('', 0))
return s.getsockname()[1] return s.getsockname()[1]
......
...@@ -18,7 +18,7 @@ from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient ...@@ -18,7 +18,7 @@ from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
def is_afs_path(path): def is_afs_path(path):
"""R """is_afs_path
""" """
if path.startswith("afs") or path.startswith("hdfs"): if path.startswith("afs") or path.startswith("hdfs"):
return True return True
...@@ -133,8 +133,9 @@ class FileHandler(object): ...@@ -133,8 +133,9 @@ class FileHandler(object):
if mode.find('a') >= 0: if mode.find('a') >= 0:
org_content = self._hdfs_client.cat(dest_path) org_content = self._hdfs_client.cat(dest_path)
content = content + org_content content = content + org_content
self._local_fs_client.write(content, temp_local_file, self._local_fs_client.write(
mode) # fleet hdfs_client only support upload, so write tmp file content, temp_local_file, mode
) # fleet hdfs_client only support upload, so write tmp file
self._hdfs_client.delete(dest_path + ".tmp") self._hdfs_client.delete(dest_path + ".tmp")
self._hdfs_client.upload(dest_path + ".tmp", temp_local_file) self._hdfs_client.upload(dest_path + ".tmp", temp_local_file)
self._hdfs_client.delete(dest_path + ".bak") self._hdfs_client.delete(dest_path + ".bak")
...@@ -158,7 +159,8 @@ class FileHandler(object): ...@@ -158,7 +159,8 @@ class FileHandler(object):
files = [] files = []
if is_afs_path(path): if is_afs_path(path):
files = self._hdfs_client.ls(path) files = self._hdfs_client.ls(path)
files = [path + '/' + self.get_file_name(fi) for fi in files] # absulte path files = [path + '/' + self.get_file_name(fi)
for fi in files] # absulte path
else: else:
files = self._local_fs_client.ls(path) files = self._local_fs_client.ls(path)
files = [path + '/' + fi for fi in files] # absulte path files = [path + '/' + fi for fi in files] # absulte path
......
...@@ -22,6 +22,7 @@ from paddlerec.core.utils import fs as fs ...@@ -22,6 +22,7 @@ from paddlerec.core.utils import fs as fs
def save_program_proto(path, program=None): def save_program_proto(path, program=None):
if program is None: if program is None:
_program = fluid.default_main_program() _program = fluid.default_main_program()
else: else:
...@@ -175,7 +176,8 @@ class PathGenerator(object): ...@@ -175,7 +176,8 @@ class PathGenerator(object):
""" """
if template_name in self._templates: if template_name in self._templates:
if 'time_format' in param: if 'time_format' in param:
str = param['time_format'].strftime(self._templates[template_name]) str = param['time_format'].strftime(self._templates[
template_name])
return str.format(**param) return str.format(**param)
return self._templates[template_name].format(**param) return self._templates[template_name].format(**param)
else: else:
...@@ -198,31 +200,39 @@ class TimeTrainPass(object): ...@@ -198,31 +200,39 @@ class TimeTrainPass(object):
self._begin_day = make_datetime(day_fields[0].strip()) self._begin_day = make_datetime(day_fields[0].strip())
if len(day_fields) == 1 or len(day_fields[1]) == 0: if len(day_fields) == 1 or len(day_fields[1]) == 0:
# 100 years, meaning to continuous running # 100 years, meaning to continuous running
self._end_day = self._begin_day + datetime.timedelta(days=36500) self._end_day = self._begin_day + datetime.timedelta(
days=36500)
else: else:
# example: 2020212+10 # example: 2020212+10
run_day = int(day_fields[1].strip()) run_day = int(day_fields[1].strip())
self._end_day = self._begin_day + datetime.timedelta(days=run_day) self._end_day = self._begin_day + datetime.timedelta(
days=run_day)
else: else:
# example: {20191001..20191031} # example: {20191001..20191031}
days = os.popen("echo -n " + self._config['days']).read().split(" ") days = os.popen("echo -n " + self._config['days']).read().split(
" ")
self._begin_day = make_datetime(days[0]) self._begin_day = make_datetime(days[0])
self._end_day = make_datetime(days[len(days) - 1]) self._end_day = make_datetime(days[len(days) - 1])
self._checkpoint_interval = self._config['checkpoint_interval'] self._checkpoint_interval = self._config['checkpoint_interval']
self._dump_inference_interval = self._config['dump_inference_interval'] self._dump_inference_interval = self._config['dump_inference_interval']
self._interval_per_pass = self._config['train_time_interval'] # train N min data per pass self._interval_per_pass = self._config[
'train_time_interval'] # train N min data per pass
self._pass_id = 0 self._pass_id = 0
self._inference_pass_id = 0 self._inference_pass_id = 0
self._pass_donefile_handler = None self._pass_donefile_handler = None
if 'pass_donefile_name' in self._config: if 'pass_donefile_name' in self._config:
self._train_pass_donefile = global_config['output_path'] + '/' + self._config['pass_donefile_name'] self._train_pass_donefile = global_config[
'output_path'] + '/' + self._config['pass_donefile_name']
if fs.is_afs_path(self._train_pass_donefile): if fs.is_afs_path(self._train_pass_donefile):
self._pass_donefile_handler = fs.FileHandler(global_config['io']['afs']) self._pass_donefile_handler = fs.FileHandler(global_config[
'io']['afs'])
else: else:
self._pass_donefile_handler = fs.FileHandler(global_config['io']['local_fs']) self._pass_donefile_handler = fs.FileHandler(global_config[
'io']['local_fs'])
last_done = self._pass_donefile_handler.cat(self._train_pass_donefile).strip().split('\n')[-1] last_done = self._pass_donefile_handler.cat(
self._train_pass_donefile).strip().split('\n')[-1]
done_fileds = last_done.split('\t') done_fileds = last_done.split('\t')
if len(done_fileds) > 4: if len(done_fileds) > 4:
self._base_key = done_fileds[1] self._base_key = done_fileds[1]
...@@ -236,15 +246,18 @@ class TimeTrainPass(object): ...@@ -236,15 +246,18 @@ class TimeTrainPass(object):
""" """
return 24 * 60 / self._interval_per_pass return 24 * 60 / self._interval_per_pass
def save_train_progress(self, day, pass_id, base_key, model_path, is_checkpoint): def save_train_progress(self, day, pass_id, base_key, model_path,
is_checkpoint):
"""R """R
""" """
if is_checkpoint: if is_checkpoint:
self._checkpoint_pass_id = pass_id self._checkpoint_pass_id = pass_id
self._checkpoint_model_path = model_path self._checkpoint_model_path = model_path
done_content = "%s\t%s\t%s\t%s\t%d\n" % (day, base_key, done_content = "%s\t%s\t%s\t%s\t%d\n" % (
self._checkpoint_model_path, self._checkpoint_pass_id, pass_id) day, base_key, self._checkpoint_model_path,
self._pass_donefile_handler.write(done_content, self._train_pass_donefile, 'a') self._checkpoint_pass_id, pass_id)
self._pass_donefile_handler.write(done_content,
self._train_pass_donefile, 'a')
pass pass
def init_pass_by_id(self, date_str, pass_id): def init_pass_by_id(self, date_str, pass_id):
...@@ -286,12 +299,14 @@ class TimeTrainPass(object): ...@@ -286,12 +299,14 @@ class TimeTrainPass(object):
if self._pass_id < 1: if self._pass_id < 1:
self.init_pass_by_time(self._begin_day.strftime("%Y%m%d%H%M")) self.init_pass_by_time(self._begin_day.strftime("%Y%m%d%H%M"))
else: else:
next_time = self._current_train_time + datetime.timedelta(minutes=self._interval_per_pass) next_time = self._current_train_time + datetime.timedelta(
minutes=self._interval_per_pass)
if (next_time - self._end_day).total_seconds() > 0: if (next_time - self._end_day).total_seconds() > 0:
has_next = False has_next = False
else: else:
self.init_pass_by_time(next_time.strftime("%Y%m%d%H%M")) self.init_pass_by_time(next_time.strftime("%Y%m%d%H%M"))
if has_next and (self._inference_pass_id < self._pass_id or self._pass_id < old_pass_id): if has_next and (self._inference_pass_id < self._pass_id or
self._pass_id < old_pass_id):
self._inference_pass_id = self._pass_id - 1 self._inference_pass_id = self._pass_id - 1
return has_next return has_next
...@@ -319,9 +334,11 @@ class TimeTrainPass(object): ...@@ -319,9 +334,11 @@ class TimeTrainPass(object):
Return: Return:
date(current_train_time + delta_day) date(current_train_time + delta_day)
""" """
return (self._current_train_time + datetime.timedelta(days=delta_day)).strftime("%Y%m%d") return (self._current_train_time + datetime.timedelta(days=delta_day)
).strftime("%Y%m%d")
def timestamp(self, delta_day=0): def timestamp(self, delta_day=0):
"""R """R
""" """
return (self._current_train_time + datetime.timedelta(days=delta_day)).timestamp() return (self._current_train_time + datetime.timedelta(days=delta_day)
).timestamp()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# PaddleRec Benchmark # PaddleRec Benchmark
> 占位 > 占位
\ No newline at end of file
# PaddleRec 贡献代码 # PaddleRec 贡献代码
> 占位 > 占位
\ No newline at end of file
...@@ -279,4 +279,4 @@ class Metric(object): ...@@ -279,4 +279,4 @@ class Metric(object):
pass pass
``` ```
全局指标的计算及输出,需要分别继承并实现以上四个成员函数。具体实现的例子,可以参考[auc_metric.py](../core/metrics/auc_metrics.py) 全局指标的计算及输出,需要分别继承并实现以上四个成员函数。具体实现的例子,可以参考[auc_metric.py](../core/metrics/auc_metrics.py)
\ No newline at end of file
...@@ -7,5 +7,3 @@ ...@@ -7,5 +7,3 @@
### K8S集群运行分布式 ### K8S集群运行分布式
> 占位 > 占位
# 常见问题FAQ # 常见问题FAQ
> 占位 > 占位
\ No newline at end of file
# PaddleRec 单机训练 # PaddleRec 单机训练
> 占位 > 占位
\ No newline at end of file
...@@ -12,4 +12,3 @@ ...@@ -12,4 +12,3 @@
| 多任务 | [ESMM]() | ✓ | x | ✓ | x | ✓ | ✓ | | 多任务 | [ESMM]() | ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 | [DSSM]() | ✓ | x | ✓ | x | ✓ | ✓ | | 匹配 | [DSSM]() | ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 | [Multiview-Simnet]() | ✓ | x | ✓ | x | ✓ | ✓ | | 匹配 | [Multiview-Simnet]() | ✓ | x | ✓ | x | ✓ | ✓ |
# PaddleRec 模型调参 # PaddleRec 模型调参
> 占位 > 占位
\ No newline at end of file
# PaddleRec 离线预测 # PaddleRec 离线预测
\ No newline at end of file
...@@ -5,4 +5,3 @@ ...@@ -5,4 +5,3 @@
## [参数服务器训练](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839) ## [参数服务器训练](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -37,4 +37,3 @@ train: ...@@ -37,4 +37,3 @@ train:
dirname: "inference" dirname: "inference"
epoch_interval: 100 epoch_interval: 100
save_last: True save_last: True
...@@ -31,7 +31,8 @@ class Model(ModelBase): ...@@ -31,7 +31,8 @@ class Model(ModelBase):
def train_net(self): def train_net(self):
""" network definition """ """ network definition """
data = fluid.data(name="input", shape=[None, self.max_len], dtype='int64') data = fluid.data(
name="input", shape=[None, self.max_len], dtype='int64')
label = fluid.data(name="label", shape=[None, 1], dtype='int64') label = fluid.data(name="label", shape=[None, 1], dtype='int64')
seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64') seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
...@@ -51,7 +52,9 @@ class Model(ModelBase): ...@@ -51,7 +52,9 @@ class Model(ModelBase):
# full connect layer # full connect layer
fc_1 = fluid.layers.fc(input=[conv], size=self.hid_dim) fc_1 = fluid.layers.fc(input=[conv], size=self.hid_dim)
# softmax layer # softmax layer
prediction = fluid.layers.fc(input=[fc_1], size=self.class_dim, act="softmax") prediction = fluid.layers.fc(input=[fc_1],
size=self.class_dim,
act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label) cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost) avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=label) acc = fluid.layers.accuracy(input=prediction, label=label)
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys import sys
from paddlerec.core.reader import Reader from paddlerec.core.reader import Reader
...@@ -38,7 +37,8 @@ class TrainReader(Reader): ...@@ -38,7 +37,8 @@ class TrainReader(Reader):
data = [int(i) for i in data] data = [int(i) for i in data]
label = [int(i) for i in label] label = [int(i) for i in label]
seq_len = [int(i) for i in seq_len] seq_len = [int(i) for i in seq_len]
print >> sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)]) print >> sys.stderr, str(
[('data', data), ('label', label), ('seq_len', seq_len)])
yield [('data', data), ('label', label), ('seq_len', seq_len)] yield [('data', data), ('label', label), ('seq_len', seq_len)]
return data_iter return data_iter
...@@ -87,4 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification ...@@ -87,4 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
| :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: | | :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: |
| ag news dataset | TagSpace | -- | -- | -- | -- | | ag news dataset | TagSpace | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- | | -- | Classification | -- | -- | -- | -- |
...@@ -47,4 +47,3 @@ train: ...@@ -47,4 +47,3 @@ train:
dirname: "inference" dirname: "inference"
epoch_interval: 100 epoch_interval: 100
save_last: True save_last: True
...@@ -26,8 +26,10 @@ class Model(ModelBase): ...@@ -26,8 +26,10 @@ class Model(ModelBase):
ModelBase.__init__(self, config) ModelBase.__init__(self, config)
self.cost = None self.cost = None
self.metrics = {} self.metrics = {}
self.vocab_text_size = envs.get_global_env("vocab_text_size", None, self._namespace) self.vocab_text_size = envs.get_global_env("vocab_text_size", None,
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, self._namespace) self._namespace)
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None,
self._namespace)
self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace) self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace)
self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace) self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace)
self.win_size = envs.get_global_env("win_size", None, self._namespace) self.win_size = envs.get_global_env("win_size", None, self._namespace)
...@@ -35,8 +37,9 @@ class Model(ModelBase): ...@@ -35,8 +37,9 @@ class Model(ModelBase):
self.neg_size = envs.get_global_env("neg_size", None, self._namespace) self.neg_size = envs.get_global_env("neg_size", None, self._namespace)
def train_net(self): def train_net(self):
""" network definition """ """ network"""
text = fluid.data(name="text", shape=[None, 1], lod_level=1, dtype='int64') text = fluid.data(
name="text", shape=[None, 1], lod_level=1, dtype='int64')
pos_tag = fluid.data( pos_tag = fluid.data(
name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64') name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64')
neg_tag = fluid.data( neg_tag = fluid.data(
...@@ -45,13 +48,19 @@ class Model(ModelBase): ...@@ -45,13 +48,19 @@ class Model(ModelBase):
self._data_var = [text, pos_tag, neg_tag] self._data_var = [text, pos_tag, neg_tag]
text_emb = fluid.embedding( text_emb = fluid.embedding(
input=text, size=[self.vocab_text_size, self.emb_dim], param_attr="text_emb") input=text,
size=[self.vocab_text_size, self.emb_dim],
param_attr="text_emb")
text_emb = fluid.layers.squeeze(input=text_emb, axes=[1]) text_emb = fluid.layers.squeeze(input=text_emb, axes=[1])
pos_tag_emb = fluid.embedding( pos_tag_emb = fluid.embedding(
input=pos_tag, size=[self.vocab_tag_size, self.emb_dim], param_attr="tag_emb") input=pos_tag,
size=[self.vocab_tag_size, self.emb_dim],
param_attr="tag_emb")
pos_tag_emb = fluid.layers.squeeze(input=pos_tag_emb, axes=[1]) pos_tag_emb = fluid.layers.squeeze(input=pos_tag_emb, axes=[1])
neg_tag_emb = fluid.embedding( neg_tag_emb = fluid.embedding(
input=neg_tag, size=[self.vocab_tag_size, self.emb_dim], param_attr="tag_emb") input=neg_tag,
size=[self.vocab_tag_size, self.emb_dim],
param_attr="tag_emb")
neg_tag_emb = fluid.layers.squeeze(input=neg_tag_emb, axes=[1]) neg_tag_emb = fluid.layers.squeeze(input=neg_tag_emb, axes=[1])
conv_1d = fluid.nets.sequence_conv_pool( conv_1d = fluid.nets.sequence_conv_pool(
...@@ -65,7 +74,8 @@ class Model(ModelBase): ...@@ -65,7 +74,8 @@ class Model(ModelBase):
size=self.emb_dim, size=self.emb_dim,
param_attr="text_hid") param_attr="text_hid")
cos_pos = nn.cos_sim(pos_tag_emb, text_hid) cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb) mul_text_hid = fluid.layers.sequence_expand_as(
x=text_hid, y=neg_tag_emb)
mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid) mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid)
cos_neg_all = fluid.layers.sequence_reshape( cos_neg_all = fluid.layers.sequence_reshape(
input=mul_cos_neg, new_dim=self.neg_size) input=mul_cos_neg, new_dim=self.neg_size)
...@@ -74,7 +84,10 @@ class Model(ModelBase): ...@@ -74,7 +84,10 @@ class Model(ModelBase):
#calculate hinge loss #calculate hinge loss
loss_part1 = nn.elementwise_sub( loss_part1 = nn.elementwise_sub(
tensor.fill_constant_batch_size_like( tensor.fill_constant_batch_size_like(
input=cos_pos, shape=[-1, 1], value=self.margin, dtype='float32'), input=cos_pos,
shape=[-1, 1],
value=self.margin,
dtype='float32'),
cos_pos) cos_pos)
loss_part2 = nn.elementwise_add(loss_part1, cos_neg) loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
loss_part3 = nn.elementwise_max( loss_part3 = nn.elementwise_max(
...@@ -85,7 +98,7 @@ class Model(ModelBase): ...@@ -85,7 +98,7 @@ class Model(ModelBase):
less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32') less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
correct = nn.reduce_sum(less) correct = nn.reduce_sum(less)
self.cost = avg_cost self.cost = avg_cost
self.metrics["correct"] = correct self.metrics["correct"] = correct
self.metrics["cos_pos"] = cos_pos self.metrics["cos_pos"] = cos_pos
...@@ -96,7 +109,8 @@ class Model(ModelBase): ...@@ -96,7 +109,8 @@ class Model(ModelBase):
return self.metrics return self.metrics
def optimizer(self): def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.base_lr", None, self._namespace) learning_rate = envs.get_global_env("hyper_parameters.base_lr", None,
self._namespace)
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate) sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
return sgd_optimizer return sgd_optimizer
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys import sys
import numpy as np import numpy as np
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -23,13 +23,26 @@ class Model(ModelBase): ...@@ -23,13 +23,26 @@ class Model(ModelBase):
ModelBase.__init__(self, config) ModelBase.__init__(self, config)
def input(self): def input(self):
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace) TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
Neg = envs.get_global_env("hyper_parameters.NEG", None, self._namespace) self._namespace)
self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) Neg = envs.get_global_env("hyper_parameters.NEG", None,
self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) self._namespace)
self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i
in range(Neg)] self.query = fluid.data(
name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(
name="doc_pos",
shape=[-1, TRIGRAM_D],
dtype='float32',
lod_level=0)
self.doc_negs = [
fluid.data(
name="doc_neg_" + str(i),
shape=[-1, TRIGRAM_D],
dtype="float32",
lod_level=0) for i in range(Neg)
]
self._data_var.append(self.query) self._data_var.append(self.query)
self._data_var.append(self.doc_pos) self._data_var.append(self.doc_pos)
for input in self.doc_negs: for input in self.doc_negs:
...@@ -37,16 +50,24 @@ class Model(ModelBase): ...@@ -37,16 +50,24 @@ class Model(ModelBase):
if self._platform != "LINUX": if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator( self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def net(self, is_infer=False): def net(self, is_infer=False):
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace) hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None, self._namespace) self._namespace)
hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None,
self._namespace)
def fc(data, hidden_layers, hidden_acts, names): def fc(data, hidden_layers, hidden_acts, names):
fc_inputs = [data] fc_inputs = [data]
for i in range(len(hidden_layers)): for i in range(len(hidden_layers)):
xavier = fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i]) xavier = fluid.initializer.Xavier(
uniform=True,
fan_in=fc_inputs[-1].shape[1],
fan_out=hidden_layers[i])
out = fluid.layers.fc(input=fc_inputs[-1], out = fluid.layers.fc(input=fc_inputs[-1],
size=hidden_layers[i], size=hidden_layers[i],
act=hidden_acts[i], act=hidden_acts[i],
...@@ -56,8 +77,10 @@ class Model(ModelBase): ...@@ -56,8 +77,10 @@ class Model(ModelBase):
fc_inputs.append(out) fc_inputs.append(out)
return fc_inputs[-1] return fc_inputs[-1]
query_fc = fc(self.query, hidden_layers, hidden_acts, ['query_l1', 'query_l2', 'query_l3']) query_fc = fc(self.query, hidden_layers, hidden_acts,
doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts, ['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3']) ['query_l1', 'query_l2', 'query_l3'])
doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts,
['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc) self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)
if is_infer: if is_infer:
...@@ -65,13 +88,17 @@ class Model(ModelBase): ...@@ -65,13 +88,17 @@ class Model(ModelBase):
R_Q_D_ns = [] R_Q_D_ns = []
for i, doc_neg in enumerate(self.doc_negs): for i, doc_neg in enumerate(self.doc_negs):
doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, [
['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)]) 'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
'doc_neg_l3_' + str(i)
])
R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i)) R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
concat_Rs = fluid.layers.concat(input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1) concat_Rs = fluid.layers.concat(
input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
prob = fluid.layers.softmax(concat_Rs, axis=1) prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[4, 1]) hit_prob = fluid.layers.slice(
prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob)) loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
self.avg_cost = fluid.layers.mean(x=loss) self.avg_cost = fluid.layers.mean(x=loss)
...@@ -91,18 +118,28 @@ class Model(ModelBase): ...@@ -91,18 +118,28 @@ class Model(ModelBase):
self.metrics() self.metrics()
def optimizer(self): def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.SGD(learning_rate) optimizer = fluid.optimizer.SGD(learning_rate)
return optimizer return optimizer
def infer_input(self): def infer_input(self):
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace) TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) self._namespace)
self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) self.query = fluid.data(
name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(
name="doc_pos",
shape=[-1, TRIGRAM_D],
dtype='float32',
lod_level=0)
self._infer_data_var = [self.query, self.doc_pos] self._infer_data_var = [self.query, self.doc_pos]
self._infer_data_loader = fluid.io.DataLoader.from_generator( self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self): def infer_net(self):
self.infer_input() self.infer_input()
......
...@@ -22,4 +22,3 @@ mkdir -p data/train ...@@ -22,4 +22,3 @@ mkdir -p data/train
mkdir -p data/test mkdir -p data/test
python generate_synthetic_data.py python generate_synthetic_data.py
...@@ -18,8 +18,10 @@ from paddlerec.core.utils import envs ...@@ -18,8 +18,10 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader): class EvaluateReader(Reader):
def init(self): def init(self):
self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model") self.query_slots = envs.get_global_env("hyper_parameters.query_slots",
self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model") None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots",
None, "train.model")
self.all_slots = [] self.all_slots = []
for i in range(self.query_slots): for i in range(self.query_slots):
......
...@@ -21,7 +21,11 @@ class Dataset: ...@@ -21,7 +21,11 @@ class Dataset:
class SyntheticDataset(Dataset): class SyntheticDataset(Dataset):
def __init__(self, sparse_feature_dim, query_slot_num, title_slot_num, dataset_size=10000): def __init__(self,
sparse_feature_dim,
query_slot_num,
title_slot_num,
dataset_size=10000):
# ids are randomly generated # ids are randomly generated
self.ids_per_slot = 10 self.ids_per_slot = 10
self.sparse_feature_dim = sparse_feature_dim self.sparse_feature_dim = sparse_feature_dim
...@@ -46,14 +50,20 @@ class SyntheticDataset(Dataset): ...@@ -46,14 +50,20 @@ class SyntheticDataset(Dataset):
for i in range(self.title_slot_num): for i in range(self.title_slot_num):
pt_slot = generate_ids(self.ids_per_slot, pt_slot = generate_ids(self.ids_per_slot,
self.sparse_feature_dim) self.sparse_feature_dim)
pt_slot = [str(fea) + ':' + str(i + self.query_slot_num) for fea in pt_slot] pt_slot = [
str(fea) + ':' + str(i + self.query_slot_num)
for fea in pt_slot
]
pos_title_slots += pt_slot pos_title_slots += pt_slot
if is_train: if is_train:
for i in range(self.title_slot_num): for i in range(self.title_slot_num):
nt_slot = generate_ids(self.ids_per_slot, nt_slot = generate_ids(self.ids_per_slot,
self.sparse_feature_dim) self.sparse_feature_dim)
nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in nt_slot = [
nt_slot] str(fea) + ':' +
str(i + self.query_slot_num + self.title_slot_num)
for fea in nt_slot
]
neg_title_slots += nt_slot neg_title_slots += nt_slot
yield query_slots + pos_title_slots + neg_title_slots yield query_slots + pos_title_slots + neg_title_slots
else: else:
...@@ -76,7 +86,8 @@ if __name__ == '__main__': ...@@ -76,7 +86,8 @@ if __name__ == '__main__':
query_slots = 1 query_slots = 1
title_slots = 1 title_slots = 1
dataset_size = 10 dataset_size = 10
dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots, dataset_size) dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots,
dataset_size)
train_reader = dataset.train() train_reader = dataset.train()
test_reader = dataset.test() test_reader = dataset.test()
......
...@@ -103,12 +103,18 @@ class Model(ModelBase): ...@@ -103,12 +103,18 @@ class Model(ModelBase):
def init_config(self): def init_config(self):
self._fetch_interval = 1 self._fetch_interval = 1
query_encoder = envs.get_global_env("hyper_parameters.query_encoder", None, self._namespace) query_encoder = envs.get_global_env("hyper_parameters.query_encoder",
title_encoder = envs.get_global_env("hyper_parameters.title_encoder", None, self._namespace) None, self._namespace)
query_encode_dim = envs.get_global_env("hyper_parameters.query_encode_dim", None, self._namespace) title_encoder = envs.get_global_env("hyper_parameters.title_encoder",
title_encode_dim = envs.get_global_env("hyper_parameters.title_encode_dim", None, self._namespace) None, self._namespace)
query_slots = envs.get_global_env("hyper_parameters.query_slots", None, self._namespace) query_encode_dim = envs.get_global_env(
title_slots = envs.get_global_env("hyper_parameters.title_slots", None, self._namespace) "hyper_parameters.query_encode_dim", None, self._namespace)
title_encode_dim = envs.get_global_env(
"hyper_parameters.title_encode_dim", None, self._namespace)
query_slots = envs.get_global_env("hyper_parameters.query_slots", None,
self._namespace)
title_slots = envs.get_global_env("hyper_parameters.title_slots", None,
self._namespace)
factory = SimpleEncoderFactory() factory = SimpleEncoderFactory()
self.query_encoders = [ self.query_encoders = [
factory.create(query_encoder, query_encode_dim) factory.create(query_encoder, query_encode_dim)
...@@ -119,10 +125,13 @@ class Model(ModelBase): ...@@ -119,10 +125,13 @@ class Model(ModelBase):
for i in range(title_slots) for i in range(title_slots)
] ]
self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) self.emb_size = envs.get_global_env(
self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim", None, self._namespace) "hyper_parameters.sparse_feature_dim", None, self._namespace)
self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim",
None, self._namespace)
self.emb_shape = [self.emb_size, self.emb_dim] self.emb_shape = [self.emb_size, self.emb_dim]
self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace) self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size",
None, self._namespace)
self.margin = 0.1 self.margin = 0.1
def input(self, is_train=True): def input(self, is_train=True):
...@@ -133,8 +142,10 @@ class Model(ModelBase): ...@@ -133,8 +142,10 @@ class Model(ModelBase):
] ]
self.pt_slots = [ self.pt_slots = [
fluid.data( fluid.data(
name="%d" % (i + len(self.query_encoders)), shape=[None, 1], lod_level=1, dtype='int64') name="%d" % (i + len(self.query_encoders)),
for i in range(len(self.title_encoders)) shape=[None, 1],
lod_level=1,
dtype='int64') for i in range(len(self.title_encoders))
] ]
if is_train == False: if is_train == False:
...@@ -142,9 +153,11 @@ class Model(ModelBase): ...@@ -142,9 +153,11 @@ class Model(ModelBase):
self.nt_slots = [ self.nt_slots = [
fluid.data( fluid.data(
name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1, name="%d" %
dtype='int64') (i + len(self.query_encoders) + len(self.title_encoders)),
for i in range(len(self.title_encoders)) shape=[None, 1],
lod_level=1,
dtype='int64') for i in range(len(self.title_encoders))
] ]
return self.q_slots + self.pt_slots + self.nt_slots return self.q_slots + self.pt_slots + self.nt_slots
...@@ -153,11 +166,15 @@ class Model(ModelBase): ...@@ -153,11 +166,15 @@ class Model(ModelBase):
res = self.input() res = self.input()
self._data_var = res self._data_var = res
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace) use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
False, self._namespace)
if self._platform != "LINUX" or use_dataloader: if self._platform != "LINUX" or use_dataloader:
self._data_loader = fluid.io.DataLoader.from_generator( self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=256, use_double_buffer=False, iterable=False) feed_list=self._data_var,
capacity=256,
use_double_buffer=False,
iterable=False)
def get_acc(self, x, y): def get_acc(self, x, y):
less = tensor.cast(cf.less_than(x, y), dtype='float32') less = tensor.cast(cf.less_than(x, y), dtype='float32')
...@@ -190,10 +207,12 @@ class Model(ModelBase): ...@@ -190,10 +207,12 @@ class Model(ModelBase):
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs) self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
] ]
pt_encodes = [ pt_encodes = [
self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs) self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
] ]
nt_encodes = [ nt_encodes = [
self.title_encoders[i].forward(emb) for i, emb in enumerate(nt_embs) self.title_encoders[i].forward(emb)
for i, emb in enumerate(nt_embs)
] ]
# concat multi view for query, pos_title, neg_title # concat multi view for query, pos_title, neg_title
...@@ -252,7 +271,8 @@ class Model(ModelBase): ...@@ -252,7 +271,8 @@ class Model(ModelBase):
self.metrics() self.metrics()
def optimizer(self): def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate=learning_rate) optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
return optimizer return optimizer
...@@ -261,7 +281,10 @@ class Model(ModelBase): ...@@ -261,7 +281,10 @@ class Model(ModelBase):
self._infer_data_var = res self._infer_data_var = res
self._infer_data_loader = fluid.io.DataLoader.from_generator( self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self): def infer_net(self):
self.infer_input() self.infer_input()
...@@ -281,7 +304,8 @@ class Model(ModelBase): ...@@ -281,7 +304,8 @@ class Model(ModelBase):
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs) self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
] ]
pt_encodes = [ pt_encodes = [
self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs) self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
] ]
# concat multi view for query, pos_title, neg_title # concat multi view for query, pos_title, neg_title
q_concat = fluid.layers.concat(q_encodes) q_concat = fluid.layers.concat(q_encodes)
......
...@@ -18,8 +18,10 @@ from paddlerec.core.utils import envs ...@@ -18,8 +18,10 @@ from paddlerec.core.utils import envs
class TrainReader(Reader): class TrainReader(Reader):
def init(self): def init(self):
self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model") self.query_slots = envs.get_global_env("hyper_parameters.query_slots",
self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model") None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots",
None, "train.model")
self.all_slots = [] self.all_slots = []
for i in range(self.query_slots): for i in range(self.query_slots):
......
...@@ -37,4 +37,3 @@ ...@@ -37,4 +37,3 @@
python -m paddlerec.run -m paddlerec.models.match.dssm # dssm python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
``` ```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -20,9 +20,11 @@ from paddlerec.core.reader import Reader ...@@ -20,9 +20,11 @@ from paddlerec.core.reader import Reader
class EvaluateReader(Reader): class EvaluateReader(Reader):
def init(self): def init(self):
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', all_field_id = [
'129', '101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
'205', '206', '207', '210', '216', '508', '509', '702', '853', '301'] '125', '126', '127', '128', '129', '205', '206', '207', '210',
'216', '508', '509', '702', '853', '301'
]
self.all_field_id_dict = defaultdict(int) self.all_field_id_dict = defaultdict(int)
for i, field_id in enumerate(all_field_id): for i, field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False, i] self.all_field_id_dict[field_id] = [False, i]
......
...@@ -21,9 +21,11 @@ from paddlerec.core.reader import Reader ...@@ -21,9 +21,11 @@ from paddlerec.core.reader import Reader
class TrainReader(Reader): class TrainReader(Reader):
def init(self): def init(self):
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', all_field_id = [
'129', '101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
'205', '206', '207', '210', '216', '508', '509', '702', '853', '301'] '125', '126', '127', '128', '129', '205', '206', '207', '210',
'216', '508', '509', '702', '853', '301'
]
self.all_field_id_dict = defaultdict(int) self.all_field_id_dict = defaultdict(int)
for i, field_id in enumerate(all_field_id): for i, field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False, i] self.all_field_id_dict[field_id] = [False, i]
......
...@@ -28,11 +28,13 @@ class Model(ModelBase): ...@@ -28,11 +28,13 @@ class Model(ModelBase):
init_stddev = 1.0 init_stddev = 1.0
scales = 1.0 / np.sqrt(data.shape[1]) scales = 1.0 / np.sqrt(data.shape[1])
p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag, p_attr = fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(loc=0.0, name='%s_weight' % tag,
scale=init_stddev * scales)) initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=init_stddev * scales))
b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1)) b_attr = fluid.ParamAttr(
name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
out = fluid.layers.fc(input=data, out = fluid.layers.fc(input=data,
size=out_dim, size=out_dim,
...@@ -44,7 +46,11 @@ class Model(ModelBase): ...@@ -44,7 +46,11 @@ class Model(ModelBase):
def input_data(self): def input_data(self):
sparse_input_ids = [ sparse_input_ids = [
fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1) for i in range(0, 23) fluid.data(
name="field_" + str(i),
shape=[-1, 1],
dtype="int64",
lod_level=1) for i in range(0, 23)
] ]
label_ctr = fluid.data(name="ctr", shape=[-1, 1], dtype="int64") label_ctr = fluid.data(name="ctr", shape=[-1, 1], dtype="int64")
label_cvr = fluid.data(name="cvr", shape=[-1, 1], dtype="int64") label_cvr = fluid.data(name="cvr", shape=[-1, 1], dtype="int64")
...@@ -55,19 +61,23 @@ class Model(ModelBase): ...@@ -55,19 +61,23 @@ class Model(ModelBase):
def net(self, inputs, is_infer=False): def net(self, inputs, is_infer=False):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace) vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
embed_size = envs.get_global_env("hyper_parameters.embed_size", None, self._namespace) self._namespace)
embed_size = envs.get_global_env("hyper_parameters.embed_size", None,
self._namespace)
emb = [] emb = []
for data in inputs[0:-2]: for data in inputs[0:-2]:
feat_emb = fluid.embedding(input=data, feat_emb = fluid.embedding(
size=[vocab_size, embed_size], input=data,
param_attr=fluid.ParamAttr(name='dis_emb', size=[vocab_size, embed_size],
learning_rate=5, param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Xavier( name='dis_emb',
fan_in=embed_size, fan_out=embed_size) learning_rate=5,
), initializer=fluid.initializer.Xavier(
is_sparse=True) fan_in=embed_size, fan_out=embed_size)),
field_emb = fluid.layers.sequence_pool(input=feat_emb, pool_type='sum') is_sparse=True)
field_emb = fluid.layers.sequence_pool(
input=feat_emb, pool_type='sum')
emb.append(field_emb) emb.append(field_emb)
concat_emb = fluid.layers.concat(emb, axis=1) concat_emb = fluid.layers.concat(emb, axis=1)
...@@ -85,14 +95,20 @@ class Model(ModelBase): ...@@ -85,14 +95,20 @@ class Model(ModelBase):
ctr_clk = inputs[-2] ctr_clk = inputs[-2]
ctcvr_buy = inputs[-1] ctcvr_buy = inputs[-1]
ctr_prop_one = fluid.layers.slice(ctr_out, axes=[1], starts=[1], ends=[2]) ctr_prop_one = fluid.layers.slice(
cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2]) ctr_out, axes=[1], starts=[1], ends=[2])
cvr_prop_one = fluid.layers.slice(
cvr_out, axes=[1], starts=[1], ends=[2])
ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one) ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one,
ctcvr_prop = fluid.layers.concat(input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1) cvr_prop_one)
ctcvr_prop = fluid.layers.concat(
input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)
auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk) auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(
auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy) input=ctr_out, label=ctr_clk)
auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(
input=ctcvr_prop, label=ctcvr_buy)
if is_infer: if is_infer:
self._infer_results["AUC_ctr"] = auc_ctr self._infer_results["AUC_ctr"] = auc_ctr
...@@ -100,7 +116,8 @@ class Model(ModelBase): ...@@ -100,7 +116,8 @@ class Model(ModelBase):
return return
loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk) loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk)
loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy) loss_ctcvr = fluid.layers.cross_entropy(
input=ctcvr_prop, label=ctcvr_buy)
cost = loss_ctr + loss_ctcvr cost = loss_ctr + loss_ctcvr
avg_cost = fluid.layers.mean(cost) avg_cost = fluid.layers.mean(cost)
...@@ -117,5 +134,8 @@ class Model(ModelBase): ...@@ -117,5 +134,8 @@ class Model(ModelBase):
def infer_net(self): def infer_net(self):
self._infer_data_var = self.input_data() self._infer_data_var = self.input_data()
self._infer_data_loader = fluid.io.DataLoader.from_generator( self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self.net(self._infer_data_var, is_infer=True) self.net(self._infer_data_var, is_infer=True)
...@@ -19,6 +19,7 @@ from paddlerec.core.reader import Reader ...@@ -19,6 +19,7 @@ from paddlerec.core.reader import Reader
class EvaluateReader(Reader): class EvaluateReader(Reader):
def init(self): def init(self):
pass pass
def generate_sample(self, line): def generate_sample(self, line):
......
...@@ -24,6 +24,7 @@ class TrainReader(Reader): ...@@ -24,6 +24,7 @@ class TrainReader(Reader):
def generate_sample(self, line): def generate_sample(self, line):
""" """
Read the data line by line and process it as a dictionary Read the data line by line and process it as a dictionary
""" """
def reader(): def reader():
......
...@@ -23,44 +23,58 @@ class Model(ModelBase): ...@@ -23,44 +23,58 @@ class Model(ModelBase):
ModelBase.__init__(self, config) ModelBase.__init__(self, config)
def MMOE(self, is_infer=False): def MMOE(self, is_infer=False):
feature_size = envs.get_global_env("hyper_parameters.feature_size",
feature_size = envs.get_global_env("hyper_parameters.feature_size", None, self._namespace) None, self._namespace)
expert_num = envs.get_global_env("hyper_parameters.expert_num", None, self._namespace) expert_num = envs.get_global_env("hyper_parameters.expert_num", None,
gate_num = envs.get_global_env("hyper_parameters.gate_num", None, self._namespace) self._namespace)
expert_size = envs.get_global_env("hyper_parameters.expert_size", None, self._namespace) gate_num = envs.get_global_env("hyper_parameters.gate_num", None,
tower_size = envs.get_global_env("hyper_parameters.tower_size", None, self._namespace) self._namespace)
expert_size = envs.get_global_env("hyper_parameters.expert_size", None,
input_data = fluid.data(name="input", shape=[-1, feature_size], dtype="float32") self._namespace)
label_income = fluid.data(name="label_income", shape=[-1, 2], dtype="float32", lod_level=0) tower_size = envs.get_global_env("hyper_parameters.tower_size", None,
label_marital = fluid.data(name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0) self._namespace)
input_data = fluid.data(
name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(
name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(
name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
if is_infer: if is_infer:
self._infer_data_var = [input_data, label_income, label_marital] self._infer_data_var = [input_data, label_income, label_marital]
self._infer_data_loader = fluid.io.DataLoader.from_generator( self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self._data_var.extend([input_data, label_income, label_marital]) self._data_var.extend([input_data, label_income, label_marital])
# f_{i}(x) = activation(W_{i} * x + b), where activation is ReLU according to the paper # f_{i}(x) = activation(W_{i} * x + b), where activation is ReLU according to the paper
expert_outputs = [] expert_outputs = []
for i in range(0, expert_num): for i in range(0, expert_num):
expert_output = fluid.layers.fc(input=input_data, expert_output = fluid.layers.fc(
size=expert_size, input=input_data,
act='relu', size=expert_size,
bias_attr=fluid.ParamAttr(learning_rate=1.0), act='relu',
name='expert_' + str(i)) bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='expert_' + str(i))
expert_outputs.append(expert_output) expert_outputs.append(expert_output)
expert_concat = fluid.layers.concat(expert_outputs, axis=1) expert_concat = fluid.layers.concat(expert_outputs, axis=1)
expert_concat = fluid.layers.reshape(expert_concat, [-1, expert_num, expert_size]) expert_concat = fluid.layers.reshape(expert_concat,
[-1, expert_num, expert_size])
# g^{k}(x) = activation(W_{gk} * x + b), where activation is softmax according to the paper # g^{k}(x) = activation(W_{gk} * x + b), where activation is softmax according to the paper
output_layers = [] output_layers = []
for i in range(0, gate_num): for i in range(0, gate_num):
cur_gate = fluid.layers.fc(input=input_data, cur_gate = fluid.layers.fc(
size=expert_num, input=input_data,
act='softmax', size=expert_num,
bias_attr=fluid.ParamAttr(learning_rate=1.0), act='softmax',
name='gate_' + str(i)) bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='gate_' + str(i))
# f^{k}(x) = sum_{i=1}^{n}(g^{k}(x)_{i} * f_{i}(x)) # f^{k}(x) = sum_{i=1}^{n}(g^{k}(x)_{i} * f_{i}(x))
cur_gate_expert = fluid.layers.elementwise_mul(expert_concat, cur_gate, axis=0) cur_gate_expert = fluid.layers.elementwise_mul(
expert_concat, cur_gate, axis=0)
cur_gate_expert = fluid.layers.reduce_sum(cur_gate_expert, dim=1) cur_gate_expert = fluid.layers.reduce_sum(cur_gate_expert, dim=1)
# Build tower layer # Build tower layer
cur_tower = fluid.layers.fc(input=cur_gate_expert, cur_tower = fluid.layers.fc(input=cur_gate_expert,
...@@ -74,25 +88,33 @@ class Model(ModelBase): ...@@ -74,25 +88,33 @@ class Model(ModelBase):
output_layers.append(out) output_layers.append(out)
pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15) pred_income = fluid.layers.clip(
pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15) output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(
label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2]) output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
label_income_1 = fluid.layers.slice(
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, label_income, axes=[1], starts=[1], ends=[2])
label=fluid.layers.cast(x=label_income_1, label_marital_1 = fluid.layers.slice(
dtype='int64')) label_marital, axes=[1], starts=[1], ends=[2])
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
label=fluid.layers.cast(x=label_marital_1, auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
dtype='int64')) input=pred_income,
label=fluid.layers.cast(
x=label_income_1, dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
input=pred_marital,
label=fluid.layers.cast(
x=label_marital_1, dtype='int64'))
if is_infer: if is_infer:
self._infer_results["AUC_income"] = auc_income self._infer_results["AUC_income"] = auc_income
self._infer_results["AUC_marital"] = auc_marital self._infer_results["AUC_marital"] = auc_marital
return return
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True) cost_income = fluid.layers.cross_entropy(
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True) input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(
input=pred_marital, label=label_marital, soft_label=True)
avg_cost_income = fluid.layers.mean(x=cost_income) avg_cost_income = fluid.layers.mean(x=cost_income)
avg_cost_marital = fluid.layers.mean(x=cost_marital) avg_cost_marital = fluid.layers.mean(x=cost_marital)
......
...@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm ...@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
| Census-income Data | Share-Bottom | -- | 0.93120/0.99256 | | Census-income Data | Share-Bottom | -- | 0.93120/0.99256 |
| Census-income Data | MMoE | -- | 0.94465/0.99324 | | Census-income Data | MMoE | -- | 0.94465/0.99324 |
| Ali-CCP | ESMM | -- | 0.97181/0.49967 | | Ali-CCP | ESMM | -- | 0.97181/0.49967 |
...@@ -24,27 +24,38 @@ class Model(ModelBase): ...@@ -24,27 +24,38 @@ class Model(ModelBase):
def model(self, is_infer=False): def model(self, is_infer=False):
feature_size = envs.get_global_env("hyper_parameters.feature_size", None, self._namespace) feature_size = envs.get_global_env("hyper_parameters.feature_size",
bottom_size = envs.get_global_env("hyper_parameters.bottom_size", None, self._namespace) None, self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None, self._namespace) bottom_size = envs.get_global_env("hyper_parameters.bottom_size", None,
tower_nums = envs.get_global_env("hyper_parameters.tower_nums", None, self._namespace) self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None,
input_data = fluid.data(name="input", shape=[-1, feature_size], dtype="float32") self._namespace)
label_income = fluid.data(name="label_income", shape=[-1, 2], dtype="float32", lod_level=0) tower_nums = envs.get_global_env("hyper_parameters.tower_nums", None,
label_marital = fluid.data(name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0) self._namespace)
input_data = fluid.data(
name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(
name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(
name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
if is_infer: if is_infer:
self._infer_data_var = [input_data, label_income, label_marital] self._infer_data_var = [input_data, label_income, label_marital]
self._infer_data_loader = fluid.io.DataLoader.from_generator( self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self._data_var.extend([input_data, label_income, label_marital]) self._data_var.extend([input_data, label_income, label_marital])
bottom_output = fluid.layers.fc(input=input_data, bottom_output = fluid.layers.fc(
size=bottom_size, input=input_data,
act='relu', size=bottom_size,
bias_attr=fluid.ParamAttr(learning_rate=1.0), act='relu',
name='bottom_output') bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='bottom_output')
# Build tower layer from bottom layer # Build tower layer from bottom layer
output_layers = [] output_layers = []
...@@ -59,26 +70,34 @@ class Model(ModelBase): ...@@ -59,26 +70,34 @@ class Model(ModelBase):
name='output_layer_' + str(index)) name='output_layer_' + str(index))
output_layers.append(output_layer) output_layers.append(output_layer)
pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15) pred_income = fluid.layers.clip(
pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15) output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(
label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2]) output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
label_income_1 = fluid.layers.slice(
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, label_income, axes=[1], starts=[1], ends=[2])
label=fluid.layers.cast(x=label_income_1, label_marital_1 = fluid.layers.slice(
dtype='int64')) label_marital, axes=[1], starts=[1], ends=[2])
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
label=fluid.layers.cast(x=label_marital_1, auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
dtype='int64')) input=pred_income,
label=fluid.layers.cast(
x=label_income_1, dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
input=pred_marital,
label=fluid.layers.cast(
x=label_marital_1, dtype='int64'))
if is_infer: if is_infer:
self._infer_results["AUC_income"] = auc_income self._infer_results["AUC_income"] = auc_income
self._infer_results["AUC_marital"] = auc_marital self._infer_results["AUC_marital"] = auc_marital
return return
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True) cost_income = fluid.layers.cross_entropy(
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True) input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(
input=pred_marital, label=label_marital, soft_label=True)
cost = fluid.layers.elementwise_add(cost_income, cost_marital, axis=1) cost = fluid.layers.elementwise_add(cost_income, cost_marital, axis=1)
avg_cost = fluid.layers.mean(x=cost) avg_cost = fluid.layers.mean(x=cost)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
import sys import sys
import io import io
......
...@@ -26,8 +26,8 @@ from collections import Counter ...@@ -26,8 +26,8 @@ from collections import Counter
import os import os
import paddle.fluid.incubate.data_generator as dg import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator):
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config): def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self) dg.MultiSlotDataGenerator.__init__(self)
...@@ -83,11 +83,11 @@ class TrainReader(dg.MultiSlotDataGenerator): ...@@ -83,11 +83,11 @@ class TrainReader(dg.MultiSlotDataGenerator):
if idx == 2 else math.log(1 + float(features[idx]))) if idx == 2 else math.log(1 + float(features[idx])))
for idx in self.cat_idx_: for idx in self.cat_idx_:
if features[idx] == '' or features[ if features[idx] == '' or features[
idx] not in self.cat_feat_idx_dict_list[idx - 14]: idx] not in self.cat_feat_idx_dict_list[idx - 14]:
label_feat_list[idx].append(0) label_feat_list[idx].append(0)
else: else:
label_feat_list[idx].append(self.cat_feat_idx_dict_list[ label_feat_list[idx].append(self.cat_feat_idx_dict_list[
idx - 14][features[idx]]) idx - 14][features[idx]])
label_feat_list[0].append(int(features[0])) label_feat_list[0].append(int(features[0]))
return label_feat_list return label_feat_list
...@@ -109,6 +109,7 @@ class TrainReader(dg.MultiSlotDataGenerator): ...@@ -109,6 +109,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return data_iter return data_iter
reader = TrainReader("../config.yaml") reader = TrainReader("../config.yaml")
reader.init() reader.init()
reader.run_from_stdin() reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, absolute_import, division from __future__ import print_function, absolute_import, division
import os import os
......
...@@ -25,12 +25,18 @@ class Model(ModelBase): ...@@ -25,12 +25,18 @@ class Model(ModelBase):
ModelBase.__init__(self, config) ModelBase.__init__(self, config)
def init_network(self): def init_network(self):
self.cross_num = envs.get_global_env("hyper_parameters.cross_num", None, self._namespace) self.cross_num = envs.get_global_env("hyper_parameters.cross_num",
self.dnn_hidden_units = envs.get_global_env("hyper_parameters.dnn_hidden_units", None, self._namespace) None, self._namespace)
self.l2_reg_cross = envs.get_global_env("hyper_parameters.l2_reg_cross", None, self._namespace) self.dnn_hidden_units = envs.get_global_env(
self.dnn_use_bn = envs.get_global_env("hyper_parameters.dnn_use_bn", None, self._namespace) "hyper_parameters.dnn_hidden_units", None, self._namespace)
self.clip_by_norm = envs.get_global_env("hyper_parameters.clip_by_norm", None, self._namespace) self.l2_reg_cross = envs.get_global_env(
cat_feat_num = envs.get_global_env("hyper_parameters.cat_feat_num", None, self._namespace) "hyper_parameters.l2_reg_cross", None, self._namespace)
self.dnn_use_bn = envs.get_global_env("hyper_parameters.dnn_use_bn",
None, self._namespace)
self.clip_by_norm = envs.get_global_env(
"hyper_parameters.clip_by_norm", None, self._namespace)
cat_feat_num = envs.get_global_env("hyper_parameters.cat_feat_num",
None, self._namespace)
self.sparse_inputs = self._sparse_data_var[1:] self.sparse_inputs = self._sparse_data_var[1:]
self.dense_inputs = self._dense_data_var self.dense_inputs = self._dense_data_var
...@@ -43,7 +49,8 @@ class Model(ModelBase): ...@@ -43,7 +49,8 @@ class Model(ModelBase):
cat_feat_dims_dict[spls[0]] = int(spls[1]) cat_feat_dims_dict[spls[0]] = int(spls[1])
self.cat_feat_dims_dict = cat_feat_dims_dict if cat_feat_dims_dict else OrderedDict( self.cat_feat_dims_dict = cat_feat_dims_dict if cat_feat_dims_dict else OrderedDict(
) )
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", None, self._namespace) self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
None, self._namespace)
self.dense_feat_names = [i.name for i in self.dense_inputs] self.dense_feat_names = [i.name for i in self.dense_inputs]
self.sparse_feat_names = [i.name for i in self.sparse_inputs] self.sparse_feat_names = [i.name for i in self.sparse_inputs]
...@@ -55,16 +62,19 @@ class Model(ModelBase): ...@@ -55,16 +62,19 @@ class Model(ModelBase):
self.net_input = None self.net_input = None
self.loss = None self.loss = None
def _create_embedding_input(self): def _create_embedding_input(self):
# sparse embedding # sparse embedding
sparse_emb_dict = OrderedDict() sparse_emb_dict = OrderedDict()
for var in self.sparse_inputs: for var in self.sparse_inputs:
sparse_emb_dict[var.name] = fluid.embedding(input=var, sparse_emb_dict[var.name] = fluid.embedding(
size=[self.feat_dims_dict[var.name] + 1, input=var,
6 * int(pow(self.feat_dims_dict[var.name], 0.25)) size=[
],is_sparse=self.is_sparse) self.feat_dims_dict[var.name] + 1,
6 * int(pow(self.feat_dims_dict[var.name], 0.25))
],
is_sparse=self.is_sparse)
# combine dense and sparse_emb # combine dense and sparse_emb
dense_input_list = self.dense_inputs dense_input_list = self.dense_inputs
sparse_emb_list = list(sparse_emb_dict.values()) sparse_emb_list = list(sparse_emb_dict.values())
...@@ -114,10 +124,11 @@ class Model(ModelBase): ...@@ -114,10 +124,11 @@ class Model(ModelBase):
def train_net(self): def train_net(self):
self.model._init_slots() self.model._init_slots()
self.init_network() self.init_network()
self.net_input = self._create_embedding_input() self.net_input = self._create_embedding_input()
deep_out = self._deep_net(self.net_input, self.dnn_hidden_units, self.dnn_use_bn, False) deep_out = self._deep_net(self.net_input, self.dnn_hidden_units,
self.dnn_use_bn, False)
cross_out, l2_reg_cross_loss = self._cross_net(self.net_input, cross_out, l2_reg_cross_loss = self._cross_net(self.net_input,
self.cross_num) self.cross_num)
...@@ -134,9 +145,11 @@ class Model(ModelBase): ...@@ -134,9 +145,11 @@ class Model(ModelBase):
input=prob_2d, label=label_int, slide_steps=0) input=prob_2d, label=label_int, slide_steps=0)
self._metrics["AUC"] = auc_var self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc_var self._metrics["BATCH_AUC"] = batch_auc_var
# logloss # logloss
logloss = fluid.layers.log_loss(self.prob, fluid.layers.cast(self.target_input, dtype='float32')) logloss = fluid.layers.log_loss(
self.prob, fluid.layers.cast(
self.target_input, dtype='float32'))
self.avg_logloss = fluid.layers.reduce_mean(logloss) self.avg_logloss = fluid.layers.reduce_mean(logloss)
# reg_coeff * l2_reg_cross # reg_coeff * l2_reg_cross
...@@ -145,7 +158,8 @@ class Model(ModelBase): ...@@ -145,7 +158,8 @@ class Model(ModelBase):
self._cost = self.loss self._cost = self.loss
def optimizer(self): def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
import shutil import shutil
import sys import sys
......
...@@ -19,8 +19,9 @@ try: ...@@ -19,8 +19,9 @@ try:
import cPickle as pickle import cPickle as pickle
except ImportError: except ImportError:
import pickle import pickle
class TrainReader(dg.MultiSlotDataGenerator):
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config): def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self) dg.MultiSlotDataGenerator.__init__(self)
...@@ -44,7 +45,7 @@ class TrainReader(dg.MultiSlotDataGenerator): ...@@ -44,7 +45,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
self.categorical_range_ = range(14, 40) self.categorical_range_ = range(14, 40)
# load preprocessed feature dict # load preprocessed feature dict
self.feat_dict_name = "aid_data/feat_dict_10.pkl2" self.feat_dict_name = "aid_data/feat_dict_10.pkl2"
self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb')) self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
def _process_line(self, line): def _process_line(self, line):
features = line.rstrip('\n').split('\t') features = line.rstrip('\n').split('\t')
...@@ -77,15 +78,18 @@ class TrainReader(dg.MultiSlotDataGenerator): ...@@ -77,15 +78,18 @@ class TrainReader(dg.MultiSlotDataGenerator):
def data_iter(): def data_iter():
feat_idx, feat_value, label = self._process_line(line) feat_idx, feat_value, label = self._process_line(line)
s = "" s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]: for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
('label', label)]:
k = i[0] k = i[0]
v = i[1] v = i[1]
for j in v: for j in v:
s += " " + k + ":" + str(j) s += " " + k + ":" + str(j)
print s.strip() print s.strip()
yield None yield None
return data_iter return data_iter
reader = TrainReader("../config.yaml") reader = TrainReader("../config.yaml")
reader.init() reader.init()
reader.run_from_stdin() reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
import numpy import numpy
from collections import Counter from collections import Counter
......
...@@ -27,21 +27,26 @@ class Model(ModelBase): ...@@ -27,21 +27,26 @@ class Model(ModelBase):
def deepfm_net(self): def deepfm_net(self):
init_value_ = 0.1 init_value_ = 0.1
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace) sparse_feature_number = envs.get_global_env(
sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) "hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
# ------------------------- network input -------------------------- # ------------------------- network input --------------------------
num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace) num_field = envs.get_global_env("hyper_parameters.num_field", None,
self._namespace)
raw_feat_idx = self._sparse_data_var[1] raw_feat_idx = self._sparse_data_var[1]
raw_feat_value = self._dense_data_var[0] raw_feat_value = self._dense_data_var[0]
self.label = self._sparse_data_var[0] self.label = self._sparse_data_var[0]
feat_idx = raw_feat_idx feat_idx = raw_feat_idx
feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1]) # None * num_field * 1 feat_value = fluid.layers.reshape(
raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
reg = envs.get_global_env("hyper_parameters.reg", 1e-4, self._namespace)
reg = envs.get_global_env("hyper_parameters.reg", 1e-4,
self._namespace)
first_weights_re = fluid.embedding( first_weights_re = fluid.embedding(
input=feat_idx, input=feat_idx,
is_sparse=True, is_sparse=True,
...@@ -55,7 +60,8 @@ class Model(ModelBase): ...@@ -55,7 +60,8 @@ class Model(ModelBase):
regularizer=fluid.regularizer.L1DecayRegularizer(reg))) regularizer=fluid.regularizer.L1DecayRegularizer(reg)))
first_weights = fluid.layers.reshape( first_weights = fluid.layers.reshape(
first_weights_re, shape=[-1, num_field, 1]) # None * num_field * 1 first_weights_re, shape=[-1, num_field, 1]) # None * num_field * 1
y_first_order = fluid.layers.reduce_sum((first_weights * feat_value), 1) y_first_order = fluid.layers.reduce_sum((first_weights * feat_value),
1)
# ------------------------- second order term -------------------------- # ------------------------- second order term --------------------------
...@@ -68,7 +74,8 @@ class Model(ModelBase): ...@@ -68,7 +74,8 @@ class Model(ModelBase):
padding_idx=0, padding_idx=0,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormalInitializer( initializer=fluid.initializer.TruncatedNormalInitializer(
loc=0.0, scale=init_value_ / math.sqrt(float(sparse_feature_dim))))) loc=0.0,
scale=init_value_ / math.sqrt(float(sparse_feature_dim)))))
feat_embeddings = fluid.layers.reshape( feat_embeddings = fluid.layers.reshape(
feat_embeddings_re, feat_embeddings_re,
shape=[-1, num_field, shape=[-1, num_field,
...@@ -76,8 +83,8 @@ class Model(ModelBase): ...@@ -76,8 +83,8 @@ class Model(ModelBase):
feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size
# sum_square part # sum_square part
summed_features_emb = fluid.layers.reduce_sum(feat_embeddings, summed_features_emb = fluid.layers.reduce_sum(
1) # None * embedding_size feat_embeddings, 1) # None * embedding_size
summed_features_emb_square = fluid.layers.square( summed_features_emb_square = fluid.layers.square(
summed_features_emb) # None * embedding_size summed_features_emb) # None * embedding_size
...@@ -88,13 +95,16 @@ class Model(ModelBase): ...@@ -88,13 +95,16 @@ class Model(ModelBase):
squared_features_emb, 1) # None * embedding_size squared_features_emb, 1) # None * embedding_size
y_second_order = 0.5 * fluid.layers.reduce_sum( y_second_order = 0.5 * fluid.layers.reduce_sum(
summed_features_emb_square - squared_sum_features_emb, 1, summed_features_emb_square - squared_sum_features_emb,
1,
keep_dim=True) # None * 1 keep_dim=True) # None * 1
# ------------------------- DNN -------------------------- # ------------------------- DNN --------------------------
layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace) layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None,
act = envs.get_global_env("hyper_parameters.act", None, self._namespace) self._namespace)
act = envs.get_global_env("hyper_parameters.act", None,
self._namespace)
y_dnn = fluid.layers.reshape(feat_embeddings, y_dnn = fluid.layers.reshape(feat_embeddings,
[-1, num_field * sparse_feature_dim]) [-1, num_field * sparse_feature_dim])
for s in layer_sizes: for s in layer_sizes:
...@@ -121,7 +131,8 @@ class Model(ModelBase): ...@@ -121,7 +131,8 @@ class Model(ModelBase):
# ------------------------- DeepFM -------------------------- # ------------------------- DeepFM --------------------------
self.predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn) self.predict = fluid.layers.sigmoid(y_first_order + y_second_order +
y_dnn)
def train_net(self): def train_net(self):
self.model._init_slots() self.model._init_slots()
...@@ -129,7 +140,8 @@ class Model(ModelBase): ...@@ -129,7 +140,8 @@ class Model(ModelBase):
# ------------------------- Cost(logloss) -------------------------- # ------------------------- Cost(logloss) --------------------------
cost = fluid.layers.log_loss(input=self.predict, label=fluid.layers.cast(self.label, "float32")) cost = fluid.layers.log_loss(
input=self.predict, label=fluid.layers.cast(self.label, "float32"))
avg_cost = fluid.layers.reduce_sum(cost) avg_cost = fluid.layers.reduce_sum(cost)
self._cost = avg_cost self._cost = avg_cost
...@@ -145,7 +157,8 @@ class Model(ModelBase): ...@@ -145,7 +157,8 @@ class Model(ModelBase):
self._metrics["BATCH_AUC"] = batch_auc_var self._metrics["BATCH_AUC"] = batch_auc_var
def optimizer(self): def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function from __future__ import print_function
import random import random
import pickle import pickle
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function from __future__ import print_function
import pickle import pickle
import pandas as pd import pandas as pd
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function from __future__ import print_function
import random import random
import pickle import pickle
......
...@@ -21,14 +21,14 @@ from paddlerec.core.model import Model as ModelBase ...@@ -21,14 +21,14 @@ from paddlerec.core.model import Model as ModelBase
class Model(ModelBase): class Model(ModelBase):
def __init__(self, config): def __init__(self, config):
ModelBase.__init__(self, config) ModelBase.__init__(self, config)
def config_read(self, config_path): def config_read(self, config_path):
with open(config_path, "r") as fin: with open(config_path, "r") as fin:
user_count = int(fin.readline().strip()) user_count = int(fin.readline().strip())
item_count = int(fin.readline().strip()) item_count = int(fin.readline().strip())
cat_count = int(fin.readline().strip()) cat_count = int(fin.readline().strip())
return user_count, item_count, cat_count return user_count, item_count, cat_count
def din_attention(self, hist, target_expand, mask): def din_attention(self, hist, target_expand, mask):
"""activation weight""" """activation weight"""
...@@ -58,56 +58,66 @@ class Model(ModelBase): ...@@ -58,56 +58,66 @@ class Model(ModelBase):
out = fluid.layers.matmul(weight, hist) out = fluid.layers.matmul(weight, hist)
out = fluid.layers.reshape(x=out, shape=[0, hidden_size]) out = fluid.layers.reshape(x=out, shape=[0, hidden_size])
return out return out
def train_net(self): def train_net(self):
seq_len = -1 seq_len = -1
self.item_emb_size = envs.get_global_env("hyper_parameters.item_emb_size", 64, self._namespace) self.item_emb_size = envs.get_global_env(
self.cat_emb_size = envs.get_global_env("hyper_parameters.cat_emb_size", 64, self._namespace) "hyper_parameters.item_emb_size", 64, self._namespace)
self.act = envs.get_global_env("hyper_parameters.act", "sigmoid", self._namespace) self.cat_emb_size = envs.get_global_env(
"hyper_parameters.cat_emb_size", 64, self._namespace)
self.act = envs.get_global_env("hyper_parameters.act", "sigmoid",
self._namespace)
#item_emb_size = 64 #item_emb_size = 64
#cat_emb_size = 64 #cat_emb_size = 64
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", False, self._namespace) self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
False, self._namespace)
#significant for speeding up the training process #significant for speeding up the training process
self.config_path = envs.get_global_env("hyper_parameters.config_path", "data/config.txt", self._namespace) self.config_path = envs.get_global_env(
self.use_DataLoader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace) "hyper_parameters.config_path", "data/config.txt", self._namespace)
self.use_DataLoader = envs.get_global_env(
"hyper_parameters.use_DataLoader", False, self._namespace)
user_count, item_count, cat_count = self.config_read(self.config_path) user_count, item_count, cat_count = self.config_read(self.config_path)
item_emb_attr = fluid.ParamAttr(name="item_emb") item_emb_attr = fluid.ParamAttr(name="item_emb")
cat_emb_attr = fluid.ParamAttr(name="cat_emb") cat_emb_attr = fluid.ParamAttr(name="cat_emb")
hist_item_seq = fluid.data( hist_item_seq = fluid.data(
name="hist_item_seq", shape=[None, seq_len], dtype="int64") name="hist_item_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(hist_item_seq) self._data_var.append(hist_item_seq)
hist_cat_seq = fluid.data( hist_cat_seq = fluid.data(
name="hist_cat_seq", shape=[None, seq_len], dtype="int64") name="hist_cat_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(hist_cat_seq) self._data_var.append(hist_cat_seq)
target_item = fluid.data(name="target_item", shape=[None], dtype="int64") target_item = fluid.data(
name="target_item", shape=[None], dtype="int64")
self._data_var.append(target_item) self._data_var.append(target_item)
target_cat = fluid.data(name="target_cat", shape=[None], dtype="int64") target_cat = fluid.data(name="target_cat", shape=[None], dtype="int64")
self._data_var.append(target_cat) self._data_var.append(target_cat)
label = fluid.data(name="label", shape=[None, 1], dtype="float32") label = fluid.data(name="label", shape=[None, 1], dtype="float32")
self._data_var.append(label) self._data_var.append(label)
mask = fluid.data(name="mask", shape=[None, seq_len, 1], dtype="float32") mask = fluid.data(
name="mask", shape=[None, seq_len, 1], dtype="float32")
self._data_var.append(mask) self._data_var.append(mask)
target_item_seq = fluid.data( target_item_seq = fluid.data(
name="target_item_seq", shape=[None, seq_len], dtype="int64") name="target_item_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(target_item_seq) self._data_var.append(target_item_seq)
target_cat_seq = fluid.data( target_cat_seq = fluid.data(
name="target_cat_seq", shape=[None, seq_len], dtype="int64") name="target_cat_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(target_cat_seq) self._data_var.append(target_cat_seq)
if self.use_DataLoader: if self.use_DataLoader:
self._data_loader = fluid.io.DataLoader.from_generator( self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=10000, use_double_buffer=False, iterable=False) feed_list=self._data_var,
capacity=10000,
use_double_buffer=False,
iterable=False)
hist_item_emb = fluid.embedding( hist_item_emb = fluid.embedding(
input=hist_item_seq, input=hist_item_seq,
size=[item_count, self.item_emb_size], size=[item_count, self.item_emb_size],
...@@ -149,7 +159,8 @@ class Model(ModelBase): ...@@ -149,7 +159,8 @@ class Model(ModelBase):
size=[item_count, 1], size=[item_count, 1],
param_attr=fluid.initializer.Constant(value=0.0)) param_attr=fluid.initializer.Constant(value=0.0))
hist_seq_concat = fluid.layers.concat([hist_item_emb, hist_cat_emb], axis=2) hist_seq_concat = fluid.layers.concat(
[hist_item_emb, hist_cat_emb], axis=2)
target_seq_concat = fluid.layers.concat( target_seq_concat = fluid.layers.concat(
[target_item_seq_emb, target_cat_seq_emb], axis=2) [target_item_seq_emb, target_cat_seq_emb], axis=2)
target_concat = fluid.layers.concat( target_concat = fluid.layers.concat(
...@@ -157,21 +168,22 @@ class Model(ModelBase): ...@@ -157,21 +168,22 @@ class Model(ModelBase):
out = self.din_attention(hist_seq_concat, target_seq_concat, mask) out = self.din_attention(hist_seq_concat, target_seq_concat, mask)
out_fc = fluid.layers.fc(name="out_fc", out_fc = fluid.layers.fc(name="out_fc",
input=out, input=out,
size=self.item_emb_size + self.cat_emb_size, size=self.item_emb_size + self.cat_emb_size,
num_flatten_dims=1) num_flatten_dims=1)
embedding_concat = fluid.layers.concat([out_fc, target_concat], axis=1) embedding_concat = fluid.layers.concat([out_fc, target_concat], axis=1)
fc1 = fluid.layers.fc(name="fc1", fc1 = fluid.layers.fc(name="fc1",
input=embedding_concat, input=embedding_concat,
size=80, size=80,
act=self.act) act=self.act)
fc2 = fluid.layers.fc(name="fc2", input=fc1, size=40, act=self.act) fc2 = fluid.layers.fc(name="fc2", input=fc1, size=40, act=self.act)
fc3 = fluid.layers.fc(name="fc3", input=fc2, size=1) fc3 = fluid.layers.fc(name="fc3", input=fc2, size=1)
logit = fc3 + item_b logit = fc3 + item_b
loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logit, label=label) loss = fluid.layers.sigmoid_cross_entropy_with_logits(
x=logit, label=label)
avg_loss = fluid.layers.mean(loss) avg_loss = fluid.layers.mean(loss)
self._cost = avg_loss self._cost = avg_loss
...@@ -179,14 +191,14 @@ class Model(ModelBase): ...@@ -179,14 +191,14 @@ class Model(ModelBase):
predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1) predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
label_int = fluid.layers.cast(label, 'int64') label_int = fluid.layers.cast(label, 'int64')
auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d, auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d,
label=label_int, label=label_int,
slide_steps=0) slide_steps=0)
self._metrics["AUC"] = auc_var self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc_var self._metrics["BATCH_AUC"] = batch_auc_var
def optimizer(self): def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer return optimizer
......
...@@ -29,13 +29,15 @@ from paddlerec.core.utils import envs ...@@ -29,13 +29,15 @@ from paddlerec.core.utils import envs
class TrainReader(Reader): class TrainReader(Reader):
def init(self): def init(self):
self.train_data_path = envs.get_global_env("train_data_path", None, "train.reader") self.train_data_path = envs.get_global_env("train_data_path", None,
"train.reader")
self.res = [] self.res = []
self.max_len = 0 self.max_len = 0
data_file_list = os.listdir(self.train_data_path) data_file_list = os.listdir(self.train_data_path)
for i in range(0, len(data_file_list)): for i in range(0, len(data_file_list)):
train_data_file = os.path.join(self.train_data_path, data_file_list[i]) train_data_file = os.path.join(self.train_data_path,
data_file_list[i])
with open(train_data_file, "r") as fin: with open(train_data_file, "r") as fin:
for line in fin: for line in fin:
line = line.strip().split(';') line = line.strip().split(';')
...@@ -78,11 +80,13 @@ class TrainReader(Reader): ...@@ -78,11 +80,13 @@ class TrainReader(Reader):
len_array = [len(x[0]) for x in b] len_array = [len(x[0]) for x in b]
mask = np.array( mask = np.array(
[[0] * x + [-1e9] * (max_len - x) for x in len_array]).reshape( [[0] * x + [-1e9] * (max_len - x) for x in len_array]).reshape(
[-1, max_len, 1]) [-1, max_len, 1])
target_item_seq = np.array( target_item_seq = np.array(
[[x[2]] * max_len for x in b]).astype("int64").reshape([-1, max_len]) [[x[2]] * max_len for x in b]).astype("int64").reshape(
[-1, max_len])
target_cat_seq = np.array( target_cat_seq = np.array(
[[x[3]] * max_len for x in b]).astype("int64").reshape([-1, max_len]) [[x[3]] * max_len for x in b]).astype("int64").reshape(
[-1, max_len])
res = [] res = []
for i in range(len(b)): for i in range(len(b)):
res.append([ res.append([
...@@ -127,4 +131,5 @@ class TrainReader(Reader): ...@@ -127,4 +131,5 @@ class TrainReader(Reader):
def generate_batch_from_trainfiles(self, files): def generate_batch_from_trainfiles(self, files):
data_set = self.base_read(files) data_set = self.base_read(files)
random.shuffle(data_set) random.shuffle(data_set)
return self.batch_reader(data_set, self.batch_size, self.batch_size * 20) return self.batch_reader(data_set, self.batch_size,
self.batch_size * 20)
...@@ -32,6 +32,7 @@ class CriteoDataset(dg.MultiSlotDataGenerator): ...@@ -32,6 +32,7 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
""" """
Read the data line by line and process it as a dictionary Read the data line by line and process it as a dictionary
""" """
def reader(): def reader():
""" """
This function needs to be implemented by the user, based on data format This function needs to be implemented by the user, based on data format
...@@ -57,11 +58,12 @@ class CriteoDataset(dg.MultiSlotDataGenerator): ...@@ -57,11 +58,12 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
feature_name.append("label") feature_name.append("label")
s = "click:" + str(label[0]) s = "click:" + str(label[0])
for i in dense_feature: for i in dense_feature:
s += " dense_feature:" + str(i) s += " dense_feature:" + str(i)
for i in range(1, 1 + len(categorical_range_)): for i in range(1, 1 + len(categorical_range_)):
s += " " + str(i) + ":" + str(sparse_feature[i-1][0]) s += " " + str(i) + ":" + str(sparse_feature[i - 1][0])
print s.strip() print s.strip()
yield None yield None
return reader return reader
......
...@@ -31,8 +31,10 @@ class Model(ModelBase): ...@@ -31,8 +31,10 @@ class Model(ModelBase):
def net(self): def net(self):
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace) sparse_feature_number = envs.get_global_env(
sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) "hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
def embedding_layer(input): def embedding_layer(input):
emb = fluid.layers.embedding( emb = fluid.layers.embedding(
...@@ -42,25 +44,27 @@ class Model(ModelBase): ...@@ -42,25 +44,27 @@ class Model(ModelBase):
size=[sparse_feature_number, sparse_feature_dim], size=[sparse_feature_number, sparse_feature_dim],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name="SparseFeatFactors", name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()), initializer=fluid.initializer.Uniform()), )
) emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
emb_sum = fluid.layers.sequence_pool(
input=emb, pool_type='sum')
return emb_sum return emb_sum
def fc(input, output_size): def fc(input, output_size):
output = fluid.layers.fc( output = fluid.layers.fc(
input=input, size=output_size, input=input,
act='relu', param_attr=fluid.ParamAttr( size=output_size,
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal( initializer=fluid.initializer.Normal(
scale=1.0 / math.sqrt(input.shape[1])))) scale=1.0 / math.sqrt(input.shape[1]))))
return output return output
sparse_embed_seq = list(map(embedding_layer, self.sparse_inputs)) sparse_embed_seq = list(map(embedding_layer, self.sparse_inputs))
concated = fluid.layers.concat(sparse_embed_seq + [self.dense_input], axis=1) concated = fluid.layers.concat(
sparse_embed_seq + [self.dense_input], axis=1)
fcs = [concated] fcs = [concated]
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace) hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
self._namespace)
for size in hidden_layers: for size in hidden_layers:
fcs.append(fc(fcs[-1], size)) fcs.append(fc(fcs[-1], size))
...@@ -75,14 +79,15 @@ class Model(ModelBase): ...@@ -75,14 +79,15 @@ class Model(ModelBase):
self.predict = predict self.predict = predict
def avg_loss(self): def avg_loss(self):
cost = fluid.layers.cross_entropy(input=self.predict, label=self.label_input) cost = fluid.layers.cross_entropy(
input=self.predict, label=self.label_input)
avg_cost = fluid.layers.reduce_mean(cost) avg_cost = fluid.layers.reduce_mean(cost)
self._cost = avg_cost self._cost = avg_cost
def metrics(self): def metrics(self):
auc, batch_auc, _ = fluid.layers.auc(input=self.predict, auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
label=self.label_input, label=self.label_input,
num_thresholds=2 ** 12, num_thresholds=2**12,
slide_steps=20) slide_steps=20)
self._metrics["AUC"] = auc self._metrics["AUC"] = auc
self._metrics["BATCH_AUC"] = batch_auc self._metrics["BATCH_AUC"] = batch_auc
...@@ -95,7 +100,8 @@ class Model(ModelBase): ...@@ -95,7 +100,8 @@ class Model(ModelBase):
self.metrics() self.metrics()
def optimizer(self): def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
import io import io
import args import args
import pandas as pd import pandas as pd
from sklearn import preprocessing from sklearn import preprocessing
def _clean_file(source_path,target_path): def _clean_file(source_path, target_path):
"""makes changes to match the CSV format.""" """makes changes to match the CSV format."""
with io.open(source_path, 'r') as temp_eval_file: with io.open(source_path, 'r') as temp_eval_file:
with io.open(target_path, 'w') as eval_file: with io.open(target_path, 'w') as eval_file:
...@@ -17,15 +32,16 @@ def _clean_file(source_path,target_path): ...@@ -17,15 +32,16 @@ def _clean_file(source_path,target_path):
line = line[:-1] line = line[:-1]
line += '\n' line += '\n'
eval_file.write(line) eval_file.write(line)
def build_model_columns(train_data_path, test_data_path): def build_model_columns(train_data_path, test_data_path):
# The column names are from # The column names are from
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html # https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
column_names = [ column_names = [
'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender', 'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_bracket' 'income_bracket'
] ]
# Load the dataset in Pandas # Load the dataset in Pandas
...@@ -44,61 +60,92 @@ def build_model_columns(train_data_path, test_data_path): ...@@ -44,61 +60,92 @@ def build_model_columns(train_data_path, test_data_path):
# First group of tasks according to the paper # First group of tasks according to the paper
#label_columns = ['income_50k', 'marital_stat'] #label_columns = ['income_50k', 'marital_stat']
categorical_columns = ['education','marital_status','relationship','workclass','occupation'] categorical_columns = [
'education', 'marital_status', 'relationship', 'workclass',
'occupation'
]
for col in categorical_columns: for col in categorical_columns:
label_train = preprocessing.LabelEncoder() label_train = preprocessing.LabelEncoder()
train_df[col]= label_train.fit_transform(train_df[col]) train_df[col] = label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder() label_test = preprocessing.LabelEncoder()
test_df[col]= label_test.fit_transform(test_df[col]) test_df[col] = label_test.fit_transform(test_df[col])
bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65] bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(), bins,labels=False) train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(),
test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(), bins,labels=False) bins,
labels=False)
base_columns = ['education', 'marital_status', 'relationship', 'workclass', 'occupation', 'age_buckets'] test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(),
bins,
train_df['education_occupation'] = train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str) labels=False)
test_df['education_occupation'] = test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
train_df['age_buckets_education_occupation'] = train_df['age_buckets'].astype(str) + '_' + train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str) base_columns = [
test_df['age_buckets_education_occupation'] = test_df['age_buckets'].astype(str) + '_' + test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str) 'education', 'marital_status', 'relationship', 'workclass',
crossed_columns = ['education_occupation','age_buckets_education_occupation'] 'occupation', 'age_buckets'
]
train_df['education_occupation'] = train_df['education'].astype(
str) + '_' + train_df['occupation'].astype(str)
test_df['education_occupation'] = test_df['education'].astype(
str) + '_' + test_df['occupation'].astype(str)
train_df['age_buckets_education_occupation'] = train_df[
'age_buckets'].astype(str) + '_' + train_df['education'].astype(
str) + '_' + train_df['occupation'].astype(str)
test_df['age_buckets_education_occupation'] = test_df[
'age_buckets'].astype(str) + '_' + test_df['education'].astype(
str) + '_' + test_df['occupation'].astype(str)
crossed_columns = [
'education_occupation', 'age_buckets_education_occupation'
]
for col in crossed_columns: for col in crossed_columns:
label_train = preprocessing.LabelEncoder() label_train = preprocessing.LabelEncoder()
train_df[col]= label_train.fit_transform(train_df[col]) train_df[col] = label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder() label_test = preprocessing.LabelEncoder()
test_df[col]= label_test.fit_transform(test_df[col]) test_df[col] = label_test.fit_transform(test_df[col])
wide_columns = base_columns + crossed_columns wide_columns = base_columns + crossed_columns
train_df_temp = pd.get_dummies(train_df[categorical_columns],columns=categorical_columns) train_df_temp = pd.get_dummies(
test_df_temp = pd.get_dummies(test_df[categorical_columns], columns=categorical_columns) train_df[categorical_columns], columns=categorical_columns)
test_df_temp = pd.get_dummies(
test_df[categorical_columns], columns=categorical_columns)
train_df = train_df.join(train_df_temp) train_df = train_df.join(train_df_temp)
test_df = test_df.join(test_df_temp) test_df = test_df.join(test_df_temp)
deep_columns = list(train_df_temp.columns)+ ['age','education_num','capital_gain','capital_loss','hours_per_week'] deep_columns = list(train_df_temp.columns) + [
'age', 'education_num', 'capital_gain', 'capital_loss',
train_df['label'] = train_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0) 'hours_per_week'
test_df['label'] = test_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0) ]
with io.open('train_data/columns.txt','w') as f: train_df['label'] = train_df['income_bracket'].apply(
write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n' lambda x: 1 if x == '>50K' else 0)
test_df['label'] = test_df['income_bracket'].apply(
lambda x: 1 if x == '>50K' else 0)
with io.open('train_data/columns.txt', 'w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(
deep_columns)) + '\n'
f.write(write_str) f.write(write_str)
f.close() f.close()
with io.open('test_data/columns.txt','w') as f: with io.open('test_data/columns.txt', 'w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n' write_str = str(len(wide_columns)) + '\n' + str(len(
deep_columns)) + '\n'
f.write(write_str) f.write(write_str)
f.close() f.close()
train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(train_data_path,index=False) train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(test_data_path,index=False) train_data_path, index=False)
test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
test_data_path, index=False)
def clean_file(train_path, test_path, train_data_path, test_data_path): def clean_file(train_path, test_path, train_data_path, test_data_path):
_clean_file(train_path, train_data_path) _clean_file(train_path, train_data_path)
_clean_file(test_path, test_data_path) _clean_file(test_path, test_data_path)
if __name__ == '__main__': if __name__ == '__main__':
args = args.parse_args() args = args.parse_args()
clean_file(args.train_path, args.test_path, args.train_data_path, args.test_data_path) clean_file(args.train_path, args.test_path, args.train_data_path,
args.test_data_path)
build_model_columns(args.train_data_path, args.test_data_path) build_model_columns(args.train_data_path, args.test_data_path)
...@@ -20,6 +20,7 @@ except ImportError: ...@@ -20,6 +20,7 @@ except ImportError:
import pickle import pickle
import paddle.fluid.incubate.data_generator as dg import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator): class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config): def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self) dg.MultiSlotDataGenerator.__init__(self)
...@@ -50,7 +51,8 @@ class TrainReader(dg.MultiSlotDataGenerator): ...@@ -50,7 +51,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
wide_feat, deep_deat, label = self._process_line(line) wide_feat, deep_deat, label = self._process_line(line)
s = "" s = ""
for i in [('wide_input', wide_feat), ('deep_input', deep_deat), ('label', label)]: for i in [('wide_input', wide_feat), ('deep_input', deep_deat),
('label', label)]:
k = i[0] k = i[0]
v = i[1] v = i[1]
for j in v: for j in v:
...@@ -60,6 +62,7 @@ class TrainReader(dg.MultiSlotDataGenerator): ...@@ -60,6 +62,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return data_iter return data_iter
reader = TrainReader("../config.yaml") reader = TrainReader("../config.yaml")
reader.init() reader.init()
reader.run_from_stdin() reader.run_from_stdin()
...@@ -25,27 +25,27 @@ class Model(ModelBase): ...@@ -25,27 +25,27 @@ class Model(ModelBase):
ModelBase.__init__(self, config) ModelBase.__init__(self, config)
def wide_part(self, data): def wide_part(self, data):
out = fluid.layers.fc(input=data, out = fluid.layers.fc(
size=1, input=data,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, size=1,
scale=1.0 / math.sqrt( param_attr=fluid.ParamAttr(
data.shape[ initializer=fluid.initializer.TruncatedNormal(
1])), loc=0.0, scale=1.0 / math.sqrt(data.shape[1])),
regularizer=fluid.regularizer.L2DecayRegularizer( regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)), regularization_coeff=1e-4)),
act=None, act=None,
name='wide') name='wide')
return out return out
def fc(self, data, hidden_units, active, tag): def fc(self, data, hidden_units, active, tag):
output = fluid.layers.fc(input=data, output = fluid.layers.fc(
size=hidden_units, input=data,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, size=hidden_units,
scale=1.0 / math.sqrt( param_attr=fluid.ParamAttr(
data.shape[ initializer=fluid.initializer.TruncatedNormal(
1]))), loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))),
act=active, act=active,
name=tag) name=tag)
return output return output
...@@ -62,43 +62,63 @@ class Model(ModelBase): ...@@ -62,43 +62,63 @@ class Model(ModelBase):
deep_input = self._dense_data_var[1] deep_input = self._dense_data_var[1]
label = self._sparse_data_var[0] label = self._sparse_data_var[0]
hidden1_units = envs.get_global_env("hyper_parameters.hidden1_units", 75, self._namespace) hidden1_units = envs.get_global_env("hyper_parameters.hidden1_units",
hidden2_units = envs.get_global_env("hyper_parameters.hidden2_units", 50, self._namespace) 75, self._namespace)
hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units", 25, self._namespace) hidden2_units = envs.get_global_env("hyper_parameters.hidden2_units",
50, self._namespace)
hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units",
25, self._namespace)
wide_output = self.wide_part(wide_input) wide_output = self.wide_part(wide_input)
deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units, hidden3_units) deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units,
hidden3_units)
wide_model = fluid.layers.fc(input=wide_output,
size=1, wide_model = fluid.layers.fc(
param_attr=fluid.ParamAttr( input=wide_output,
initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)), size=1,
act=None, param_attr=fluid.ParamAttr(
name='w_wide') initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0)),
deep_model = fluid.layers.fc(input=deep_output, act=None,
size=1, name='w_wide')
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)), deep_model = fluid.layers.fc(
act=None, input=deep_output,
name='w_deep') size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0)),
act=None,
name='w_deep')
prediction = fluid.layers.elementwise_add(wide_model, deep_model) prediction = fluid.layers.elementwise_add(wide_model, deep_model)
pred = fluid.layers.sigmoid(fluid.layers.clip(prediction, min=-15.0, max=15.0), name="prediction") pred = fluid.layers.sigmoid(
fluid.layers.clip(
prediction, min=-15.0, max=15.0),
name="prediction")
num_seqs = fluid.layers.create_tensor(dtype='int64') num_seqs = fluid.layers.create_tensor(dtype='int64')
acc = fluid.layers.accuracy(input=pred, label=fluid.layers.cast(x=label, dtype='int64'), total=num_seqs) acc = fluid.layers.accuracy(
auc_var, batch_auc, auc_states = fluid.layers.auc(input=pred, label=fluid.layers.cast(x=label, dtype='int64')) input=pred,
label=fluid.layers.cast(
x=label, dtype='int64'),
total=num_seqs)
auc_var, batch_auc, auc_states = fluid.layers.auc(
input=pred, label=fluid.layers.cast(
x=label, dtype='int64'))
self._metrics["AUC"] = auc_var self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc self._metrics["BATCH_AUC"] = batch_auc
self._metrics["ACC"] = acc self._metrics["ACC"] = acc
cost = fluid.layers.sigmoid_cross_entropy_with_logits(x=prediction, label=fluid.layers.cast(label, dtype='float32')) cost = fluid.layers.sigmoid_cross_entropy_with_logits(
x=prediction, label=fluid.layers.cast(
label, dtype='float32'))
avg_cost = fluid.layers.mean(cost) avg_cost = fluid.layers.mean(cost)
self._cost = avg_cost self._cost = avg_cost
def optimizer(self): def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
import shutil import shutil
import sys import sys
......
...@@ -21,6 +21,7 @@ except ImportError: ...@@ -21,6 +21,7 @@ except ImportError:
import pickle import pickle
import paddle.fluid.incubate.data_generator as dg import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator): class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config): def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self) dg.MultiSlotDataGenerator.__init__(self)
...@@ -48,7 +49,8 @@ class TrainReader(dg.MultiSlotDataGenerator): ...@@ -48,7 +49,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
feat_idx, feat_value, label = self._process_line(line) feat_idx, feat_value, label = self._process_line(line)
s = "" s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]: for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
('label', label)]:
k = i[0] k = i[0]
v = i[1] v = i[1]
for j in v: for j in v:
...@@ -58,6 +60,7 @@ class TrainReader(dg.MultiSlotDataGenerator): ...@@ -58,6 +60,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return data_iter return data_iter
reader = TrainReader("../config.yaml") reader = TrainReader("../config.yaml")
reader.init() reader.init()
reader.run_from_stdin() reader.run_from_stdin()
...@@ -28,18 +28,22 @@ class Model(ModelBase): ...@@ -28,18 +28,22 @@ class Model(ModelBase):
loc=0.0, scale=init_value_) loc=0.0, scale=init_value_)
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace) sparse_feature_number = envs.get_global_env(
sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) "hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
# ------------------------- network input -------------------------- # ------------------------- network input --------------------------
num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace) num_field = envs.get_global_env("hyper_parameters.num_field", None,
self._namespace)
raw_feat_idx = self._sparse_data_var[1] raw_feat_idx = self._sparse_data_var[1]
raw_feat_value = self._dense_data_var[0] raw_feat_value = self._dense_data_var[0]
self.label = self._sparse_data_var[0] self.label = self._sparse_data_var[0]
feat_idx = raw_feat_idx feat_idx = raw_feat_idx
feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1]) # None * num_field * 1 feat_value = fluid.layers.reshape(
raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
feat_embeddings = fluid.embedding( feat_embeddings = fluid.embedding(
input=feat_idx, input=feat_idx,
...@@ -48,9 +52,9 @@ class Model(ModelBase): ...@@ -48,9 +52,9 @@ class Model(ModelBase):
size=[sparse_feature_number + 1, sparse_feature_dim], size=[sparse_feature_number + 1, sparse_feature_dim],
padding_idx=0, padding_idx=0,
param_attr=fluid.ParamAttr(initializer=initer)) param_attr=fluid.ParamAttr(initializer=initer))
feat_embeddings = fluid.layers.reshape( feat_embeddings = fluid.layers.reshape(feat_embeddings, [
feat_embeddings, -1, num_field, sparse_feature_dim
[-1, num_field, sparse_feature_dim]) # None * num_field * embedding_size ]) # None * num_field * embedding_size
feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size
# -------------------- linear -------------------- # -------------------- linear --------------------
...@@ -73,7 +77,8 @@ class Model(ModelBase): ...@@ -73,7 +77,8 @@ class Model(ModelBase):
# -------------------- CIN -------------------- # -------------------- CIN --------------------
layer_sizes_cin = envs.get_global_env("hyper_parameters.layer_sizes_cin", None, self._namespace) layer_sizes_cin = envs.get_global_env(
"hyper_parameters.layer_sizes_cin", None, self._namespace)
Xs = [feat_embeddings] Xs = [feat_embeddings]
last_s = num_field last_s = num_field
for s in layer_sizes_cin: for s in layer_sizes_cin:
...@@ -84,7 +89,8 @@ class Model(ModelBase): ...@@ -84,7 +89,8 @@ class Model(ModelBase):
1]) # None, embedding_size, num_field, 1 1]) # None, embedding_size, num_field, 1
X_k = fluid.layers.reshape( X_k = fluid.layers.reshape(
fluid.layers.transpose(Xs[-1], [0, 2, 1]), fluid.layers.transpose(Xs[-1], [0, 2, 1]),
[-1, sparse_feature_dim, 1, last_s]) # None, embedding_size, 1, last_s [-1, sparse_feature_dim, 1,
last_s]) # None, embedding_size, 1, last_s
Z_k_1 = fluid.layers.matmul( Z_k_1 = fluid.layers.matmul(
X_0, X_k) # None, embedding_size, num_field, last_s X_0, X_k) # None, embedding_size, num_field, last_s
...@@ -124,16 +130,19 @@ class Model(ModelBase): ...@@ -124,16 +130,19 @@ class Model(ModelBase):
# -------------------- DNN -------------------- # -------------------- DNN --------------------
layer_sizes_dnn = envs.get_global_env("hyper_parameters.layer_sizes_dnn", None, self._namespace) layer_sizes_dnn = envs.get_global_env(
act = envs.get_global_env("hyper_parameters.act", None, self._namespace) "hyper_parameters.layer_sizes_dnn", None, self._namespace)
act = envs.get_global_env("hyper_parameters.act", None,
self._namespace)
y_dnn = fluid.layers.reshape(feat_embeddings, y_dnn = fluid.layers.reshape(feat_embeddings,
[-1, num_field * sparse_feature_dim]) [-1, num_field * sparse_feature_dim])
for s in layer_sizes_dnn: for s in layer_sizes_dnn:
y_dnn = fluid.layers.fc(input=y_dnn, y_dnn = fluid.layers.fc(
size=s, input=y_dnn,
act=act, size=s,
param_attr=fluid.ParamAttr(initializer=initer), act=act,
bias_attr=None) param_attr=fluid.ParamAttr(initializer=initer),
bias_attr=None)
y_dnn = fluid.layers.fc(input=y_dnn, y_dnn = fluid.layers.fc(input=y_dnn,
size=1, size=1,
act=None, act=None,
...@@ -148,7 +157,10 @@ class Model(ModelBase): ...@@ -148,7 +157,10 @@ class Model(ModelBase):
self.model._init_slots() self.model._init_slots()
self.xdeepfm_net() self.xdeepfm_net()
cost = fluid.layers.log_loss(input=self.predict, label=fluid.layers.cast(self.label, "float32"), epsilon=0.0000001) cost = fluid.layers.log_loss(
input=self.predict,
label=fluid.layers.cast(self.label, "float32"),
epsilon=0.0000001)
batch_cost = fluid.layers.reduce_mean(cost) batch_cost = fluid.layers.reduce_mean(cost)
self._cost = batch_cost self._cost = batch_cost
...@@ -162,7 +174,8 @@ class Model(ModelBase): ...@@ -162,7 +174,8 @@ class Model(ModelBase):
self._metrics["BATCH_AUC"] = batch_auc_var self._metrics["BATCH_AUC"] = batch_auc_var
def optimizer(self): def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -31,5 +31,3 @@ mv diginetica/train.txt train_data ...@@ -31,5 +31,3 @@ mv diginetica/train.txt train_data
mkdir test_data mkdir test_data
mv diginetica/test.txt test_data mv diginetica/test.txt test_data
...@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs ...@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader): class EvaluateReader(Reader):
def init(self): def init(self):
self.batch_size = envs.get_global_env("batch_size", None, "evaluate.reader") self.batch_size = envs.get_global_env("batch_size", None,
"evaluate.reader")
self.input = [] self.input = []
self.length = None self.length = None
...@@ -34,7 +35,8 @@ class EvaluateReader(Reader): ...@@ -34,7 +35,8 @@ class EvaluateReader(Reader):
with open(f, "r") as fin: with open(f, "r") as fin:
for line in fin: for line in fin:
line = line.strip().split('\t') line = line.strip().split('\t')
res.append(tuple([map(int, line[0].split(',')), int(line[1])])) res.append(
tuple([map(int, line[0].split(',')), int(line[1])]))
return res return res
def make_data(self, cur_batch, batch_size): def make_data(self, cur_batch, batch_size):
...@@ -75,10 +77,8 @@ class EvaluateReader(Reader): ...@@ -75,10 +77,8 @@ class EvaluateReader(Reader):
u_deg_out[np.where(u_deg_out == 0)] = 1 u_deg_out[np.where(u_deg_out == 0)] = 1
adj_out.append(np.divide(adj.transpose(), u_deg_out).transpose()) adj_out.append(np.divide(adj.transpose(), u_deg_out).transpose())
seq_index.append( seq_index.append([[id, np.where(node == i)[0][0]] for i in e[0]])
[[id, np.where(node == i)[0][0]] for i in e[0]]) last_index.append([id, np.where(node == e[0][last_id[id]])[0][0]])
last_index.append(
[id, np.where(node == e[0][last_id[id]])[0][0]])
label.append(e[1] - 1) label.append(e[1] - 1)
mask.append([[1] * (last_id[id] + 1) + [0] * mask.append([[1] * (last_id[id] + 1) + [0] *
(max_seq_len - last_id[id] - 1)]) (max_seq_len - last_id[id] - 1)])
...@@ -101,10 +101,13 @@ class EvaluateReader(Reader): ...@@ -101,10 +101,13 @@ class EvaluateReader(Reader):
def _reader(): def _reader():
random.shuffle(self.input) random.shuffle(self.input)
group_remain = self.length % batch_group_size group_remain = self.length % batch_group_size
for bg_id in range(0, self.length - group_remain, batch_group_size): for bg_id in range(0, self.length - group_remain,
cur_bg = copy.deepcopy(self.input[bg_id:bg_id + batch_group_size]) batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id +
batch_group_size])
if train: if train:
cur_bg = sorted(cur_bg, key=lambda x: len(x[0]), reverse=True) cur_bg = sorted(
cur_bg, key=lambda x: len(x[0]), reverse=True)
for i in range(0, batch_group_size, batch_size): for i in range(0, batch_group_size, batch_size):
cur_batch = cur_bg[i:i + batch_size] cur_batch = cur_bg[i:i + batch_size]
yield self.make_data(cur_batch, batch_size) yield self.make_data(cur_batch, batch_size)
......
...@@ -30,15 +30,21 @@ class Model(ModelBase): ...@@ -30,15 +30,21 @@ class Model(ModelBase):
def init_config(self): def init_config(self):
self._fetch_interval = 1 self._fetch_interval = 1
self.items_num, self.ins_num = self.config_read( self.items_num, self.ins_num = self.config_read(
envs.get_global_env("hyper_parameters.config_path", None, self._namespace)) envs.get_global_env("hyper_parameters.config_path", None,
self.train_batch_size = envs.get_global_env("batch_size", None, "train.reader") self._namespace))
self.evaluate_batch_size = envs.get_global_env("batch_size", None, "evaluate.reader") self.train_batch_size = envs.get_global_env("batch_size", None,
self.hidden_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) "train.reader")
self.step = envs.get_global_env("hyper_parameters.gnn_propogation_steps", None, self._namespace) self.evaluate_batch_size = envs.get_global_env("batch_size", None,
"evaluate.reader")
self.hidden_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
self.step = envs.get_global_env(
"hyper_parameters.gnn_propogation_steps", None, self._namespace)
def config_read(self, config_path=None): def config_read(self, config_path=None):
if config_path is None: if config_path is None:
raise ValueError("please set train.model.hyper_parameters.config_path at first") raise ValueError(
"please set train.model.hyper_parameters.config_path at first")
with open(config_path, "r") as fin: with open(config_path, "r") as fin:
item_nums = int(fin.readline().strip()) item_nums = int(fin.readline().strip())
ins_nums = int(fin.readline().strip()) ins_nums = int(fin.readline().strip())
...@@ -46,100 +52,108 @@ class Model(ModelBase): ...@@ -46,100 +52,108 @@ class Model(ModelBase):
def input(self, bs): def input(self, bs):
self.items = fluid.data( self.items = fluid.data(
name="items", name="items", shape=[bs, -1],
shape=[bs, -1],
dtype="int64") # [batch_size, uniq_max] dtype="int64") # [batch_size, uniq_max]
self.seq_index = fluid.data( self.seq_index = fluid.data(
name="seq_index", name="seq_index", shape=[bs, -1, 2],
shape=[bs, -1, 2],
dtype="int32") # [batch_size, seq_max, 2] dtype="int32") # [batch_size, seq_max, 2]
self.last_index = fluid.data( self.last_index = fluid.data(
name="last_index", name="last_index", shape=[bs, 2], dtype="int32") # [batch_size, 2]
shape=[bs, 2],
dtype="int32") # [batch_size, 2]
self.adj_in = fluid.data( self.adj_in = fluid.data(
name="adj_in", name="adj_in", shape=[bs, -1, -1],
shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max] dtype="float32") # [batch_size, seq_max, seq_max]
self.adj_out = fluid.data( self.adj_out = fluid.data(
name="adj_out", name="adj_out", shape=[bs, -1, -1],
shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max] dtype="float32") # [batch_size, seq_max, seq_max]
self.mask = fluid.data( self.mask = fluid.data(
name="mask", name="mask", shape=[bs, -1, 1],
shape=[bs, -1, 1],
dtype="float32") # [batch_size, seq_max, 1] dtype="float32") # [batch_size, seq_max, 1]
self.label = fluid.data( self.label = fluid.data(
name="label", name="label", shape=[bs, 1], dtype="int64") # [batch_size, 1]
shape=[bs, 1],
dtype="int64") # [batch_size, 1]
res = [self.items, self.seq_index, self.last_index, self.adj_in, self.adj_out, self.mask, self.label] res = [
self.items, self.seq_index, self.last_index, self.adj_in,
self.adj_out, self.mask, self.label
]
return res return res
def train_input(self): def train_input(self):
res = self.input(self.train_batch_size) res = self.input(self.train_batch_size)
self._data_var = res self._data_var = res
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace) use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
False, self._namespace)
if self._platform != "LINUX" or use_dataloader: if self._platform != "LINUX" or use_dataloader:
self._data_loader = fluid.io.DataLoader.from_generator( self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=256, use_double_buffer=False, iterable=False) feed_list=self._data_var,
capacity=256,
use_double_buffer=False,
iterable=False)
def net(self, items_num, hidden_size, step, bs): def net(self, items_num, hidden_size, step, bs):
stdv = 1.0 / math.sqrt(hidden_size) stdv = 1.0 / math.sqrt(hidden_size)
def embedding_layer(input, table_name, emb_dim, initializer_instance=None): def embedding_layer(input,
table_name,
emb_dim,
initializer_instance=None):
emb = fluid.embedding( emb = fluid.embedding(
input=input, input=input,
size=[items_num, emb_dim], size=[items_num, emb_dim],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name=table_name, name=table_name, initializer=initializer_instance), )
initializer=initializer_instance),
)
return emb return emb
sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv) sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv)
items_emb = embedding_layer(self.items, "emb", hidden_size, sparse_initializer) items_emb = embedding_layer(self.items, "emb", hidden_size,
sparse_initializer)
pre_state = items_emb pre_state = items_emb
for i in range(step): for i in range(step):
pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size]) pre_state = layers.reshape(
x=pre_state, shape=[bs, -1, hidden_size])
state_in = layers.fc( state_in = layers.fc(
input=pre_state, input=pre_state,
name="state_in", name="state_in",
size=hidden_size, size=hidden_size,
act=None, act=None,
num_flatten_dims=2, num_flatten_dims=2,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( param_attr=fluid.ParamAttr(
low=-stdv, high=stdv)), initializer=fluid.initializer.Uniform(
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)),
low=-stdv, high=stdv))) # [batch_size, uniq_max, h] bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_out = layers.fc( state_out = layers.fc(
input=pre_state, input=pre_state,
name="state_out", name="state_out",
size=hidden_size, size=hidden_size,
act=None, act=None,
num_flatten_dims=2, num_flatten_dims=2,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( param_attr=fluid.ParamAttr(
low=-stdv, high=stdv)), initializer=fluid.initializer.Uniform(
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)),
low=-stdv, high=stdv))) # [batch_size, uniq_max, h] bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_adj_in = layers.matmul(self.adj_in, state_in) # [batch_size, uniq_max, h] state_adj_in = layers.matmul(self.adj_in,
state_adj_out = layers.matmul(self.adj_out, state_out) # [batch_size, uniq_max, h] state_in) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(
self.adj_out, state_out) # [batch_size, uniq_max, h]
gru_input = layers.concat([state_adj_in, state_adj_out], axis=2) gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2]) gru_input = layers.reshape(
gru_fc = layers.fc( x=gru_input, shape=[-1, hidden_size * 2])
input=gru_input, gru_fc = layers.fc(input=gru_input,
name="gru_fc", name="gru_fc",
size=3 * hidden_size, size=3 * hidden_size,
bias_attr=False) bias_attr=False)
pre_state, _, _ = fluid.layers.gru_unit( pre_state, _, _ = fluid.layers.gru_unit(
input=gru_fc, input=gru_fc,
hidden=layers.reshape(x=pre_state, shape=[-1, hidden_size]), hidden=layers.reshape(
x=pre_state, shape=[-1, hidden_size]),
size=3 * hidden_size) size=3 * hidden_size)
final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size]) final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
...@@ -153,24 +167,22 @@ class Model(ModelBase): ...@@ -153,24 +167,22 @@ class Model(ModelBase):
bias_attr=False, bias_attr=False,
act=None, act=None,
num_flatten_dims=2, num_flatten_dims=2,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, seq_max, h]
low=-stdv, high=stdv))) # [batch_size, seq_max, h] last_fc = layers.fc(input=last,
last_fc = layers.fc( name="last_fc",
input=last, size=hidden_size,
name="last_fc", bias_attr=False,
size=hidden_size, act=None,
bias_attr=False, num_flatten_dims=1,
act=None, param_attr=fluid.ParamAttr(
num_flatten_dims=1, initializer=fluid.initializer.Uniform(
param_attr=fluid.ParamAttr( low=-stdv, high=stdv))) # [bathc_size, h]
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [bathc_size, h]
seq_fc_t = layers.transpose( seq_fc_t = layers.transpose(
seq_fc, perm=[1, 0, 2]) # [seq_max, batch_size, h] seq_fc, perm=[1, 0, 2]) # [seq_max, batch_size, h]
add = layers.elementwise_add( add = layers.elementwise_add(seq_fc_t,
seq_fc_t, last_fc) # [seq_max, batch_size, h] last_fc) # [seq_max, batch_size, h]
b = layers.create_parameter( b = layers.create_parameter(
shape=[hidden_size], shape=[hidden_size],
dtype='float32', dtype='float32',
...@@ -188,12 +200,13 @@ class Model(ModelBase): ...@@ -188,12 +200,13 @@ class Model(ModelBase):
act=None, act=None,
num_flatten_dims=2, num_flatten_dims=2,
bias_attr=False, bias_attr=False,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, seq_max, 1]
low=-stdv, high=stdv))) # [batch_size, seq_max, 1]
weight *= self.mask weight *= self.mask
weight_mask = layers.elementwise_mul(seq, weight, axis=0) # [batch_size, seq_max, h] weight_mask = layers.elementwise_mul(
global_attention = layers.reduce_sum(weight_mask, dim=1) # [batch_size, h] seq, weight, axis=0) # [batch_size, seq_max, h]
global_attention = layers.reduce_sum(
weight_mask, dim=1) # [batch_size, h]
final_attention = layers.concat( final_attention = layers.concat(
[global_attention, last], axis=1) # [batch_size, 2*h] [global_attention, last], axis=1) # [batch_size, 2*h]
...@@ -213,7 +226,8 @@ class Model(ModelBase): ...@@ -213,7 +226,8 @@ class Model(ModelBase):
# persistable=True, # persistable=True,
# name="all_vocab") # name="all_vocab")
all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32') all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32')
all_vocab = fluid.layers.cast(x=fluid.layers.assign(all_vocab), dtype='int64') all_vocab = fluid.layers.cast(
x=fluid.layers.assign(all_vocab), dtype='int64')
all_emb = fluid.embedding( all_emb = fluid.embedding(
input=all_vocab, input=all_vocab,
...@@ -240,15 +254,19 @@ class Model(ModelBase): ...@@ -240,15 +254,19 @@ class Model(ModelBase):
def train_net(self): def train_net(self):
self.train_input() self.train_input()
self.net(self.items_num, self.hidden_size, self.step, self.train_batch_size) self.net(self.items_num, self.hidden_size, self.step,
self.train_batch_size)
self.avg_loss() self.avg_loss()
self.metrics() self.metrics()
def optimizer(self): def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
step_per_epoch = self.ins_num // self.train_batch_size step_per_epoch = self.ins_num // self.train_batch_size
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None, self._namespace) decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None, self._namespace) self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
self._namespace)
l2 = envs.get_global_env("hyper_parameters.l2", None, self._namespace) l2 = envs.get_global_env("hyper_parameters.l2", None, self._namespace)
optimizer = fluid.optimizer.Adam( optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay( learning_rate=fluid.layers.exponential_decay(
...@@ -266,10 +284,14 @@ class Model(ModelBase): ...@@ -266,10 +284,14 @@ class Model(ModelBase):
self._infer_data_var = res self._infer_data_var = res
self._infer_data_loader = fluid.io.DataLoader.from_generator( self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self): def infer_net(self):
self.infer_input() self.infer_input()
self.net(self.items_num, self.hidden_size, self.step, self.evaluate_batch_size) self.net(self.items_num, self.hidden_size, self.step,
self.evaluate_batch_size)
self._infer_results['acc'] = self.acc self._infer_results['acc'] = self.acc
self._infer_results['loss'] = self.loss self._infer_results['loss'] = self.loss
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse import argparse
import time import time
import pickle import pickle
...@@ -10,6 +24,7 @@ parser.add_argument( ...@@ -10,6 +24,7 @@ parser.add_argument(
help='dataset dir: diginetica/yoochoose1_4/yoochoose1_64/sample') help='dataset dir: diginetica/yoochoose1_4/yoochoose1_64/sample')
opt = parser.parse_args() opt = parser.parse_args()
def process_data(file_type): def process_data(file_type):
path = os.path.join(opt.data_dir, file_type) path = os.path.join(opt.data_dir, file_type)
output_path = os.path.splitext(path)[0] + ".txt" output_path = os.path.splitext(path)[0] + ".txt"
...@@ -23,6 +38,7 @@ def process_data(file_type): ...@@ -23,6 +38,7 @@ def process_data(file_type):
fout.write(str(data[i][1])) fout.write(str(data[i][1]))
fout.write("\n") fout.write("\n")
process_data("train") process_data("train")
process_data("test") process_data("test")
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import requests import requests
import sys import sys
import time import time
......
...@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs ...@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs
class TrainReader(Reader): class TrainReader(Reader):
def init(self): def init(self):
self.batch_size = envs.get_global_env("batch_size", None, "train.reader") self.batch_size = envs.get_global_env("batch_size", None,
"train.reader")
self.input = [] self.input = []
self.length = None self.length = None
...@@ -34,7 +35,8 @@ class TrainReader(Reader): ...@@ -34,7 +35,8 @@ class TrainReader(Reader):
with open(f, "r") as fin: with open(f, "r") as fin:
for line in fin: for line in fin:
line = line.strip().split('\t') line = line.strip().split('\t')
res.append(tuple([map(int, line[0].split(',')), int(line[1])])) res.append(
tuple([map(int, line[0].split(',')), int(line[1])]))
return res return res
def make_data(self, cur_batch, batch_size): def make_data(self, cur_batch, batch_size):
...@@ -75,10 +77,8 @@ class TrainReader(Reader): ...@@ -75,10 +77,8 @@ class TrainReader(Reader):
u_deg_out[np.where(u_deg_out == 0)] = 1 u_deg_out[np.where(u_deg_out == 0)] = 1
adj_out.append(np.divide(adj.transpose(), u_deg_out).transpose()) adj_out.append(np.divide(adj.transpose(), u_deg_out).transpose())
seq_index.append( seq_index.append([[id, np.where(node == i)[0][0]] for i in e[0]])
[[id, np.where(node == i)[0][0]] for i in e[0]]) last_index.append([id, np.where(node == e[0][last_id[id]])[0][0]])
last_index.append(
[id, np.where(node == e[0][last_id[id]])[0][0]])
label.append(e[1] - 1) label.append(e[1] - 1)
mask.append([[1] * (last_id[id] + 1) + [0] * mask.append([[1] * (last_id[id] + 1) + [0] *
(max_seq_len - last_id[id] - 1)]) (max_seq_len - last_id[id] - 1)])
...@@ -101,10 +101,13 @@ class TrainReader(Reader): ...@@ -101,10 +101,13 @@ class TrainReader(Reader):
def _reader(): def _reader():
random.shuffle(self.input) random.shuffle(self.input)
group_remain = self.length % batch_group_size group_remain = self.length % batch_group_size
for bg_id in range(0, self.length - group_remain, batch_group_size): for bg_id in range(0, self.length - group_remain,
cur_bg = copy.deepcopy(self.input[bg_id:bg_id + batch_group_size]) batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id +
batch_group_size])
if train: if train:
cur_bg = sorted(cur_bg, key=lambda x: len(x[0]), reverse=True) cur_bg = sorted(
cur_bg, key=lambda x: len(x[0]), reverse=True)
for i in range(0, batch_group_size, batch_size): for i in range(0, batch_group_size, batch_size):
cur_batch = cur_bg[i:i + batch_size] cur_batch = cur_bg[i:i + batch_size]
yield self.make_data(cur_batch, batch_size) yield self.make_data(cur_batch, batch_size)
......
...@@ -24,14 +24,22 @@ class Model(ModelBase): ...@@ -24,14 +24,22 @@ class Model(ModelBase):
def all_vocab_network(self, is_infer=False): def all_vocab_network(self, is_infer=False):
""" network definition """ """ network definition """
recall_k = envs.get_global_env("hyper_parameters.recall_k", None, self._namespace) recall_k = envs.get_global_env("hyper_parameters.recall_k", None,
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace) self._namespace)
hid_size = envs.get_global_env("hyper_parameters.hid_size", None, self._namespace) vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
init_low_bound = envs.get_global_env("hyper_parameters.init_low_bound", None, self._namespace) self._namespace)
init_high_bound = envs.get_global_env("hyper_parameters.init_high_bound", None, self._namespace) hid_size = envs.get_global_env("hyper_parameters.hid_size", None,
emb_lr_x = envs.get_global_env("hyper_parameters.emb_lr_x", None, self._namespace) self._namespace)
gru_lr_x = envs.get_global_env("hyper_parameters.gru_lr_x", None, self._namespace) init_low_bound = envs.get_global_env("hyper_parameters.init_low_bound",
fc_lr_x = envs.get_global_env("hyper_parameters.fc_lr_x", None, self._namespace) None, self._namespace)
init_high_bound = envs.get_global_env(
"hyper_parameters.init_high_bound", None, self._namespace)
emb_lr_x = envs.get_global_env("hyper_parameters.emb_lr_x", None,
self._namespace)
gru_lr_x = envs.get_global_env("hyper_parameters.gru_lr_x", None,
self._namespace)
fc_lr_x = envs.get_global_env("hyper_parameters.fc_lr_x", None,
self._namespace)
# Input data # Input data
src_wordseq = fluid.data( src_wordseq = fluid.data(
name="src_wordseq", shape=[None, 1], dtype="int64", lod_level=1) name="src_wordseq", shape=[None, 1], dtype="int64", lod_level=1)
...@@ -41,7 +49,10 @@ class Model(ModelBase): ...@@ -41,7 +49,10 @@ class Model(ModelBase):
if is_infer: if is_infer:
self._infer_data_var = [src_wordseq, dst_wordseq] self._infer_data_var = [src_wordseq, dst_wordseq]
self._infer_data_loader = fluid.io.DataLoader.from_generator( self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
emb = fluid.embedding( emb = fluid.embedding(
input=src_wordseq, input=src_wordseq,
...@@ -56,7 +67,8 @@ class Model(ModelBase): ...@@ -56,7 +67,8 @@ class Model(ModelBase):
size=hid_size * 3, size=hid_size * 3,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform( initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound), low=init_low_bound,
high=init_high_bound),
learning_rate=gru_lr_x)) learning_rate=gru_lr_x))
gru_h0 = fluid.layers.dynamic_gru( gru_h0 = fluid.layers.dynamic_gru(
input=fc0, input=fc0,
......
...@@ -25,9 +25,12 @@ class Model(ModelBase): ...@@ -25,9 +25,12 @@ class Model(ModelBase):
ModelBase.__init__(self, config) ModelBase.__init__(self, config)
def input_data(self, is_infer=False): def input_data(self, is_infer=False):
user_input = fluid.data(name="user_input", shape=[-1, 1], dtype="int64", lod_level=0) user_input = fluid.data(
item_input = fluid.data(name="item_input", shape=[-1, 1], dtype="int64", lod_level=0) name="user_input", shape=[-1, 1], dtype="int64", lod_level=0)
label = fluid.data(name="label", shape=[-1, 1], dtype="int64", lod_level=0) item_input = fluid.data(
name="item_input", shape=[-1, 1], dtype="int64", lod_level=0)
label = fluid.data(
name="label", shape=[-1, 1], dtype="int64", lod_level=0)
if is_infer: if is_infer:
inputs = [user_input] + [item_input] inputs = [user_input] + [item_input]
else: else:
...@@ -35,81 +38,104 @@ class Model(ModelBase): ...@@ -35,81 +38,104 @@ class Model(ModelBase):
self._data_var = inputs self._data_var = inputs
return inputs return inputs
def net(self, inputs, is_infer=False): def net(self, inputs, is_infer=False):
num_users = envs.get_global_env("hyper_parameters.num_users", None, self._namespace) num_users = envs.get_global_env("hyper_parameters.num_users", None,
num_items = envs.get_global_env("hyper_parameters.num_items", None, self._namespace) self._namespace)
latent_dim = envs.get_global_env("hyper_parameters.latent_dim", None, self._namespace) num_items = envs.get_global_env("hyper_parameters.num_items", None,
layers = envs.get_global_env("hyper_parameters.layers", None, self._namespace) self._namespace)
latent_dim = envs.get_global_env("hyper_parameters.latent_dim", None,
num_layer = len(layers) #Number of layers in the MLP self._namespace)
layers = envs.get_global_env("hyper_parameters.layers", None,
MF_Embedding_User = fluid.embedding(input=inputs[0], self._namespace)
size=[num_users, latent_dim],
param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01), num_layer = len(layers) #Number of layers in the MLP
is_sparse=True)
MF_Embedding_Item = fluid.embedding(input=inputs[1], MF_Embedding_User = fluid.embedding(
size=[num_items, latent_dim], input=inputs[0],
param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01), size=[num_users, latent_dim],
is_sparse=True) param_attr=fluid.initializer.Normal(
loc=0.0, scale=0.01),
MLP_Embedding_User = fluid.embedding(input=inputs[0], is_sparse=True)
size=[num_users, int(layers[0] / 2)], MF_Embedding_Item = fluid.embedding(
param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01), input=inputs[1],
is_sparse=True) size=[num_items, latent_dim],
MLP_Embedding_Item = fluid.embedding(input=inputs[1], param_attr=fluid.initializer.Normal(
size=[num_items, int(layers[0] / 2)], loc=0.0, scale=0.01),
param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01), is_sparse=True)
is_sparse=True)
MLP_Embedding_User = fluid.embedding(
input=inputs[0],
size=[num_users, int(layers[0] / 2)],
param_attr=fluid.initializer.Normal(
loc=0.0, scale=0.01),
is_sparse=True)
MLP_Embedding_Item = fluid.embedding(
input=inputs[1],
size=[num_items, int(layers[0] / 2)],
param_attr=fluid.initializer.Normal(
loc=0.0, scale=0.01),
is_sparse=True)
# MF part # MF part
mf_user_latent = fluid.layers.flatten(x=MF_Embedding_User, axis=1) mf_user_latent = fluid.layers.flatten(x=MF_Embedding_User, axis=1)
mf_item_latent = fluid.layers.flatten(x=MF_Embedding_Item, axis=1) mf_item_latent = fluid.layers.flatten(x=MF_Embedding_Item, axis=1)
mf_vector = fluid.layers.elementwise_mul(mf_user_latent, mf_item_latent) mf_vector = fluid.layers.elementwise_mul(mf_user_latent,
mf_item_latent)
# MLP part # MLP part
# The 0-th layer is the concatenation of embedding layers # The 0-th layer is the concatenation of embedding layers
mlp_user_latent = fluid.layers.flatten(x=MLP_Embedding_User, axis=1) mlp_user_latent = fluid.layers.flatten(x=MLP_Embedding_User, axis=1)
mlp_item_latent = fluid.layers.flatten(x=MLP_Embedding_Item, axis=1) mlp_item_latent = fluid.layers.flatten(x=MLP_Embedding_Item, axis=1)
mlp_vector = fluid.layers.concat(input=[mlp_user_latent, mlp_item_latent], axis=-1) mlp_vector = fluid.layers.concat(
input=[mlp_user_latent, mlp_item_latent], axis=-1)
for i in range(1, num_layer): for i in range(1, num_layer):
mlp_vector = fluid.layers.fc(input=mlp_vector, mlp_vector = fluid.layers.fc(
size=layers[i], input=mlp_vector,
act='relu', size=layers[i],
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(mlp_vector.shape[1])), act='relu',
regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)), param_attr=fluid.ParamAttr(
name='layer_' + str(i)) initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0 / math.sqrt(mlp_vector.shape[1])),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
name='layer_' + str(i))
# Concatenate MF and MLP parts # Concatenate MF and MLP parts
predict_vector = fluid.layers.concat(input=[mf_vector, mlp_vector], axis=-1) predict_vector = fluid.layers.concat(
input=[mf_vector, mlp_vector], axis=-1)
# Final prediction layer # Final prediction layer
prediction = fluid.layers.fc(input=predict_vector, prediction = fluid.layers.fc(
size=1, input=predict_vector,
act='sigmoid', size=1,
param_attr=fluid.initializer.MSRAInitializer(uniform=True), act='sigmoid',
name='prediction') param_attr=fluid.initializer.MSRAInitializer(uniform=True),
name='prediction')
if is_infer: if is_infer:
self._infer_results["prediction"] = prediction self._infer_results["prediction"] = prediction
return return
cost = fluid.layers.log_loss(input=prediction, label=fluid.layers.cast(x=inputs[2], dtype='float32')) cost = fluid.layers.log_loss(
input=prediction,
label=fluid.layers.cast(
x=inputs[2], dtype='float32'))
avg_cost = fluid.layers.mean(cost) avg_cost = fluid.layers.mean(cost)
self._cost = avg_cost self._cost = avg_cost
self._metrics["cost"] = avg_cost self._metrics["cost"] = avg_cost
def train_net(self): def train_net(self):
input_data = self.input_data() input_data = self.input_data()
self.net(input_data) self.net(input_data)
def infer_net(self): def infer_net(self):
self._infer_data_var = self.input_data(is_infer=True) self._infer_data_var = self.input_data(is_infer=True)
self._infer_data_loader = fluid.io.DataLoader.from_generator( self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self.net(self._infer_data_var, is_infer=True) self.net(self._infer_data_var, is_infer=True)
...@@ -33,7 +33,9 @@ class EvaluateReader(Reader): ...@@ -33,7 +33,9 @@ class EvaluateReader(Reader):
This function needs to be implemented by the user, based on data format This function needs to be implemented by the user, based on data format
""" """
features = line.strip().split(',') features = line.strip().split(',')
feature_name = ["user_input", "item_input"] feature_name = ["user_input", "item_input"]
yield zip(feature_name, [[int(features[0])]] + [[int(features[1])]]) yield zip(feature_name,
[[int(features[0])]] + [[int(features[1])]])
return reader return reader
...@@ -33,10 +33,9 @@ class TrainReader(Reader): ...@@ -33,10 +33,9 @@ class TrainReader(Reader):
This function needs to be implemented by the user, based on data format This function needs to be implemented by the user, based on data format
""" """
features = line.strip().split(',') features = line.strip().split(',')
feature_name = ["user_input", "item_input", "label"] feature_name = ["user_input", "item_input", "label"]
yield zip(feature_name, [[int(features[0])]] + [[int(features[1])]] + [[int(features[2])]]) yield zip(feature_name, [[int(features[0])]] +
[[int(features[1])]] + [[int(features[2])]])
return reader return reader
...@@ -78,4 +78,3 @@ python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn ...@@ -78,4 +78,3 @@ python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
| MOVIELENS | NCF | 0.688 | -- | | MOVIELENS | NCF | 0.688 | -- |
| -- | Youtube | -- | -- | | -- | Youtube | -- | -- |
| 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 | | 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 |
...@@ -79,9 +79,12 @@ class Model(ModelBase): ...@@ -79,9 +79,12 @@ class Model(ModelBase):
return correct return correct
def train(self): def train(self):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace) vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace) self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace) emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None,
self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None,
self._namespace)
emb_shape = [vocab_size, emb_dim] emb_shape = [vocab_size, emb_dim]
self.user_encoder = GrnnEncoder() self.user_encoder = GrnnEncoder()
...@@ -131,24 +134,34 @@ class Model(ModelBase): ...@@ -131,24 +134,34 @@ class Model(ModelBase):
self.train() self.train()
def infer(self): def infer(self):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace) vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace) self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace) emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None,
self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None,
self._namespace)
user_data = fluid.data( user_data = fluid.data(
name="user", shape=[None, 1], dtype="int64", lod_level=1) name="user", shape=[None, 1], dtype="int64", lod_level=1)
all_item_data = fluid.data( all_item_data = fluid.data(
name="all_item", shape=[None, vocab_size], dtype="int64") name="all_item", shape=[None, vocab_size], dtype="int64")
pos_label = fluid.data(name="pos_label", shape=[None, 1], dtype="int64") pos_label = fluid.data(
name="pos_label", shape=[None, 1], dtype="int64")
self._infer_data_var = [user_data, all_item_data, pos_label] self._infer_data_var = [user_data, all_item_data, pos_label]
self._infer_data_loader = fluid.io.DataLoader.from_generator( self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
user_emb = fluid.embedding( user_emb = fluid.embedding(
input=user_data, size=[vocab_size, emb_dim], param_attr="emb.item") input=user_data, size=[vocab_size, emb_dim], param_attr="emb.item")
all_item_emb = fluid.embedding( all_item_emb = fluid.embedding(
input=all_item_data, size=[vocab_size, emb_dim], param_attr="emb.item") input=all_item_data,
all_item_emb_re = fluid.layers.reshape(x=all_item_emb, shape=[-1, emb_dim]) size=[vocab_size, emb_dim],
param_attr="emb.item")
all_item_emb_re = fluid.layers.reshape(
x=all_item_emb, shape=[-1, emb_dim])
user_encoder = GrnnEncoder() user_encoder = GrnnEncoder()
user_enc = user_encoder.forward(user_emb) user_enc = user_encoder.forward(user_emb)
...@@ -156,7 +169,8 @@ class Model(ModelBase): ...@@ -156,7 +169,8 @@ class Model(ModelBase):
size=hidden_size, size=hidden_size,
param_attr='user.w', param_attr='user.w',
bias_attr="user.b") bias_attr="user.b")
user_exp = fluid.layers.expand(x=user_hid, expand_times=[1, vocab_size]) user_exp = fluid.layers.expand(
x=user_hid, expand_times=[1, vocab_size])
user_re = fluid.layers.reshape(x=user_exp, shape=[-1, hidden_size]) user_re = fluid.layers.reshape(x=user_exp, shape=[-1, hidden_size])
all_item_hid = fluid.layers.fc(input=all_item_emb_re, all_item_hid = fluid.layers.fc(input=all_item_emb_re,
......
...@@ -22,7 +22,8 @@ from paddlerec.core.utils import envs ...@@ -22,7 +22,8 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader): class EvaluateReader(Reader):
def init(self): def init(self):
self.vocab_size = envs.get_global_env("vocab_size", 10, "train.model.hyper_parameters") self.vocab_size = envs.get_global_env("vocab_size", 10,
"train.model.hyper_parameters")
def generate_sample(self, line): def generate_sample(self, line):
""" """
...@@ -39,6 +40,9 @@ class EvaluateReader(Reader): ...@@ -39,6 +40,9 @@ class EvaluateReader(Reader):
src = conv_ids[:boundary] src = conv_ids[:boundary]
pos_tgt = [conv_ids[boundary]] pos_tgt = [conv_ids[boundary]]
feature_name = ["user", "all_item", "p_item"] feature_name = ["user", "all_item", "p_item"]
yield zip(feature_name, [src] + [np.arange(self.vocab_size).astype("int64").tolist()] + [pos_tgt]) yield zip(
feature_name,
[src] + [np.arange(self.vocab_size).astype("int64").tolist()] +
[pos_tgt])
return reader return reader
...@@ -24,46 +24,57 @@ class Model(ModelBase): ...@@ -24,46 +24,57 @@ class Model(ModelBase):
ModelBase.__init__(self, config) ModelBase.__init__(self, config)
def input(self): def input(self):
neg_num = int(envs.get_global_env( neg_num = int(
"hyper_parameters.neg_num", None, self._namespace)) envs.get_global_env("hyper_parameters.neg_num", None,
self.input_word = fluid.data(name="input_word", shape=[ self._namespace))
None, 1], dtype='int64') self.input_word = fluid.data(
self.true_word = fluid.data(name='true_label', shape=[ name="input_word", shape=[None, 1], dtype='int64')
None, 1], dtype='int64') self.true_word = fluid.data(
name='true_label', shape=[None, 1], dtype='int64')
self._data_var.append(self.input_word) self._data_var.append(self.input_word)
self._data_var.append(self.true_word) self._data_var.append(self.true_word)
with_shuffle_batch = bool(int(envs.get_global_env( with_shuffle_batch = bool(
"hyper_parameters.with_shuffle_batch", None, self._namespace))) int(
envs.get_global_env("hyper_parameters.with_shuffle_batch",
None, self._namespace)))
if not with_shuffle_batch: if not with_shuffle_batch:
self.neg_word = fluid.data(name="neg_label", shape=[ self.neg_word = fluid.data(
None, neg_num], dtype='int64') name="neg_label", shape=[None, neg_num], dtype='int64')
self._data_var.append(self.neg_word) self._data_var.append(self.neg_word)
if self._platform != "LINUX": if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator( self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def net(self): def net(self):
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
neg_num = int(envs.get_global_env( neg_num = int(
"hyper_parameters.neg_num", None, self._namespace)) envs.get_global_env("hyper_parameters.neg_num", None,
self._namespace))
sparse_feature_number = envs.get_global_env( sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace) "hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env( sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace) "hyper_parameters.sparse_feature_dim", None, self._namespace)
with_shuffle_batch = bool(int(envs.get_global_env( with_shuffle_batch = bool(
"hyper_parameters.with_shuffle_batch", None, self._namespace))) int(
envs.get_global_env("hyper_parameters.with_shuffle_batch",
None, self._namespace)))
def embedding_layer(input, table_name, emb_dim, initializer_instance=None, squeeze=False): def embedding_layer(input,
table_name,
emb_dim,
initializer_instance=None,
squeeze=False):
emb = fluid.embedding( emb = fluid.embedding(
input=input, input=input,
is_sparse=True, is_sparse=True,
is_distributed=is_distributed, is_distributed=is_distributed,
size=[sparse_feature_number, emb_dim], size=[sparse_feature_number, emb_dim],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name=table_name, name=table_name, initializer=initializer_instance), )
initializer=initializer_instance),
)
if squeeze: if squeeze:
return fluid.layers.squeeze(input=emb, axes=[1]) return fluid.layers.squeeze(input=emb, axes=[1])
else: else:
...@@ -73,35 +84,38 @@ class Model(ModelBase): ...@@ -73,35 +84,38 @@ class Model(ModelBase):
emb_initializer = fluid.initializer.Uniform(-init_width, init_width) emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
emb_w_initializer = fluid.initializer.Constant(value=0.0) emb_w_initializer = fluid.initializer.Constant(value=0.0)
input_emb = embedding_layer( input_emb = embedding_layer(self.input_word, "emb", sparse_feature_dim,
self.input_word, "emb", sparse_feature_dim, emb_initializer, True) emb_initializer, True)
true_emb_w = embedding_layer( true_emb_w = embedding_layer(self.true_word, "emb_w",
self.true_word, "emb_w", sparse_feature_dim, emb_w_initializer, True) sparse_feature_dim, emb_w_initializer,
true_emb_b = embedding_layer( True)
self.true_word, "emb_b", 1, emb_w_initializer, True) true_emb_b = embedding_layer(self.true_word, "emb_b", 1,
emb_w_initializer, True)
if with_shuffle_batch: if with_shuffle_batch:
neg_emb_w_list = [] neg_emb_w_list = []
for i in range(neg_num): for i in range(neg_num):
neg_emb_w_list.append(fluid.contrib.layers.shuffle_batch( neg_emb_w_list.append(
true_emb_w)) # shuffle true_word fluid.contrib.layers.shuffle_batch(
true_emb_w)) # shuffle true_word
neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0) neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
neg_emb_w = fluid.layers.reshape( neg_emb_w = fluid.layers.reshape(
neg_emb_w_concat, shape=[-1, neg_num, sparse_feature_dim]) neg_emb_w_concat, shape=[-1, neg_num, sparse_feature_dim])
neg_emb_b_list = [] neg_emb_b_list = []
for i in range(neg_num): for i in range(neg_num):
neg_emb_b_list.append(fluid.contrib.layers.shuffle_batch( neg_emb_b_list.append(
true_emb_b)) # shuffle true_word fluid.contrib.layers.shuffle_batch(
true_emb_b)) # shuffle true_word
neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0) neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0)
neg_emb_b_vec = fluid.layers.reshape( neg_emb_b_vec = fluid.layers.reshape(
neg_emb_b, shape=[-1, neg_num]) neg_emb_b, shape=[-1, neg_num])
else: else:
neg_emb_w = embedding_layer( neg_emb_w = embedding_layer(self.neg_word, "emb_w",
self.neg_word, "emb_w", sparse_feature_dim, emb_w_initializer) sparse_feature_dim, emb_w_initializer)
neg_emb_b = embedding_layer( neg_emb_b = embedding_layer(self.neg_word, "emb_b", 1,
self.neg_word, "emb_b", 1, emb_w_initializer) emb_w_initializer)
neg_emb_b_vec = fluid.layers.reshape( neg_emb_b_vec = fluid.layers.reshape(
neg_emb_b, shape=[-1, neg_num]) neg_emb_b, shape=[-1, neg_num])
...@@ -117,7 +131,8 @@ class Model(ModelBase): ...@@ -117,7 +131,8 @@ class Model(ModelBase):
neg_matmul = fluid.layers.matmul( neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w, transpose_y=True) input_emb_re, neg_emb_w, transpose_y=True)
neg_logits = fluid.layers.elementwise_add( neg_logits = fluid.layers.elementwise_add(
fluid.layers.reshape(neg_matmul, shape=[-1, neg_num]), fluid.layers.reshape(
neg_matmul, shape=[-1, neg_num]),
neg_emb_b_vec) neg_emb_b_vec)
label_ones = fluid.layers.fill_constant_batch_size_like( label_ones = fluid.layers.fill_constant_batch_size_like(
...@@ -136,9 +151,17 @@ class Model(ModelBase): ...@@ -136,9 +151,17 @@ class Model(ModelBase):
neg_xent, dim=1)) neg_xent, dim=1))
self.avg_cost = fluid.layers.reduce_mean(cost) self.avg_cost = fluid.layers.reduce_mean(cost)
global_right_cnt = fluid.layers.create_global_var( global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt", persistable=True, dtype='float32', shape=[1], value=0) name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var( global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt", persistable=True, dtype='float32', shape=[1], value=0) name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True global_total_cnt.stop_gradient = True
...@@ -155,12 +178,12 @@ class Model(ModelBase): ...@@ -155,12 +178,12 @@ class Model(ModelBase):
self.metrics() self.metrics()
def optimizer(self): def optimizer(self):
learning_rate = envs.get_global_env( learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
"hyper_parameters.learning_rate", None, self._namespace) None, self._namespace)
decay_steps = envs.get_global_env( decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
"hyper_parameters.decay_steps", None, self._namespace) self._namespace)
decay_rate = envs.get_global_env( decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
"hyper_parameters.decay_rate", None, self._namespace) self._namespace)
optimizer = fluid.optimizer.SGD( optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay( learning_rate=fluid.layers.exponential_decay(
learning_rate=learning_rate, learning_rate=learning_rate,
...@@ -180,11 +203,15 @@ class Model(ModelBase): ...@@ -180,11 +203,15 @@ class Model(ModelBase):
name="analogy_c", shape=[None], dtype='int64') name="analogy_c", shape=[None], dtype='int64')
self.analogy_d = fluid.data( self.analogy_d = fluid.data(
name="analogy_d", shape=[None], dtype='int64') name="analogy_d", shape=[None], dtype='int64')
self._infer_data_var = [self.analogy_a, self._infer_data_var = [
self.analogy_b, self.analogy_c, self.analogy_d] self.analogy_a, self.analogy_b, self.analogy_c, self.analogy_d
]
self._infer_data_loader = fluid.io.DataLoader.from_generator( self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self): def infer_net(self):
sparse_feature_dim = envs.get_global_env( sparse_feature_dim = envs.get_global_env(
...@@ -216,18 +243,28 @@ class Model(ModelBase): ...@@ -216,18 +243,28 @@ class Model(ModelBase):
dist = fluid.layers.matmul( dist = fluid.layers.matmul(
x=target, y=emb_all_label_l2, transpose_y=True) x=target, y=emb_all_label_l2, transpose_y=True)
values, pred_idx = fluid.layers.topk(input=dist, k=4) values, pred_idx = fluid.layers.topk(input=dist, k=4)
label = fluid.layers.expand(fluid.layers.unsqueeze( label = fluid.layers.expand(
self.analogy_d, axes=[1]), expand_times=[1, 4]) fluid.layers.unsqueeze(
self.analogy_d, axes=[1]),
expand_times=[1, 4])
label_ones = fluid.layers.fill_constant_batch_size_like( label_ones = fluid.layers.fill_constant_batch_size_like(
label, shape=[-1, 1], value=1.0, dtype='float32') label, shape=[-1, 1], value=1.0, dtype='float32')
right_cnt = fluid.layers.reduce_sum( right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
input=fluid.layers.cast(fluid.layers.equal(pred_idx, label), dtype='float32')) fluid.layers.equal(pred_idx, label), dtype='float32'))
total_cnt = fluid.layers.reduce_sum(label_ones) total_cnt = fluid.layers.reduce_sum(label_ones)
global_right_cnt = fluid.layers.create_global_var( global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt", persistable=True, dtype='float32', shape=[1], value=0) name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var( global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt", persistable=True, dtype='float32', shape=[1], value=0) name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True global_total_cnt.stop_gradient = True
......
...@@ -35,6 +35,3 @@ wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.ta ...@@ -35,6 +35,3 @@ wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.ta
tar xzvf test_dir.tar -C raw_data tar xzvf test_dir.tar -C raw_data
mv raw_data/data/test_dir test_data/ mv raw_data/data/test_dir test_data/
rm -rf raw_data rm -rf raw_data
...@@ -49,8 +49,7 @@ def parse_args(): ...@@ -49,8 +49,7 @@ def parse_args():
'--file_nums', '--file_nums',
type=int, type=int,
default=1024, default=1024,
help="re-split input corpus file nums" help="re-split input corpus file nums")
)
parser.add_argument( parser.add_argument(
'--downsample', '--downsample',
type=float, type=float,
...@@ -137,9 +136,11 @@ def filter_corpus(args): ...@@ -137,9 +136,11 @@ def filter_corpus(args):
if not os.path.exists(args.output_corpus_dir): if not os.path.exists(args.output_corpus_dir):
os.makedirs(args.output_corpus_dir) os.makedirs(args.output_corpus_dir)
for file in os.listdir(args.input_corpus_dir): for file in os.listdir(args.input_corpus_dir):
with io.open(args.output_corpus_dir + '/convert_' + file + '.csv', "w") as wf: with io.open(args.output_corpus_dir + '/convert_' + file + '.csv',
"w") as wf:
with io.open( with io.open(
args.input_corpus_dir + '/' + file, encoding='utf-8') as rf: args.input_corpus_dir + '/' + file,
encoding='utf-8') as rf:
print(args.input_corpus_dir + '/' + file) print(args.input_corpus_dir + '/' + file)
for line in rf: for line in rf:
signal = False signal = False
...@@ -154,9 +155,9 @@ def filter_corpus(args): ...@@ -154,9 +155,9 @@ def filter_corpus(args):
count_w = id_counts[idx] count_w = id_counts[idx]
corpus_size = word_all_count corpus_size = word_all_count
keep_prob = ( keep_prob = (
math.sqrt(count_w / math.sqrt(count_w /
(args.downsample * corpus_size)) + 1 (args.downsample * corpus_size)) + 1
) * (args.downsample * corpus_size) / count_w ) * (args.downsample * corpus_size) / count_w
r_value = random.random() r_value = random.random()
if r_value > keep_prob: if r_value > keep_prob:
continue continue
...@@ -182,7 +183,8 @@ def build_dict(args): ...@@ -182,7 +183,8 @@ def build_dict(args):
for file in os.listdir(args.build_dict_corpus_dir): for file in os.listdir(args.build_dict_corpus_dir):
with io.open( with io.open(
args.build_dict_corpus_dir + "/" + file, encoding='utf-8') as f: args.build_dict_corpus_dir + "/" + file,
encoding='utf-8') as f:
print("build dict : ", args.build_dict_corpus_dir + "/" + file) print("build dict : ", args.build_dict_corpus_dir + "/" + file)
for line in f: for line in f:
line = text_strip(line) line = text_strip(line)
...@@ -232,7 +234,8 @@ def data_split(args): ...@@ -232,7 +234,8 @@ def data_split(args):
for i in range(1, num + 1): for i in range(1, num + 1):
with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout: with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout:
data = contents[(i - 1) * lines_per_file:min(i * lines_per_file, len(contents))] data = contents[(i - 1) * lines_per_file:min(i * lines_per_file,
len(contents))]
for line in data: for line in data:
fout.write(line) fout.write(line)
......
...@@ -22,7 +22,8 @@ from paddlerec.core.utils import envs ...@@ -22,7 +22,8 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader): class EvaluateReader(Reader):
def init(self): def init(self):
dict_path = envs.get_global_env("word_id_dict_path", None, "evaluate.reader") dict_path = envs.get_global_env("word_id_dict_path", None,
"evaluate.reader")
self.word_to_id = dict() self.word_to_id = dict()
self.id_to_word = dict() self.id_to_word = dict()
with io.open(dict_path, 'r', encoding='utf-8') as f: with io.open(dict_path, 'r', encoding='utf-8') as f:
...@@ -68,14 +69,17 @@ class EvaluateReader(Reader): ...@@ -68,14 +69,17 @@ class EvaluateReader(Reader):
a unicode string - a space-delimited sequence of words. a unicode string - a space-delimited sequence of words.
""" """
return u" ".join([ return u" ".join([
word if word in original_vocab else u"<UNK>" for word in line.split() word if word in original_vocab else u"<UNK>"
for word in line.split()
]) ])
def generate_sample(self, line): def generate_sample(self, line):
def reader(): def reader():
features = self.strip_lines(line.lower(), self.word_to_id) features = self.strip_lines(line.lower(), self.word_to_id)
features = features.split() features = features.split()
yield [('analogy_a', [self.word_to_id[features[0]]]), ('analogy_b', [self.word_to_id[features[1]]]), yield [('analogy_a', [self.word_to_id[features[0]]]),
('analogy_c', [self.word_to_id[features[2]]]), ('analogy_d', [self.word_to_id[features[3]]])] ('analogy_b', [self.word_to_id[features[1]]]),
('analogy_c', [self.word_to_id[features[2]]]),
('analogy_d', [self.word_to_id[features[3]]])]
return reader return reader
...@@ -40,10 +40,14 @@ class NumpyRandomInt(object): ...@@ -40,10 +40,14 @@ class NumpyRandomInt(object):
class TrainReader(Reader): class TrainReader(Reader):
def init(self): def init(self):
dict_path = envs.get_global_env("word_count_dict_path", None, "train.reader") dict_path = envs.get_global_env("word_count_dict_path", None,
self.window_size = envs.get_global_env("hyper_parameters.window_size", None, "train.model") "train.reader")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None, "train.model") self.window_size = envs.get_global_env("hyper_parameters.window_size",
self.with_shuffle_batch = envs.get_global_env("hyper_parameters.with_shuffle_batch", None, "train.model") None, "train.model")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None,
"train.model")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch", None, "train.model")
self.random_generator = NumpyRandomInt(1, self.window_size + 1) self.random_generator = NumpyRandomInt(1, self.window_size + 1)
self.cs = None self.cs = None
...@@ -81,13 +85,15 @@ class TrainReader(Reader): ...@@ -81,13 +85,15 @@ class TrainReader(Reader):
def reader(): def reader():
word_ids = [w for w in line.split()] word_ids = [w for w in line.split()]
for idx, target_id in enumerate(word_ids): for idx, target_id in enumerate(word_ids):
context_word_ids = self.get_context_words( context_word_ids = self.get_context_words(word_ids, idx)
word_ids, idx)
for context_id in context_word_ids: for context_id in context_word_ids:
output = [('input_word', [int(target_id)]), ('true_label', [int(context_id)])] output = [('input_word', [int(target_id)]),
('true_label', [int(context_id)])]
if not self.with_shuffle_batch: if not self.with_shuffle_batch:
neg_array = self.cs.searchsorted(np.random.sample(self.neg_num)) neg_array = self.cs.searchsorted(
output += [('neg_label', [int(str(i)) for i in neg_array])] np.random.sample(self.neg_num))
output += [('neg_label',
[int(str(i)) for i in neg_array])]
yield output yield output
return reader return reader
...@@ -25,14 +25,20 @@ class Model(ModelBase): ...@@ -25,14 +25,20 @@ class Model(ModelBase):
ModelBase.__init__(self, config) ModelBase.__init__(self, config)
def input_data(self, is_infer=False): def input_data(self, is_infer=False):
watch_vec_size = envs.get_global_env("hyper_parameters.watch_vec_size", None, self._namespace) watch_vec_size = envs.get_global_env("hyper_parameters.watch_vec_size",
search_vec_size = envs.get_global_env("hyper_parameters.search_vec_size", None, self._namespace) None, self._namespace)
other_feat_size = envs.get_global_env("hyper_parameters.other_feat_size", None, self._namespace) search_vec_size = envs.get_global_env(
"hyper_parameters.search_vec_size", None, self._namespace)
watch_vec = fluid.data(name="watch_vec", shape=[None, watch_vec_size], dtype="float32") other_feat_size = envs.get_global_env(
search_vec = fluid.data(name="search_vec", shape=[None, search_vec_size], dtype="float32") "hyper_parameters.other_feat_size", None, self._namespace)
other_feat = fluid.data(name="other_feat", shape=[None, other_feat_size], dtype="float32")
watch_vec = fluid.data(
name="watch_vec", shape=[None, watch_vec_size], dtype="float32")
search_vec = fluid.data(
name="search_vec", shape=[None, search_vec_size], dtype="float32")
other_feat = fluid.data(
name="other_feat", shape=[None, other_feat_size], dtype="float32")
label = fluid.data(name="label", shape=[None, 1], dtype="int64") label = fluid.data(name="label", shape=[None, 1], dtype="int64")
inputs = [watch_vec] + [search_vec] + [other_feat] + [label] inputs = [watch_vec] + [search_vec] + [other_feat] + [label]
self._data_var = inputs self._data_var = inputs
...@@ -41,27 +47,32 @@ class Model(ModelBase): ...@@ -41,27 +47,32 @@ class Model(ModelBase):
def fc(self, tag, data, out_dim, active='relu'): def fc(self, tag, data, out_dim, active='relu'):
init_stddev = 1.0 init_stddev = 1.0
scales = 1.0 / np.sqrt(data.shape[1]) scales = 1.0 / np.sqrt(data.shape[1])
if tag == 'l4': if tag == 'l4':
p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag, p_attr = fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=init_stddev * scales)) name='%s_weight' % tag,
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=init_stddev * scales))
else: else:
p_attr = None p_attr = None
b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1)) b_attr = fluid.ParamAttr(
name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
out = fluid.layers.fc(input=data, out = fluid.layers.fc(input=data,
size=out_dim, size=out_dim,
act=active, act=active,
param_attr=p_attr, param_attr=p_attr,
bias_attr =b_attr, bias_attr=b_attr,
name=tag) name=tag)
return out return out
def net(self, inputs): def net(self, inputs):
output_size = envs.get_global_env("hyper_parameters.output_size", None, self._namespace) output_size = envs.get_global_env("hyper_parameters.output_size", None,
layers = envs.get_global_env("hyper_parameters.layers", None, self._namespace) self._namespace)
layers = envs.get_global_env("hyper_parameters.layers", None,
self._namespace)
concat_feats = fluid.layers.concat(input=inputs[:-1], axis=-1) concat_feats = fluid.layers.concat(input=inputs[:-1], axis=-1)
l1 = self.fc('l1', concat_feats, layers[0], 'relu') l1 = self.fc('l1', concat_feats, layers[0], 'relu')
......
...@@ -21,10 +21,14 @@ import numpy as np ...@@ -21,10 +21,14 @@ import numpy as np
class TrainReader(Reader): class TrainReader(Reader):
def init(self): def init(self):
self.watch_vec_size = envs.get_global_env("hyper_parameters.watch_vec_size", None, "train.model") self.watch_vec_size = envs.get_global_env(
self.search_vec_size = envs.get_global_env("hyper_parameters.search_vec_size", None, "train.model") "hyper_parameters.watch_vec_size", None, "train.model")
self.other_feat_size = envs.get_global_env("hyper_parameters.other_feat_size", None, "train.model") self.search_vec_size = envs.get_global_env(
self.output_size = envs.get_global_env("hyper_parameters.output_size", None, "train.model") "hyper_parameters.search_vec_size", None, "train.model")
self.other_feat_size = envs.get_global_env(
"hyper_parameters.other_feat_size", None, "train.model")
self.output_size = envs.get_global_env("hyper_parameters.output_size",
None, "train.model")
def generate_sample(self, line): def generate_sample(self, line):
""" """
...@@ -35,13 +39,12 @@ class TrainReader(Reader): ...@@ -35,13 +39,12 @@ class TrainReader(Reader):
""" """
This function needs to be implemented by the user, based on data format This function needs to be implemented by the user, based on data format
""" """
feature_name = ["watch_vec", "search_vec", "other_feat", "label"] feature_name = ["watch_vec", "search_vec", "other_feat", "label"]
yield zip(feature_name, [np.random.rand(self.watch_vec_size).tolist()] + yield zip(feature_name,
[np.random.rand(self.search_vec_size).tolist()] + [np.random.rand(self.watch_vec_size).tolist()] +
[np.random.rand(self.other_feat_size).tolist()] + [np.random.rand(self.search_vec_size).tolist()] +
[[np.random.randint(self.output_size)]] ) [np.random.rand(self.other_feat_size).tolist()] +
[[np.random.randint(self.output_size)]])
return reader return reader
...@@ -24,4 +24,4 @@ TDM是为大规模推荐系统设计的、能承载任意先进模型来高效 ...@@ -24,4 +24,4 @@ TDM是为大规模推荐系统设计的、能承载任意先进模型来高效
- 如何组网?答:paddle封装了大量的深度学习OP,用户可以根据需求设计自己的网络结构。 - 如何组网?答:paddle封装了大量的深度学习OP,用户可以根据需求设计自己的网络结构。
- 训练数据如何组织?答:tdm的训练数据主要为:`user/query emb``item`的正样本,`item`需要映射到树的某个叶子节点。用户只需准备符合该构成的数据即可。负样本的生成,会基于用户提供的树结构,以及paddle提供的`tdm-sampler op`完成高效的负采样,并自动添加相应的label,参与tdm中深度学习模型的训练。 - 训练数据如何组织?答:tdm的训练数据主要为:`user/query emb``item`的正样本,`item`需要映射到树的某个叶子节点。用户只需准备符合该构成的数据即可。负样本的生成,会基于用户提供的树结构,以及paddle提供的`tdm-sampler op`完成高效的负采样,并自动添加相应的label,参与tdm中深度学习模型的训练。
- 大规模的数据与模型训练如何实现?答:基于paddle优秀的大规模参数服务器分布式能力,可以实现高效的分布式训练。基于paddle-fleet api,学习门槛极低,且可以灵活的支持增量训练,流式训练等业务需求。 - 大规模的数据与模型训练如何实现?答:基于paddle优秀的大规模参数服务器分布式能力,可以实现高效的分布式训练。基于paddle-fleet api,学习门槛极低,且可以灵活的支持增量训练,流式训练等业务需求。
3. 训练好模型后,可以基于paddle,将检索与打分等流程都融入paddle的组网中,生成inference_model与参数文件,基于PaddlePaddle的预测库或者PaddleLite进行快速部署与高效检索。 3. 训练好模型后,可以基于paddle,将检索与打分等流程都融入paddle的组网中,生成inference_model与参数文件,基于PaddlePaddle的预测库或者PaddleLite进行快速部署与高效检索。
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -25,38 +25,38 @@ class Model(ModelBase): ...@@ -25,38 +25,38 @@ class Model(ModelBase):
def __init__(self, config): def __init__(self, config):
ModelBase.__init__(self, config) ModelBase.__init__(self, config)
# tree meta hyper parameters # tree meta hyper parameters
self.max_layers = envs.get_global_env( self.max_layers = envs.get_global_env("tree_parameters.max_layers", 4,
"tree_parameters.max_layers", 4, self._namespace) self._namespace)
self.node_nums = envs.get_global_env( self.node_nums = envs.get_global_env("tree_parameters.node_nums", 26,
"tree_parameters.node_nums", 26, self._namespace) self._namespace)
self.leaf_node_nums = envs.get_global_env( self.leaf_node_nums = envs.get_global_env(
"tree_parameters.leaf_node_nums", 13, self._namespace) "tree_parameters.leaf_node_nums", 13, self._namespace)
self.output_positive = envs.get_global_env( self.output_positive = envs.get_global_env(
"tree_parameters.output_positive", True, self._namespace) "tree_parameters.output_positive", True, self._namespace)
self.layer_node_num_list = envs.get_global_env( self.layer_node_num_list = envs.get_global_env(
"tree_parameters.layer_node_num_list", [ "tree_parameters.layer_node_num_list", [2, 4, 7,
2, 4, 7, 12], self._namespace) 12], self._namespace)
self.child_nums = envs.get_global_env( self.child_nums = envs.get_global_env("tree_parameters.child_nums", 2,
"tree_parameters.child_nums", 2, self._namespace) self._namespace)
self.tree_layer_path = envs.get_global_env( self.tree_layer_path = envs.get_global_env("tree.tree_layer_path",
"tree.tree_layer_path", None, "train.startup") None, "train.startup")
# model training hyper parameter # model training hyper parameter
self.node_emb_size = envs.get_global_env( self.node_emb_size = envs.get_global_env(
"hyper_parameters.node_emb_size", 64, self._namespace) "hyper_parameters.node_emb_size", 64, self._namespace)
self.input_emb_size = envs.get_global_env( self.input_emb_size = envs.get_global_env(
"hyper_parameters.input_emb_size", 768, self._namespace) "hyper_parameters.input_emb_size", 768, self._namespace)
self.act = envs.get_global_env( self.act = envs.get_global_env("hyper_parameters.act", "tanh",
"hyper_parameters.act", "tanh", self._namespace) self._namespace)
self.neg_sampling_list = envs.get_global_env( self.neg_sampling_list = envs.get_global_env(
"hyper_parameters.neg_sampling_list", [ "hyper_parameters.neg_sampling_list", [1, 2, 3,
1, 2, 3, 4], self._namespace) 4], self._namespace)
# model infer hyper parameter # model infer hyper parameter
self.topK = envs.get_global_env( self.topK = envs.get_global_env("hyper_parameters.node_nums", 1,
"hyper_parameters.node_nums", 1, self._namespace) self._namespace)
self.batch_size = envs.get_global_env( self.batch_size = envs.get_global_env("batch_size", 1,
"batch_size", 1, "evaluate.reader") "evaluate.reader")
def train_net(self): def train_net(self):
self.train_input() self.train_input()
...@@ -76,21 +76,22 @@ class Model(ModelBase): ...@@ -76,21 +76,22 @@ class Model(ModelBase):
input_emb = fluid.data( input_emb = fluid.data(
name="input_emb", name="input_emb",
shape=[None, self.input_emb_size], shape=[None, self.input_emb_size],
dtype="float32", dtype="float32", )
)
self._data_var.append(input_emb) self._data_var.append(input_emb)
item_label = fluid.data( item_label = fluid.data(
name="item_label", name="item_label",
shape=[None, 1], shape=[None, 1],
dtype="int64", dtype="int64", )
)
self._data_var.append(item_label) self._data_var.append(item_label)
if self._platform != "LINUX": if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator( self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def tdm_net(self): def tdm_net(self):
""" """
...@@ -116,8 +117,7 @@ class Model(ModelBase): ...@@ -116,8 +117,7 @@ class Model(ModelBase):
output_list=True, output_list=True,
seed=0, seed=0,
tree_dtype='int64', tree_dtype='int64',
dtype='int64' dtype='int64')
)
# 查表得到每个节点的Embedding # 查表得到每个节点的Embedding
sample_nodes_emb = [ sample_nodes_emb = [
...@@ -125,35 +125,34 @@ class Model(ModelBase): ...@@ -125,35 +125,34 @@ class Model(ModelBase):
input=sample_nodes[i], input=sample_nodes[i],
is_sparse=True, is_sparse=True,
size=[self.node_nums, self.node_emb_size], size=[self.node_nums, self.node_emb_size],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(name="TDM_Tree_Emb"))
name="TDM_Tree_Emb") for i in range(self.max_layers)
) for i in range(self.max_layers)
] ]
# 此处进行Reshape是为了之后层次化的分类器训练 # 此处进行Reshape是为了之后层次化的分类器训练
sample_nodes_emb = [ sample_nodes_emb = [
fluid.layers.reshape(sample_nodes_emb[i], fluid.layers.reshape(sample_nodes_emb[i], [
[-1, self.neg_sampling_list[i] + -1, self.neg_sampling_list[i] + self.output_positive,
self.output_positive, self.node_emb_size] self.node_emb_size
) for i in range(self.max_layers) ]) for i in range(self.max_layers)
] ]
# 对输入的input_emb进行转换,使其维度与node_emb维度一致 # 对输入的input_emb进行转换,使其维度与node_emb维度一致
input_trans_emb = self.input_trans_layer(input_emb) input_trans_emb = self.input_trans_layer(input_emb)
# 分类器的主体网络,分别训练不同层次的分类器 # 分类器的主体网络,分别训练不同层次的分类器
layer_classifier_res = self.classifier_layer( layer_classifier_res = self.classifier_layer(input_trans_emb,
input_trans_emb, sample_nodes_emb) sample_nodes_emb)
# 最后的概率判别FC,将所有层次的node分类结果放到一起以相同的标准进行判别 # 最后的概率判别FC,将所有层次的node分类结果放到一起以相同的标准进行判别
# 考虑到树极大可能不平衡,有些item不在最后一层,所以需要这样的机制保证每个item都有机会被召回 # 考虑到树极大可能不平衡,有些item不在最后一层,所以需要这样的机制保证每个item都有机会被召回
tdm_fc = fluid.layers.fc(input=layer_classifier_res, tdm_fc = fluid.layers.fc(
size=2, input=layer_classifier_res,
act=None, size=2,
num_flatten_dims=2, act=None,
param_attr=fluid.ParamAttr( num_flatten_dims=2,
name="tdm.cls_fc.weight"), param_attr=fluid.ParamAttr(name="tdm.cls_fc.weight"),
bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias")) bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
# 将loss打平,放到一起计算整体网络的loss # 将loss打平,放到一起计算整体网络的loss
tdm_fc_re = fluid.layers.reshape(tdm_fc, [-1, 2]) tdm_fc_re = fluid.layers.reshape(tdm_fc, [-1, 2])
...@@ -202,7 +201,7 @@ class Model(ModelBase): ...@@ -202,7 +201,7 @@ class Model(ModelBase):
def metrics(self): def metrics(self):
auc, batch_auc, _ = fluid.layers.auc(input=self._predict, auc, batch_auc, _ = fluid.layers.auc(input=self._predict,
label=self.mask_label, label=self.mask_label,
num_thresholds=2 ** 12, num_thresholds=2**12,
slide_steps=20) slide_steps=20)
self._metrics["AUC"] = auc self._metrics["AUC"] = auc
self._metrics["BATCH_AUC"] = batch_auc self._metrics["BATCH_AUC"] = batch_auc
...@@ -218,8 +217,7 @@ class Model(ModelBase): ...@@ -218,8 +217,7 @@ class Model(ModelBase):
size=self.node_emb_size, size=self.node_emb_size,
act=None, act=None,
param_attr=fluid.ParamAttr(name="trans.input_fc.weight"), param_attr=fluid.ParamAttr(name="trans.input_fc.weight"),
bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), )
)
# 将input_emb映射到各个不同层次的向量表示空间 # 将input_emb映射到各个不同层次的向量表示空间
input_layer_fc_out = [ input_layer_fc_out = [
...@@ -229,8 +227,9 @@ class Model(ModelBase): ...@@ -229,8 +227,9 @@ class Model(ModelBase):
act=self.act, act=self.act,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name="trans.layer_fc.weight." + str(i)), name="trans.layer_fc.weight." + str(i)),
bias_attr=fluid.ParamAttr(name="trans.layer_fc.bias." + str(i)), bias_attr=fluid.ParamAttr(
) for i in range(self.max_layers) name="trans.layer_fc.bias." + str(i)), )
for i in range(self.max_layers)
] ]
return input_layer_fc_out return input_layer_fc_out
...@@ -246,20 +245,22 @@ class Model(ModelBase): ...@@ -246,20 +245,22 @@ class Model(ModelBase):
input_layer_unsequeeze, expand_times=[1, node.shape[1], 1]) input_layer_unsequeeze, expand_times=[1, node.shape[1], 1])
else: else:
input_layer_expand = fluid.layers.expand( input_layer_expand = fluid.layers.expand(
input_layer_unsequeeze, expand_times=[1, node[layer_idx].shape[1], 1]) input_layer_unsequeeze,
expand_times=[1, node[layer_idx].shape[1], 1])
return input_layer_expand return input_layer_expand
def classifier_layer(self, input, node): def classifier_layer(self, input, node):
# 扩展input,使维度与node匹配 # 扩展input,使维度与node匹配
input_expand = [ input_expand = [
self._expand_layer(input[i], node, i) for i in range(self.max_layers) self._expand_layer(input[i], node, i)
for i in range(self.max_layers)
] ]
# 将input_emb与node_emb concat到一起过分类器FC # 将input_emb与node_emb concat到一起过分类器FC
input_node_concat = [ input_node_concat = [
fluid.layers.concat( fluid.layers.concat(
input=[input_expand[i], node[i]], input=[input_expand[i], node[i]], axis=2)
axis=2) for i in range(self.max_layers) for i in range(self.max_layers)
] ]
hidden_states_fc = [ hidden_states_fc = [
fluid.layers.fc( fluid.layers.fc(
...@@ -269,8 +270,8 @@ class Model(ModelBase): ...@@ -269,8 +270,8 @@ class Model(ModelBase):
act=self.act, act=self.act,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name="cls.concat_fc.weight." + str(i)), name="cls.concat_fc.weight." + str(i)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i)) bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i)))
) for i in range(self.max_layers) for i in range(self.max_layers)
] ]
# 如果将所有层次的node放到一起计算loss,则需要在此处concat # 如果将所有层次的node放到一起计算loss,则需要在此处concat
...@@ -285,12 +286,14 @@ class Model(ModelBase): ...@@ -285,12 +286,14 @@ class Model(ModelBase):
input_emb = fluid.layers.data( input_emb = fluid.layers.data(
name="input_emb", name="input_emb",
shape=[self.input_emb_size], shape=[self.input_emb_size],
dtype="float32", dtype="float32", )
)
self._infer_data_var.append(input_emb) self._infer_data_var.append(input_emb)
self._infer_data_loader = fluid.io.DataLoader.from_generator( self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def get_layer_list(self): def get_layer_list(self):
"""get layer list from layer_list.txt""" """get layer list from layer_list.txt"""
...@@ -318,10 +321,12 @@ class Model(ModelBase): ...@@ -318,10 +321,12 @@ class Model(ModelBase):
node_list = [] node_list = []
mask_list = [] mask_list = []
for id in first_layer_node: for id in first_layer_node:
node_list.append(fluid.layers.fill_constant( node_list.append(
[self.batch_size, 1], value=int(id), dtype='int64')) fluid.layers.fill_constant(
mask_list.append(fluid.layers.fill_constant( [self.batch_size, 1], value=int(id), dtype='int64'))
[self.batch_size, 1], value=0, dtype='int64')) mask_list.append(
fluid.layers.fill_constant(
[self.batch_size, 1], value=0, dtype='int64'))
self.first_layer_node = fluid.layers.concat(node_list, axis=1) self.first_layer_node = fluid.layers.concat(node_list, axis=1)
self.first_layer_node_mask = fluid.layers.concat(mask_list, axis=1) self.first_layer_node_mask = fluid.layers.concat(mask_list, axis=1)
...@@ -359,28 +364,26 @@ class Model(ModelBase): ...@@ -359,28 +364,26 @@ class Model(ModelBase):
size=[self.node_nums, self.node_emb_size], size=[self.node_nums, self.node_emb_size],
param_attr=fluid.ParamAttr(name="TDM_Tree_Emb")) param_attr=fluid.ParamAttr(name="TDM_Tree_Emb"))
input_fc_out = self.layer_fc_infer( input_fc_out = self.layer_fc_infer(input_trans_emb, layer_idx)
input_trans_emb, layer_idx)
# 过每一层的分类器 # 过每一层的分类器
layer_classifier_res = self.classifier_layer_infer(input_fc_out, layer_classifier_res = self.classifier_layer_infer(
node_emb, input_fc_out, node_emb, layer_idx)
layer_idx)
# 过最终的判别分类器 # 过最终的判别分类器
tdm_fc = fluid.layers.fc(input=layer_classifier_res, tdm_fc = fluid.layers.fc(
size=2, input=layer_classifier_res,
act=None, size=2,
num_flatten_dims=2, act=None,
param_attr=fluid.ParamAttr( num_flatten_dims=2,
name="tdm.cls_fc.weight"), param_attr=fluid.ParamAttr(name="tdm.cls_fc.weight"),
bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias")) bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
prob = fluid.layers.softmax(tdm_fc) prob = fluid.layers.softmax(tdm_fc)
positive_prob = fluid.layers.slice( positive_prob = fluid.layers.slice(
prob, axes=[2], starts=[1], ends=[2]) prob, axes=[2], starts=[1], ends=[2])
prob_re = fluid.layers.reshape( prob_re = fluid.layers.reshape(positive_prob,
positive_prob, [-1, current_layer_node_num]) [-1, current_layer_node_num])
# 过滤掉padding产生的无效节点(node_id=0) # 过滤掉padding产生的无效节点(node_id=0)
node_zero_mask = fluid.layers.cast(current_layer_node, 'bool') node_zero_mask = fluid.layers.cast(current_layer_node, 'bool')
...@@ -395,11 +398,11 @@ class Model(ModelBase): ...@@ -395,11 +398,11 @@ class Model(ModelBase):
# index_sample op根据下标索引tensor对应位置的值 # index_sample op根据下标索引tensor对应位置的值
# 若paddle版本>2.0,调用方式为paddle.index_sample # 若paddle版本>2.0,调用方式为paddle.index_sample
top_node = fluid.contrib.layers.index_sample( top_node = fluid.contrib.layers.index_sample(current_layer_node,
current_layer_node, topk_i) topk_i)
prob_re_mask = prob_re * current_layer_node_mask # 过滤掉非叶子节点 prob_re_mask = prob_re * current_layer_node_mask # 过滤掉非叶子节点
topk_value = fluid.contrib.layers.index_sample( topk_value = fluid.contrib.layers.index_sample(prob_re_mask,
prob_re_mask, topk_i) topk_i)
node_score.append(topk_value) node_score.append(topk_value)
node_list.append(top_node) node_list.append(top_node)
...@@ -424,7 +427,8 @@ class Model(ModelBase): ...@@ -424,7 +427,8 @@ class Model(ModelBase):
res_node = fluid.layers.reshape(res_layer_node, [-1, self.topK, 1]) res_node = fluid.layers.reshape(res_layer_node, [-1, self.topK, 1])
# 利用Tree_info信息,将node_id转换为item_id # 利用Tree_info信息,将node_id转换为item_id
tree_info = fluid.default_main_program().global_block().var("TDM_Tree_Info") tree_info = fluid.default_main_program().global_block().var(
"TDM_Tree_Info")
res_node_emb = fluid.layers.gather_nd(tree_info, res_node) res_node_emb = fluid.layers.gather_nd(tree_info, res_node)
res_item = fluid.layers.slice( res_item = fluid.layers.slice(
...@@ -442,8 +446,7 @@ class Model(ModelBase): ...@@ -442,8 +446,7 @@ class Model(ModelBase):
size=self.node_emb_size, size=self.node_emb_size,
act=None, act=None,
param_attr=fluid.ParamAttr(name="trans.input_fc.weight"), param_attr=fluid.ParamAttr(name="trans.input_fc.weight"),
bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), )
)
return input_fc_out return input_fc_out
def layer_fc_infer(self, input_fc_out, layer_idx): def layer_fc_infer(self, input_fc_out, layer_idx):
...@@ -458,8 +461,7 @@ class Model(ModelBase): ...@@ -458,8 +461,7 @@ class Model(ModelBase):
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name="trans.layer_fc.weight." + str(layer_idx)), name="trans.layer_fc.weight." + str(layer_idx)),
bias_attr=fluid.ParamAttr( bias_attr=fluid.ParamAttr(
name="trans.layer_fc.bias." + str(layer_idx)), name="trans.layer_fc.bias." + str(layer_idx)), )
)
return input_layer_fc_out return input_layer_fc_out
def classifier_layer_infer(self, input, node, layer_idx): def classifier_layer_infer(self, input, node, layer_idx):
...@@ -480,5 +482,6 @@ class Model(ModelBase): ...@@ -480,5 +482,6 @@ class Model(ModelBase):
act=self.act, act=self.act,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name="cls.concat_fc.weight." + str(layer_idx)), name="cls.concat_fc.weight." + str(layer_idx)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(layer_idx))) bias_attr=fluid.ParamAttr(
name="cls.concat_fc.bias." + str(layer_idx)))
return hidden_states_fc return hidden_states_fc
1,2 1,2
3,4,5,6 3,4,5,6
7,8,9,10,11,12,13 7,8,9,10,11,12,13
14,15,16,17,18,19,20,21,22,23,24,25 14,15,16,17,18,19,20,21,22,23,24,25
\ No newline at end of file
...@@ -26,8 +26,10 @@ from paddlerec.core.utils import util ...@@ -26,8 +26,10 @@ from paddlerec.core.utils import util
engines = {} engines = {}
device = ["CPU", "GPU"] device = ["CPU", "GPU"]
clusters = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER"] clusters = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER"]
engine_choices = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER", engine_choices = [
"TDM_SINGLE", "TDM_LOCAL_CLUSTER", "TDM_CLUSTER"] "SINGLE", "LOCAL_CLUSTER", "CLUSTER", "TDM_SINGLE", "TDM_LOCAL_CLUSTER",
"TDM_CLUSTER"
]
custom_model = ['TDM'] custom_model = ['TDM']
model_name = "" model_name = ""
...@@ -66,7 +68,8 @@ def get_engine(args): ...@@ -66,7 +68,8 @@ def get_engine(args):
engine = engine.upper() engine = engine.upper()
if engine not in engine_choices: if engine not in engine_choices:
raise ValueError("train.engin can not be chosen in {}".format(engine_choices)) raise ValueError("train.engin can not be chosen in {}".format(
engine_choices))
print("engines: \n{}".format(engines)) print("engines: \n{}".format(engines))
...@@ -77,8 +80,10 @@ def get_engine(args): ...@@ -77,8 +80,10 @@ def get_engine(args):
def get_transpiler(): def get_transpiler():
FNULL = open(os.devnull, 'w') FNULL = open(os.devnull, 'w')
cmd = ["python", "-c", cmd = [
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"] "python", "-c",
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"
]
proc = subprocess.Popen(cmd, stdout=FNULL, stderr=FNULL, cwd=os.getcwd()) proc = subprocess.Popen(cmd, stdout=FNULL, stderr=FNULL, cwd=os.getcwd())
ret = proc.wait() ret = proc.wait()
if ret == -11: if ret == -11:
...@@ -152,7 +157,8 @@ def cluster_engine(args): ...@@ -152,7 +157,8 @@ def cluster_engine(args):
update_workspace(flattens) update_workspace(flattens)
envs.set_runtime_environs(flattens) envs.set_runtime_environs(flattens)
print(envs.pretty_print_envs(flattens, ("Submit Runtime Envs", "Value"))) print(envs.pretty_print_envs(flattens, ("Submit Runtime Envs", "Value"
)))
launch = ClusterEngine(None, args.model) launch = ClusterEngine(None, args.model)
return launch return launch
...@@ -163,7 +169,8 @@ def cluster_engine(args): ...@@ -163,7 +169,8 @@ def cluster_engine(args):
cluster_envs = {} cluster_envs = {}
cluster_envs["train.trainer.trainer"] = trainer cluster_envs["train.trainer.trainer"] = trainer
cluster_envs["train.trainer.engine"] = "cluster" cluster_envs["train.trainer.engine"] = "cluster"
cluster_envs["train.trainer.threads"] = envs.get_runtime_environ("CPU_NUM") cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
"CPU_NUM")
cluster_envs["train.trainer.platform"] = envs.get_platform() cluster_envs["train.trainer.platform"] = envs.get_platform()
print("launch {} engine with cluster to with model: {}".format( print("launch {} engine with cluster to with model: {}".format(
trainer, args.model)) trainer, args.model))
...@@ -181,7 +188,8 @@ def cluster_engine(args): ...@@ -181,7 +188,8 @@ def cluster_engine(args):
def cluster_mpi_engine(args): def cluster_mpi_engine(args):
print("launch cluster engine with cluster to run model: {}".format(args.model)) print("launch cluster engine with cluster to run model: {}".format(
args.model))
cluster_envs = {} cluster_envs = {}
cluster_envs["train.trainer.trainer"] = "CtrCodingTrainer" cluster_envs["train.trainer.trainer"] = "CtrCodingTrainer"
...@@ -209,7 +217,8 @@ def local_cluster_engine(args): ...@@ -209,7 +217,8 @@ def local_cluster_engine(args):
cluster_envs["train.trainer.platform"] = envs.get_platform() cluster_envs["train.trainer.platform"] = envs.get_platform()
cluster_envs["CPU_NUM"] = "2" cluster_envs["CPU_NUM"] = "2"
print("launch {} engine with cluster to run model: {}".format(trainer, args.model)) print("launch {} engine with cluster to run model: {}".format(trainer,
args.model))
set_runtime_envs(cluster_envs, args.model) set_runtime_envs(cluster_envs, args.model)
launch = LocalClusterEngine(cluster_envs, args.model) launch = LocalClusterEngine(cluster_envs, args.model)
...@@ -217,10 +226,12 @@ def local_cluster_engine(args): ...@@ -217,10 +226,12 @@ def local_cluster_engine(args):
def local_mpi_engine(args): def local_mpi_engine(args):
print("launch cluster engine with cluster to run model: {}".format(args.model)) print("launch cluster engine with cluster to run model: {}".format(
args.model))
from paddlerec.core.engine.local_mpi import LocalMPIEngine from paddlerec.core.engine.local_mpi import LocalMPIEngine
print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(args.model)) print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(
args.model))
mpi = util.run_which("mpirun") mpi = util.run_which("mpirun")
if not mpi: if not mpi:
......
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
setup for paddle-rec. setup for paddle-rec.
""" """
...@@ -22,11 +21,7 @@ from setuptools import setup, find_packages ...@@ -22,11 +21,7 @@ from setuptools import setup, find_packages
import shutil import shutil
import tempfile import tempfile
requires = ["paddlepaddle == 1.7.2", "pyyaml >= 5.1.1"]
requires = [
"paddlepaddle == 1.7.2",
"pyyaml >= 5.1.1"
]
about = {} about = {}
about["__title__"] = "paddle-rec" about["__title__"] = "paddle-rec"
...@@ -48,18 +43,27 @@ def build(dirname): ...@@ -48,18 +43,27 @@ def build(dirname):
package_dir = os.path.dirname(os.path.abspath(__file__)) package_dir = os.path.dirname(os.path.abspath(__file__))
run_cmd("cp -r {}/* {}".format(package_dir, dirname)) run_cmd("cp -r {}/* {}".format(package_dir, dirname))
run_cmd("mkdir {}".format(os.path.join(dirname, "paddlerec"))) run_cmd("mkdir {}".format(os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "core"), os.path.join(dirname, "paddlerec"))) run_cmd("mv {} {}".format(
run_cmd("mv {} {}".format(os.path.join(dirname, "doc"), os.path.join(dirname, "paddlerec"))) os.path.join(dirname, "core"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "models"), os.path.join(dirname, "paddlerec"))) run_cmd("mv {} {}".format(
run_cmd("mv {} {}".format(os.path.join(dirname, "tests"), os.path.join(dirname, "paddlerec"))) os.path.join(dirname, "doc"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "tools"), os.path.join(dirname, "paddlerec"))) run_cmd("mv {} {}".format(
run_cmd("mv {} {}".format(os.path.join(dirname, "*.py"), os.path.join(dirname, "paddlerec"))) os.path.join(dirname, "models"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "tests"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "tools"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "*.py"), os.path.join(dirname, "paddlerec")))
packages = find_packages(dirname, include=('paddlerec.*')) packages = find_packages(dirname, include=('paddlerec.*'))
package_dir = {'': dirname} package_dir = {'': dirname}
package_data = {} package_data = {}
models_copy = ['data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy', 'tree/*.txt'] models_copy = [
'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy',
'tree/*.txt'
]
engine_copy = ['*/*.sh'] engine_copy = ['*/*.sh']
for package in packages: for package in packages:
if package.startswith("paddlerec.models."): if package.startswith("paddlerec.models."):
...@@ -80,8 +84,7 @@ def build(dirname): ...@@ -80,8 +84,7 @@ def build(dirname):
package_data=package_data, package_data=package_data,
python_requires=">=2.7", python_requires=">=2.7",
install_requires=requires, install_requires=requires,
zip_safe=False zip_safe=False)
)
dirname = tempfile.mkdtemp() dirname = tempfile.mkdtemp()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/env bash
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#=================================================
# Utils
#=================================================
set -ex
# Set the global colour/format strings (stored as literal backslash sequences,
# so they must be printed with `echo -e` or `printf` to take effect) and set
# ROOT to the absolute path two directories above this script.
init() {
    RED='\033[0;31m'
    BLUE='\033[0;34m'
    BOLD='\033[1m'
    NONE='\033[0m'

    local script_dir
    script_dir="$(dirname "${BASH_SOURCE[0]}")"
    ROOT="$(cd "${script_dir}/../../" && pwd)"
}
# Install the pre-commit git hooks, then run every hook against all files.
# This function always terminates the shell: status 0 when all hooks pass,
# otherwise it prints `git diff` and exits with status 1.
check_style() {
    set -e

    export PATH="/usr/bin:$PATH"
    pre-commit install

    # A failing left-hand side of `&&` does not trigger `set -e`, so a failed
    # run falls through to the diff below.
    pre-commit run -a && exit 0

    git diff
    exit 1
}
# Entry point: run init, then dispatch on the first command-line argument.
#   check_style   run the style checks (check_style exits the shell itself)
#   anything else print "build failed" and exit with status 1
function main() {
    local CMD=$1
    init
    case $CMD in
        check_style)
            check_style
            ;;
        *)
            echo "build failed"
            exit 1
            ;;
    esac
    # Only reached if a dispatched command returns instead of exiting.
    echo "check_style finished as expected"
}

# "$@" must be quoted so each argument is forwarded verbatim, without word
# splitting or pathname expansion.
main "$@"
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import io, re
import sys, os
import subprocess
import platform
COPYRIGHT = '''
Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
LANG_COMMENT_MARK = None
NEW_LINE_MARK = None
COPYRIGHT_HEADER = None
if platform.system() == "Windows":
NEW_LINE_MARK = "\r\n"
else:
NEW_LINE_MARK = '\n'
COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
date, err = process.communicate()
date = date.decode("utf-8").rstrip("\n")
COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
def generate_copyright(template, lang='C'):
    """Render ``template`` as a block of line comments for ``lang``.

    The first output line is built from the module-level ``COPYRIGHT_HEADER``.
    The first two lines and the last line of the split template are skipped;
    every other line is prefixed with the comment mark, followed by a single
    space unless the line is empty.

    Args:
        template: Text that is split on the module-level ``NEW_LINE_MARK``.
        lang: ``'Python'`` selects ``#`` as the comment mark; any other value
            selects ``//``.

    Returns:
        The commented header, each line terminated by ``NEW_LINE_MARK``,
        followed by one extra ``"\\n"``.
    """
    comment_mark = '#' if lang == 'Python' else "//"
    template_lines = template.split(NEW_LINE_MARK)
    last_index = len(template_lines) - 1

    pieces = [comment_mark + " " + COPYRIGHT_HEADER + NEW_LINE_MARK]
    for index, text in enumerate(template_lines):
        if index in (0, 1, last_index):
            continue
        # Empty template lines become a bare comment mark (no trailing space).
        separator = " " if len(text) != 0 else ""
        pieces.append(comment_mark + separator + text + NEW_LINE_MARK)
    return "".join(pieces) + "\n"
def lang_type(filename):
    """Map a file name to the comment language used for its header.

    Args:
        filename: Name or path of the file; only its suffix is examined.

    Returns:
        ``"Python"`` for ``.py`` files, ``"C"`` for every suffix that takes
        ``//`` line comments (see ``c_style_suffixes`` below).

    Raises:
        SystemExit: With status 0, after printing a message, when the suffix
            is not recognised.
    """
    if filename.endswith(".py"):
        return "Python"

    # ``.cxx`` and ``.hxx`` are C++ suffixes and take the same ``//`` header
    # as the other C-family entries.
    c_style_suffixes = (".h", ".c", ".hpp", ".hxx", ".cc", ".cpp", ".cxx",
                        ".cu", ".cuh", ".go", ".proto")
    if filename.endswith(c_style_suffixes):
        return "C"

    # Interpolate the name explicitly; print() does not apply %-formatting to
    # its extra arguments.
    print("Unsupported filetype %s" % filename)
    sys.exit(0)
PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
def main(argv=None):
    """Insert a copyright header into every listed file that lacks one.

    A file is left untouched when its first line contains ``COPYRIGHT (C)``
    (case-insensitive), when its first line starts with ``#!``, or when its
    first or second line matches ``PYTHON_ENCODE``. Any other file is
    rewritten with the generated header prepended to its contents.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        1 if at least one file was rewritten, otherwise 0.
    """
    parser = argparse.ArgumentParser(
        description='Checker for copyright declaration.')
    parser.add_argument('filenames', nargs='*', help='Filenames to check')
    args = parser.parse_args(argv)

    retv = 0
    for filename in args.filenames:
        # The context manager closes the handle on every path, including the
        # early ``continue`` statements.
        with io.open(filename, encoding="utf-8") as fd:
            first_line = fd.readline()
            second_line = fd.readline()
            if "COPYRIGHT (C)" in first_line.upper():
                continue
            if (first_line.startswith("#!") or
                    PYTHON_ENCODE.match(second_line) is not None or
                    PYTHON_ENCODE.match(first_line) is not None):
                continue
            # Reuse the lines already consumed instead of reopening the file.
            original_contents = first_line + second_line + fd.read()

        new_contents = generate_copyright(
            COPYRIGHT, lang_type(filename)) + original_contents
        print('Auto Insert Copyright Header {}'.format(filename))
        retv = 1
        # Write back with the same encoding the file was read with rather
        # than the locale default.
        with io.open(filename, 'w', encoding="utf-8") as output_file:
            output_file.write(new_contents)

    return retv
if __name__ == '__main__':
    # Use sys.exit rather than the exit() helper: the latter is injected by
    # the site module for interactive use and is not guaranteed to exist.
    sys.exit(main())
#!/bin/bash
TOTAL_ERRORS=0

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
export PYTHONPATH=$DIR:$PYTHONPATH

# Run the docstring checks on every changed file that has not been deleted.
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
# `git diff --name-status` separates its fields with tabs, so awk splits on
# tabs and the loop reads one path per line; paths containing spaces stay
# intact. The list is fed on fd 3 so pylint's stdin is left untouched.
while IFS= read -r file <&3; do
    pylint --disable=all --load-plugins=docstring_checker \
        --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises "$file"
    TOTAL_ERRORS=$((TOTAL_ERRORS + $?))
done 3< <(git diff --name-status | awk -F'\t' '$1 != "D" {print $2}')

# Exit statuses are truncated to 8 bits, so an unclamped sum that is a
# multiple of 256 would be reported as success.
if [ "$TOTAL_ERRORS" -gt 255 ]; then
    exit 255
fi
exit "$TOTAL_ERRORS"
#For now, just warning:
#exit 0
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册