未验证 提交 633ecc2c 编写于 作者: D Dong Daxiang 提交者: GitHub

Merge pull request #9 from seiriosPlus/travis

Travis
repos:
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
sha: v1.0.1
hooks:
- id: remove-crlf
files: (?!.*third_party)^.*$ | (?!.*book)^.*$
- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
hooks:
- id: yapf
files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
- repo: https://github.com/pre-commit/pre-commit-hooks
sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
hooks:
- id: check-added-large-files
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
files: (?!.*third_party)^.*$ | (?!.*book)^.*$
- id: end-of-file-fixer
- repo: local
hooks:
- id: copyright_checker
name: copyright_checker
entry: python ./tools/codestyle/copyright.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
language: generic
sudo: required
dist: trusty
services:
- docker
os:
- linux
env:
- JOB=check_style
before_install:
# For pylint dockstring checker
- sudo pip install pylint pytest astroid isort pre-commit
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
- "travis_wait 30 sleep 1800 &"
- |
# 43min timeout
tools/build_script.sh ${JOB}
if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
notifications:
email:
on_success: change
on_failure: always
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -27,6 +27,7 @@ from paddlerec.core.utils import envs
class ClusterEngine(Engine):
def __init_impl__(self):
abs_dir = os.path.dirname(os.path.abspath(__file__))
backend = envs.get_runtime_environ("engine_backend")
if backend == "PaddleCloud":
self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
......@@ -57,4 +58,5 @@ class ClusterEngine(Engine):
self.start_worker_procs()
else:
raise ValueError("role {} error, must in MASTER/WORKER".format(role))
raise ValueError("role {} error, must in MASTER/WORKER".format(
role))
......@@ -46,10 +46,13 @@ class LocalClusterEngine(Engine):
ports.append(new_port)
break
user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
user_endpoints_ips = [x.split(":")[0]
for x in user_endpoints.split(",")]
user_endpoints_port = [x.split(":")[1]
for x in user_endpoints.split(",")]
user_endpoints_ips = [
x.split(":")[0] for x in user_endpoints.split(",")
]
user_endpoints_port = [
x.split(":")[1] for x in user_endpoints.split(",")
]
factory = "paddlerec.core.factory"
cmd = [sys.executable, "-u", "-m", factory, self.trainer]
......@@ -97,8 +100,10 @@ class LocalClusterEngine(Engine):
if len(log_fns) > 0:
log_fns[i].close()
procs[i].terminate()
print("all workers already completed, you can view logs under the `{}` directory".format(logs_dir),
file=sys.stderr)
print(
"all workers already completed, you can view logs under the `{}` directory".
format(logs_dir),
file=sys.stderr)
def run(self):
self.start_procs()
......@@ -26,7 +26,6 @@ from paddlerec.core.engine.engine import Engine
class LocalMPIEngine(Engine):
def start_procs(self):
logs_dir = self.envs["log_dir"]
default_env = os.environ.copy()
current_env = copy.copy(default_env)
current_env.pop("http_proxy", None)
......@@ -42,7 +41,8 @@ class LocalMPIEngine(Engine):
os.system("mkdir -p {}".format(logs_dir))
fn = open("%s/job.log" % logs_dir, "w")
log_fns.append(fn)
proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd())
proc = subprocess.Popen(
cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd())
else:
proc = subprocess.Popen(cmd, env=current_env, cwd=os.getcwd())
procs.append(proc)
......@@ -51,7 +51,9 @@ class LocalMPIEngine(Engine):
if len(log_fns) > 0:
log_fns[i].close()
procs[i].wait()
print("all workers and parameter servers already completed", file=sys.stderr)
print(
"all workers and parameter servers already completed",
file=sys.stderr)
def run(self):
self.start_procs()
......@@ -19,24 +19,23 @@ import yaml
from paddlerec.core.utils import envs
trainer_abs = os.path.join(os.path.dirname(
os.path.abspath(__file__)), "trainers")
trainer_abs = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "trainers")
trainers = {}
def trainer_registry():
trainers["SingleTrainer"] = os.path.join(
trainer_abs, "single_trainer.py")
trainers["ClusterTrainer"] = os.path.join(
trainer_abs, "cluster_trainer.py")
trainers["CtrCodingTrainer"] = os.path.join(
trainer_abs, "ctr_coding_trainer.py")
trainers["CtrModulTrainer"] = os.path.join(
trainer_abs, "ctr_modul_trainer.py")
trainers["TDMSingleTrainer"] = os.path.join(
trainer_abs, "tdm_single_trainer.py")
trainers["TDMClusterTrainer"] = os.path.join(
trainer_abs, "tdm_cluster_trainer.py")
trainers["SingleTrainer"] = os.path.join(trainer_abs, "single_trainer.py")
trainers["ClusterTrainer"] = os.path.join(trainer_abs,
"cluster_trainer.py")
trainers["CtrCodingTrainer"] = os.path.join(trainer_abs,
"ctr_coding_trainer.py")
trainers["CtrModulTrainer"] = os.path.join(trainer_abs,
"ctr_modul_trainer.py")
trainers["TDMSingleTrainer"] = os.path.join(trainer_abs,
"tdm_single_trainer.py")
trainers["TDMClusterTrainer"] = os.path.join(trainer_abs,
"tdm_cluster_trainer.py")
trainer_registry()
......@@ -55,8 +54,8 @@ class TrainerFactory(object):
if trainer_abs is None:
if not os.path.isfile(train_mode):
raise IOError(
"trainer {} can not be recognized".format(train_mode))
raise IOError("trainer {} can not be recognized".format(
train_mode))
trainer_abs = train_mode
train_mode = "UserDefineTrainer"
......
......@@ -22,7 +22,7 @@ from paddlerec.core.metric import Metric
class AUCMetric(Metric):
"""
Metric For Paddle Model
Metric For Fluid Model
"""
def __init__(self, config, fleet):
......@@ -83,7 +83,8 @@ class AUCMetric(Metric):
if scope.find_var(metric_item['var'].name) is None:
result[metric_name] = None
continue
result[metric_name] = self.get_metric(scope, metric_item['var'].name)
result[metric_name] = self.get_metric(scope,
metric_item['var'].name)
return result
def calculate_auc(self, global_pos, global_neg):
......@@ -178,14 +179,18 @@ class AUCMetric(Metric):
self._result['mean_q'] = 0
return self._result
if 'stat_pos' in result and 'stat_neg' in result:
result['auc'] = self.calculate_auc(result['stat_pos'], result['stat_neg'])
result['bucket_error'] = self.calculate_auc(result['stat_pos'], result['stat_neg'])
result['auc'] = self.calculate_auc(result['stat_pos'],
result['stat_neg'])
result['bucket_error'] = self.calculate_auc(result['stat_pos'],
result['stat_neg'])
if 'pos_ins_num' in result:
result['actual_ctr'] = result['pos_ins_num'] / result['total_ins_num']
result['actual_ctr'] = result['pos_ins_num'] / result[
'total_ins_num']
if 'abserr' in result:
result['mae'] = result['abserr'] / result['total_ins_num']
if 'sqrerr' in result:
result['rmse'] = math.sqrt(result['sqrerr'] / result['total_ins_num'])
result['rmse'] = math.sqrt(result['sqrerr'] /
result['total_ins_num'])
if 'prob' in result:
result['predict_ctr'] = result['prob'] / result['total_ins_num']
if abs(result['predict_ctr']) > 1e-6:
......
......@@ -20,7 +20,7 @@ from paddlerec.core.utils import envs
class Model(object):
"""R
"""Base Model
"""
__metaclass__ = abc.ABCMeta
......@@ -39,32 +39,43 @@ class Model(object):
self._platform = envs.get_platform()
def _init_slots(self):
sparse_slots = envs.get_global_env("sparse_slots", None, "train.reader")
sparse_slots = envs.get_global_env("sparse_slots", None,
"train.reader")
dense_slots = envs.get_global_env("dense_slots", None, "train.reader")
if sparse_slots is not None or dense_slots is not None:
sparse_slots = sparse_slots.strip().split(" ")
dense_slots = dense_slots.strip().split(" ")
dense_slots_shape = [[int(j) for j in i.split(":")[1].strip("[]").split(",")] for i in dense_slots]
dense_slots_shape = [[
int(j) for j in i.split(":")[1].strip("[]").split(",")
] for i in dense_slots]
dense_slots = [i.split(":")[0] for i in dense_slots]
self._dense_data_var = []
for i in range(len(dense_slots)):
l = fluid.layers.data(name=dense_slots[i], shape=dense_slots_shape[i], dtype="float32")
l = fluid.layers.data(
name=dense_slots[i],
shape=dense_slots_shape[i],
dtype="float32")
self._data_var.append(l)
self._dense_data_var.append(l)
self._sparse_data_var = []
for name in sparse_slots:
l = fluid.layers.data(name=name, shape=[1], lod_level=1, dtype="int64")
l = fluid.layers.data(
name=name, shape=[1], lod_level=1, dtype="int64")
self._data_var.append(l)
self._sparse_data_var.append(l)
dataset_class = envs.get_global_env("dataset_class", None, "train.reader")
dataset_class = envs.get_global_env("dataset_class", None,
"train.reader")
if dataset_class == "DataLoader":
self._init_dataloader()
def _init_dataloader(self):
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def get_inputs(self):
return self._data_var
......@@ -96,8 +107,8 @@ class Model(object):
"configured optimizer can only supported SGD/Adam/Adagrad")
if name == "SGD":
reg = envs.get_global_env(
"hyper_parameters.reg", 0.0001, self._namespace)
reg = envs.get_global_env("hyper_parameters.reg", 0.0001,
self._namespace)
optimizer_i = fluid.optimizer.SGD(
lr, regularization=fluid.regularizer.L2DecayRegularizer(reg))
elif name == "ADAM":
......@@ -111,10 +122,10 @@ class Model(object):
return optimizer_i
def optimizer(self):
learning_rate = envs.get_global_env(
"hyper_parameters.learning_rate", None, self._namespace)
optimizer = envs.get_global_env(
"hyper_parameters.optimizer", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = envs.get_global_env("hyper_parameters.optimizer", None,
self._namespace)
print(">>>>>>>>>>>.learnig rate: %s" % learning_rate)
return self._build_optimizer(optimizer, learning_rate)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -31,6 +31,7 @@ def create(config):
Model Instance
"""
model = None
if config['mode'] == 'fluid':
model = YamlModel(config)
model.train_net()
......@@ -50,7 +51,12 @@ class YamlModel(Model):
f = open(config['layer_file'], 'r')
self._build_nodes = yaml.safe_load(f.read())
self._build_phase = ['input', 'param', 'summary', 'layer']
self._build_param = {'layer': {}, 'inner_layer': {}, 'layer_extend': {}, 'model': {}}
self._build_param = {
'layer': {},
'inner_layer': {},
'layer_extend': {},
'model': {}
}
self._inference_meta = {'dependency': {}, 'params': {}}
def train_net(self):
......@@ -76,10 +82,12 @@ class YamlModel(Model):
if self._build_nodes[phase] is None:
continue
for node in self._build_nodes[phase]:
exec("""layer=layer.{}(node)""".format(node['class']))
layer_output, extend_output = layer.generate(self._config['mode'], self._build_param)
exec ("""layer=layer.{}(node)""".format(node['class']))
layer_output, extend_output = layer.generate(
self._config['mode'], self._build_param)
self._build_param['layer'][node['name']] = layer_output
self._build_param['layer_extend'][node['name']] = extend_output
self._build_param['layer_extend'][node[
'name']] = extend_output
if extend_output is None:
continue
if 'loss' in extend_output:
......@@ -89,17 +97,24 @@ class YamlModel(Model):
self._cost += extend_output['loss']
if 'data_var' in extend_output:
self._data_var += extend_output['data_var']
if 'metric_label' in extend_output and extend_output['metric_label'] is not None:
self._metrics[extend_output['metric_label']] = extend_output['metric_dict']
if 'metric_label' in extend_output and extend_output[
'metric_label'] is not None:
self._metrics[extend_output[
'metric_label']] = extend_output['metric_dict']
if 'inference_param' in extend_output:
inference_param = extend_output['inference_param']
param_name = inference_param['name']
if param_name not in self._build_param['table']:
self._build_param['table'][param_name] = {'params': []}
table_meta = table.TableMeta.alloc_new_table(inference_param['table_id'])
self._build_param['table'][param_name]['_meta'] = table_meta
self._build_param['table'][param_name]['params'] += inference_param['params']
self._build_param['table'][param_name] = {
'params': []
}
table_meta = table.TableMeta.alloc_new_table(
inference_param['table_id'])
self._build_param['table'][param_name][
'_meta'] = table_meta
self._build_param['table'][param_name][
'params'] += inference_param['params']
pass
@classmethod
......@@ -114,20 +129,25 @@ class YamlModel(Model):
metrics = params['metrics']
for name in metrics:
model_metrics = metrics[name]
stat_var_names += [model_metrics[metric]['var'].name for metric in model_metrics]
stat_var_names += [
model_metrics[metric]['var'].name
for metric in model_metrics
]
strategy['stat_var_names'] = list(set(stat_var_names))
optimizer_generator = 'optimizer = fluid.optimizer.' + optimizer_conf['class'] + \
'(learning_rate=' + str(optimizer_conf['learning_rate']) + ')'
exec(optimizer_generator)
exec (optimizer_generator)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
return optimizer
def dump_model_program(self, path):
"""R
"""
with open(path + '/' + self._name + '_main_program.pbtxt', "w") as fout:
with open(path + '/' + self._name + '_main_program.pbtxt',
"w") as fout:
print >> fout, self._build_param['model']['train_program']
with open(path + '/' + self._name + '_startup_program.pbtxt', "w") as fout:
with open(path + '/' + self._name + '_startup_program.pbtxt',
"w") as fout:
print >> fout, self._build_param['model']['startup_program']
pass
......@@ -137,7 +157,8 @@ class YamlModel(Model):
scope = params['scope']
decay = params['decay']
for param_table in self._build_param['table']:
table_id = self._build_param['table'][param_table]['_meta']._table_id
table_id = self._build_param['table'][param_table][
'_meta']._table_id
fleet.shrink_dense_table(decay, scope=scope, table_id=table_id)
def dump_inference_program(self, inference_layer, path):
......@@ -152,17 +173,25 @@ class YamlModel(Model):
executor = params['executor']
program = self._build_param['model']['train_program']
for table_name, table in self._build_param['table'].items():
fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id, table['params'])
fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id,
table['params'])
for infernce_item in params['inference_list']:
params_name_list = self.inference_params(infernce_item['layer_name'])
params_var_list = [program.global_block().var(i) for i in params_name_list]
params_name_list = self.inference_params(infernce_item[
'layer_name'])
params_var_list = [
program.global_block().var(i) for i in params_name_list
]
params_file_name = infernce_item['save_file_name']
with fluid.scope_guard(scope):
if params['save_combine']:
fluid.io.save_vars(executor, "./", \
program, vars=params_var_list, filename=params_file_name)
else:
fluid.io.save_vars(executor, params_file_name, program, vars=params_var_list)
fluid.io.save_vars(
executor,
params_file_name,
program,
vars=params_var_list)
def inference_params(self, inference_layer):
"""
......@@ -177,11 +206,13 @@ class YamlModel(Model):
return self._inference_meta['params'][layer]
self._inference_meta['params'][layer] = []
self._inference_meta['dependency'][layer] = self.get_dependency(self._build_param['inner_layer'], layer)
self._inference_meta['dependency'][layer] = self.get_dependency(
self._build_param['inner_layer'], layer)
for node in self._build_nodes['layer']:
if node['name'] not in self._inference_meta['dependency'][layer]:
continue
if 'inference_param' in self._build_param['layer_extend'][node['name']]:
if 'inference_param' in self._build_param['layer_extend'][node[
'name']]:
self._inference_meta['params'][layer] += \
self._build_param['layer_extend'][node['name']]['inference_param']['params']
return self._inference_meta['params'][layer]
......@@ -199,5 +230,6 @@ class YamlModel(Model):
dependencys = copy.deepcopy(layer_graph[dest_layer]['input'])
dependency_list = copy.deepcopy(dependencys)
for dependency in dependencys:
dependency_list = dependency_list + self.get_dependency(layer_graph, dependency)
dependency_list = dependency_list + self.get_dependency(
layer_graph, dependency)
return list(set(dependency_list))
......@@ -18,7 +18,7 @@ from paddlerec.core.layer import Layer
class EmbeddingFuseLayer(Layer):
"""R
"""embedding + sequence + concat
"""
def __init__(self, config):
......@@ -40,7 +40,8 @@ class EmbeddingFuseLayer(Layer):
show_clk.stop_gradient = True
data_var = []
for slot in self._slots:
l = fluid.layers.data(name=slot, shape=[1], dtype="int64", lod_level=1)
l = fluid.layers.data(
name=slot, shape=[1], dtype="int64", lod_level=1)
data_var.append(l)
emb = fluid.layers.embedding(input=l, size=[10, self._emb_dim], \
is_sparse=True, is_distributed=True,
......@@ -48,7 +49,8 @@ class EmbeddingFuseLayer(Layer):
emb = fluid.layers.sequence_pool(input=emb, pool_type='sum')
emb = fluid.layers.continuous_value_model(emb, show_clk, self._cvm)
self._emb_layers.append(emb)
output = fluid.layers.concat(input=self._emb_layers, axis=1, name=self._name)
output = fluid.layers.concat(
input=self._emb_layers, axis=1, name=self._name)
return output, {'data_var': data_var}
......@@ -111,7 +113,13 @@ class ParamLayer(Layer):
def generate(self, param):
"""R
"""
return self._config, {'inference_param': {'name': 'param', 'params': [], 'table_id': self._table_id}}
return self._config, {
'inference_param': {
'name': 'param',
'params': [],
'table_id': self._table_id
}
}
class SummaryLayer(Layer):
......@@ -129,7 +137,13 @@ class SummaryLayer(Layer):
def generate(self, param):
"""R
"""
return self._config, {'inference_param': {'name': 'summary', 'params': [], 'table_id': self._table_id}}
return self._config, {
'inference_param': {
'name': 'summary',
'params': [],
'table_id': self._table_id
}
}
class NormalizationLayer(Layer):
......@@ -152,9 +166,19 @@ class NormalizationLayer(Layer):
if len(self._input) > 0:
input_list = [param['layer'][i] for i in self._input]
input_layer = fluid.layers.concat(input=input_list, axis=1)
bn = fluid.layers.data_norm(input=input_layer, name=self._name, epsilon=1e-4, param_attr={
"batch_size": 1e4, "batch_sum_default": 0.0, "batch_square": 1e4})
inference_param = [self._name + '.batch_size', self._name + '.batch_sum', self._name + '.batch_square_sum']
bn = fluid.layers.data_norm(
input=input_layer,
name=self._name,
epsilon=1e-4,
param_attr={
"batch_size": 1e4,
"batch_sum_default": 0.0,
"batch_square": 1e4
})
inference_param = [
self._name + '.batch_size', self._name + '.batch_sum',
self._name + '.batch_square_sum'
]
return bn, {'inference_param': {'name': 'summary', \
'params': inference_param, 'table_id': summary_layer.get('table_id', -1)}}
......@@ -181,11 +205,13 @@ class FCLayer(Layer):
input_list = [param['layer'][i] for i in self._input]
input_layer = fluid.layers.concat(input=input_list, axis=1)
input_coln = input_layer.shape[1]
scale = param_layer['init_range'] / (input_coln ** 0.5)
scale = param_layer['init_range'] / (input_coln**0.5)
bias = None
if self._bias:
bias = fluid.ParamAttr(learning_rate=1.0,
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=scale))
bias = fluid.ParamAttr(
learning_rate=1.0,
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale))
fc = fluid.layers.fc(
name=self._name,
input=input_layer,
......@@ -216,18 +242,46 @@ class LogLossLayer(Layer):
self._extend_output = {
'metric_label': self._metric_label,
'metric_dict': {
'auc': {'var': None},
'batch_auc': {'var': None},
'stat_pos': {'var': None, 'data_type': 'int64'},
'stat_neg': {'var': None, 'data_type': 'int64'},
'batch_stat_pos': {'var': None, 'data_type': 'int64'},
'batch_stat_neg': {'var': None, 'data_type': 'int64'},
'pos_ins_num': {'var': None},
'abserr': {'var': None},
'sqrerr': {'var': None},
'prob': {'var': None},
'total_ins_num': {'var': None},
'q': {'var': None}
'auc': {
'var': None
},
'batch_auc': {
'var': None
},
'stat_pos': {
'var': None,
'data_type': 'int64'
},
'stat_neg': {
'var': None,
'data_type': 'int64'
},
'batch_stat_pos': {
'var': None,
'data_type': 'int64'
},
'batch_stat_neg': {
'var': None,
'data_type': 'int64'
},
'pos_ins_num': {
'var': None
},
'abserr': {
'var': None
},
'sqrerr': {
'var': None
},
'prob': {
'var': None
},
'total_ins_num': {
'var': None
},
'q': {
'var': None
}
}
}
......@@ -236,9 +290,12 @@ class LogLossLayer(Layer):
"""
input_layer = param['layer'][self._input[0]]
label_layer = param['layer'][self._label]
output = fluid.layers.clip(input_layer, self._bound[0], self._bound[1], name=self._name)
output = fluid.layers.clip(
input_layer, self._bound[0], self._bound[1], name=self._name)
norm = fluid.layers.sigmoid(output, name=self._name)
output = fluid.layers.log_loss(norm, fluid.layers.cast(x=label_layer, dtype='float32'))
output = fluid.layers.log_loss(
norm, fluid.layers.cast(
x=label_layer, dtype='float32'))
if self._weight:
weight_layer = param['layer'][self._weight]
output = fluid.layers.elementwise_mul(output, weight_layer)
......@@ -248,7 +305,11 @@ class LogLossLayer(Layer):
# For AUC Metric
metric = self._extend_output['metric_dict']
binary_predict = fluid.layers.concat(
input=[fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm), norm], axis=1)
input=[
fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm),
norm
],
axis=1)
metric['auc']['var'], metric['batch_auc']['var'], [metric['batch_stat_pos']['var'], \
metric['batch_stat_neg']['var'], metric['stat_pos']['var'],
metric['stat_neg']['var']] = \
......
......@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
import abc
import os
......@@ -64,7 +64,11 @@ class SlotReader(dg.MultiSlotDataGenerator):
from operator import mul
self.sparse_slots = sparse_slots.strip().split(" ")
self.dense_slots = dense_slots.strip().split(" ")
self.dense_slots_shape = [reduce(mul, [int(j) for j in i.split(":")[1].strip("[]").split(",")]) for i in self.dense_slots]
self.dense_slots_shape = [
reduce(mul,
[int(j) for j in i.split(":")[1].strip("[]").split(",")])
for i in self.dense_slots
]
self.dense_slots = [i.split(":")[0] for i in self.dense_slots]
self.slots = self.dense_slots + self.sparse_slots
self.slot2index = {}
......@@ -93,10 +97,13 @@ class SlotReader(dg.MultiSlotDataGenerator):
slot = i
if not self.visit[slot]:
if i in self.dense_slots:
output[self.slot2index[i]][1].extend([self.padding] * self.dense_slots_shape[self.slot2index[i]])
output[self.slot2index[i]][1].extend(
[self.padding] *
self.dense_slots_shape[self.slot2index[i]])
else:
output[self.slot2index[i]][1].extend([self.padding])
else:
self.visit[slot] = False
yield output
return reader
......@@ -30,8 +30,10 @@ class Trainer(object):
def __init__(self, config=None):
self._status_processor = {}
self._place = fluid.CPUPlace()
self._exe = fluid.Executor(self._place)
self._exector_context = {}
self._context = {'status': 'uninit', 'is_exit': False}
self._config_yaml = config
......@@ -95,6 +97,6 @@ def user_define_engine(engine_yaml):
train_dirname = os.path.dirname(train_location)
base_name = os.path.splitext(os.path.basename(train_location))[0]
sys.path.append(train_dirname)
trainer_class = envs.lazy_instance_by_fliename(
base_name, "UserDefineTraining")
trainer_class = envs.lazy_instance_by_fliename(base_name,
"UserDefineTraining")
return trainer_class
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
trainer implement.
......@@ -22,5 +21,3 @@ Trainer
↘ (for online learning training) OnlineLearningTrainer
"""
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -43,11 +42,14 @@ class ClusterTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
if envs.get_platform() == "LINUX" and envs.get_global_env(
"dataset_class", None, "train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train)
else:
self.regist_context_processor(
'train_pass', self.dataloader_train)
self.regist_context_processor('train_pass',
self.dataloader_train)
self.regist_context_processor('infer_pass', self.infer)
self.regist_context_processor('terminal_pass', self.terminal)
......@@ -75,8 +77,8 @@ class ClusterTrainer(TranspileTrainer):
def init(self, context):
self.model.train_net()
optimizer = self.model.optimizer()
optimizer_name = envs.get_global_env(
"hyper_parameters.optimizer", None, "train.model")
optimizer_name = envs.get_global_env("hyper_parameters.optimizer",
None, "train.model")
if optimizer_name not in ["", "sgd", "SGD", "Sgd"]:
os.environ["FLAGS_communicator_is_sgd_optimizer"] = '0'
......@@ -114,9 +116,9 @@ class ClusterTrainer(TranspileTrainer):
program = fluid.compiler.CompiledProgram(
fleet.main_program).with_data_parallel(
loss_name=self.model.get_avg_cost().name,
build_strategy=self.strategy.get_build_strategy(),
exec_strategy=self.strategy.get_execute_strategy())
loss_name=self.model.get_avg_cost().name,
build_strategy=self.strategy.get_build_strategy(),
exec_strategy=self.strategy.get_execute_strategy())
metrics_varnames = []
metrics_format = []
......@@ -135,9 +137,8 @@ class ClusterTrainer(TranspileTrainer):
batch_id = 0
try:
while True:
metrics_rets = self._exe.run(
program=program,
fetch_list=metrics_varnames)
metrics_rets = self._exe.run(program=program,
fetch_list=metrics_varnames)
metrics = [epoch, batch_id]
metrics.extend(metrics_rets)
......@@ -162,14 +163,16 @@ class ClusterTrainer(TranspileTrainer):
for i in range(epochs):
begin_time = time.time()
self._exe.train_from_dataset(program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
self._exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time()
times = end_time-begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times))
times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(
i, times, ins / times))
self.save(i, "train", is_fleet=True)
fleet.stop_worker()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
from paddlerec.core.utils import envs
from paddlerec.core.trainer import Trainer
class CtrTrainer(Trainer):
"""R
"""
def __init__(self, config):
"""R
"""
Trainer.__init__(self, config)
self.global_config = config
self._metrics = {}
self.processor_register()
def processor_register(self):
role = MPISymetricRoleMaker()
fleet.init(role)
if fleet.is_server():
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('server_pass', self.server)
else:
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('train_pass', self.train)
self.regist_context_processor('terminal_pass', self.terminal)
def _get_dataset(self):
namespace = "train.reader"
inputs = self.model.get_inputs()
threads = envs.get_global_env("train.threads", None)
batch_size = envs.get_global_env("batch_size", None, namespace)
reader_class = envs.get_global_env("class", None, namespace)
abs_dir = os.path.dirname(os.path.abspath(__file__))
reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN",
self._config_yaml)
train_data_path = envs.get_global_env("train_data_path", None,
namespace)
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(inputs)
dataset.set_pipe_command(pipe_cmd)
dataset.set_batch_size(batch_size)
dataset.set_thread(threads)
file_list = [
os.path.join(train_data_path, x)
for x in os.listdir(train_data_path)
]
dataset.set_filelist(file_list)
return dataset
def instance(self, context):
models = envs.get_global_env("train.model.models")
model_class = envs.lazy_instance_by_fliename(models, "Model")
self.model = model_class(None)
context['status'] = 'init_pass'
def init(self, context):
"""R
"""
self.model.train_net()
optimizer = self.model.optimizer()
optimizer = fleet.distributed_optimizer(
optimizer, strategy={"use_cvm": False})
optimizer.minimize(self.model.get_avg_cost())
if fleet.is_server():
context['status'] = 'server_pass'
else:
self.fetch_vars = []
self.fetch_alias = []
self.fetch_period = self.model.get_fetch_period()
metrics = self.model.get_metrics()
if metrics:
self.fetch_vars = metrics.values()
self.fetch_alias = metrics.keys()
context['status'] = 'train_pass'
def server(self, context):
fleet.run_server()
fleet.stop_worker()
context['is_exit'] = True
def train(self, context):
self._exe.run(fluid.default_startup_program())
fleet.init_worker()
dataset = self._get_dataset()
shuf = np.array([fleet.worker_index()])
gs = shuf * 0
fleet._role_maker._node_type_comm.Allreduce(shuf, gs)
print("trainer id: {}, trainers: {}, gs: {}".format(fleet.worker_index(
), fleet.worker_num(), gs))
epochs = envs.get_global_env("train.epochs")
for i in range(epochs):
self._exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
context['status'] = 'terminal_pass'
fleet.stop_worker()
def terminal(self, context):
print("terminal ended.")
context['is_exit'] = True
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import json
import sys
import time
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
from paddlerec.core.utils import fs as fs
from paddlerec.core.utils import util as util
from paddlerec.core.metrics.auc_metrics import AUCMetric
from paddlerec.core.modules.modul import build as model_basic
from paddlerec.core.utils import dataset
from paddlerec.core.trainer import Trainer
def wroker_numric_opt(value, env, opt):
"""
numric count opt for workers
Args:
value: value for count
env: mpi/gloo
opt: count operator, SUM/MAX/MIN/AVG
Return:
count result
"""
local_value = np.array([value])
global_value = np.copy(local_value) * 0
fleet._role_maker.all_reduce_worker(local_value, global_value, opt)
return global_value[0]
def worker_numric_sum(value, env="mpi"):
"""R
"""
return wroker_numric_opt(value, env, "sum")
def worker_numric_avg(value, env="mpi"):
"""R
"""
return worker_numric_sum(value, env) / fleet.worker_num()
def worker_numric_min(value, env="mpi"):
"""R
"""
return wroker_numric_opt(value, env, "min")
def worker_numric_max(value, env="mpi"):
"""R
"""
return wroker_numric_opt(value, env, "max")
class CtrTrainer(Trainer):
"""R
"""
def __init__(self, config):
"""R
"""
Trainer.__init__(self, config)
config['output_path'] = util.get_absolute_path(config['output_path'],
config['io']['afs'])
self.global_config = config
self._metrics = {}
self._path_generator = util.PathGenerator({
'templates': [{
'name': 'xbox_base_done',
'template': config['output_path'] + '/xbox_base_done.txt'
}, {
'name': 'xbox_delta_done',
'template': config['output_path'] + '/xbox_patch_done.txt'
}, {
'name': 'xbox_base',
'template': config['output_path'] + '/xbox/{day}/base/'
}, {
'name': 'xbox_delta',
'template':
config['output_path'] + '/xbox/{day}/delta-{pass_id}/'
}, {
'name': 'batch_model',
'template':
config['output_path'] + '/batch_model/{day}/{pass_id}/'
}]
})
if 'path_generator' in config:
self._path_generator.add_path_template(config['path_generator'])
self.regist_context_processor('uninit', self.init)
self.regist_context_processor('startup', self.startup)
self.regist_context_processor('begin_day', self.begin_day)
self.regist_context_processor('train_pass', self.train_pass)
self.regist_context_processor('end_day', self.end_day)
def init(self, context):
"""R
"""
role_maker = None
if self.global_config.get('process_mode', 'mpi') == 'brilliant_cpu':
afs_config = self.global_config['io']['afs']
role_maker = GeneralRoleMaker(
hdfs_name=afs_config['fs_name'],
hdfs_ugi=afs_config['fs_ugi'],
path=self.global_config['output_path'] + "/gloo",
init_timeout_seconds=1200,
run_timeout_seconds=1200)
fleet.init(role_maker)
data_var_list = []
data_var_name_dict = {}
runnnable_scope = []
runnnable_cost_op = []
context['status'] = 'startup'
for executor in self.global_config['executor']:
scope = fluid.Scope()
self._exector_context[executor['name']] = {}
self._exector_context[executor['name']]['scope'] = scope
self._exector_context[executor['name']][
'model'] = model_basic.create(executor)
model = self._exector_context[executor['name']]['model']
self._metrics.update(model.get_metrics())
runnnable_scope.append(scope)
runnnable_cost_op.append(model.get_avg_cost())
for var in model._data_var:
if var.name in data_var_name_dict:
continue
data_var_list.append(var)
data_var_name_dict[var.name] = var
optimizer = model_basic.YamlModel.build_optimizer({
'metrics': self._metrics,
'optimizer_conf': self.global_config['optimizer']
})
optimizer.minimize(runnnable_cost_op, runnnable_scope)
for executor in self.global_config['executor']:
scope = self._exector_context[executor['name']]['scope']
model = self._exector_context[executor['name']]['model']
program = model._build_param['model']['train_program']
if not executor['is_update_sparse']:
program._fleet_opt["program_configs"][str(
id(model.get_avg_cost().block.program))][
"push_sparse"] = []
if 'train_thread_num' not in executor:
executor['train_thread_num'] = self.global_config[
'train_thread_num']
with fluid.scope_guard(scope):
self._exe.run(model._build_param['model']['startup_program'])
model.dump_model_program('./')
# server init done
if fleet.is_server():
return 0
self._dataset = {}
for dataset_item in self.global_config['dataset']['data_list']:
dataset_item['data_vars'] = data_var_list
dataset_item.update(self.global_config['io']['afs'])
dataset_item["batch_size"] = self.global_config['batch_size']
self._dataset[dataset_item[
'name']] = dataset.FluidTimeSplitDataset(dataset_item)
# if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= last_day and config.reqi_dnn_plugin_pass >= last_pass:
# util.reqi_changeslot(config.hdfs_dnn_plugin_path, join_save_params, common_save_params, update_save_params, scope2, scope3)
fleet.init_worker()
pass
def print_log(self, log_str, params):
"""R
"""
params['index'] = fleet.worker_index()
if params['master']:
if fleet.worker_index() == 0:
print(log_str)
sys.stdout.flush()
else:
print(log_str)
if 'stdout' in params:
params['stdout'] += str(datetime.datetime.now()) + log_str
def print_global_metrics(self, scope, model, monitor_data, stdout_str):
"""R
"""
metrics = model.get_metrics()
metric_calculator = AUCMetric(None)
for metric in metrics:
metric_param = {'label': metric, 'metric_dict': metrics[metric]}
metric_calculator.calculate(scope, metric_param)
metric_result = metric_calculator.get_result_to_string()
self.print_log(metric_result,
{'master': True,
'stdout': stdout_str})
monitor_data += metric_result
metric_calculator.clear(scope, metric_param)
def save_model(self, day, pass_index, base_key):
"""R
"""
cost_printer = util.CostPrinter(util.print_cost, {
'master': True,
'log_format': 'save model cost %s sec'
})
model_path = self._path_generator.generate_path(
'batch_model', {'day': day,
'pass_id': pass_index})
save_mode = 0 # just save all
if pass_index < 1: # batch_model
save_mode = 3 # unseen_day++, save all
util.rank0_print("going to save_model %s" % model_path)
fleet.save_persistables(None, model_path, mode=save_mode)
if fleet._role_maker.is_first_worker():
self._train_pass.save_train_progress(
day, pass_index, base_key, model_path, is_checkpoint=True)
cost_printer.done()
return model_path
def save_xbox_model(self, day, pass_index, xbox_base_key, monitor_data):
"""R
"""
stdout_str = ""
xbox_patch_id = str(int(time.time()))
util.rank0_print("begin save delta model")
model_path = ""
xbox_model_donefile = ""
cost_printer = util.CostPrinter(util.print_cost, {'master': True, \
'log_format': 'save xbox model cost %s sec',
'stdout': stdout_str})
if pass_index < 1:
save_mode = 2
xbox_patch_id = xbox_base_key
model_path = self._path_generator.generate_path('xbox_base',
{'day': day})
xbox_model_donefile = self._path_generator.generate_path(
'xbox_base_done', {'day': day})
else:
save_mode = 1
model_path = self._path_generator.generate_path(
'xbox_delta', {'day': day,
'pass_id': pass_index})
xbox_model_donefile = self._path_generator.generate_path(
'xbox_delta_done', {'day': day})
total_save_num = fleet.save_persistables(
None, model_path, mode=save_mode)
cost_printer.done()
cost_printer = util.CostPrinter(util.print_cost, {
'master': True,
'log_format': 'save cache model cost %s sec',
'stdout': stdout_str
})
model_file_handler = fs.FileHandler(self.global_config['io']['afs'])
if self.global_config['save_cache_model']:
cache_save_num = fleet.save_cache_model(
None, model_path, mode=save_mode)
model_file_handler.write(
"file_prefix:part\npart_num:16\nkey_num:%d\n" % cache_save_num,
model_path + '/000_cache/sparse_cache.meta', 'w')
cost_printer.done()
util.rank0_print("save xbox cache model done, key_num=%s" %
cache_save_num)
save_env_param = {'executor': self._exe, 'save_combine': True}
cost_printer = util.CostPrinter(util.print_cost, {
'master': True,
'log_format': 'save dense model cost %s sec',
'stdout': stdout_str
})
if fleet._role_maker.is_first_worker():
for executor in self.global_config['executor']:
if 'layer_for_inference' not in executor:
continue
executor_name = executor['name']
model = self._exector_context[executor_name]['model']
save_env_param['inference_list'] = executor[
'layer_for_inference']
save_env_param['scope'] = self._exector_context[executor_name][
'scope']
model.dump_inference_param(save_env_param)
for dnn_layer in executor['layer_for_inference']:
model_file_handler.cp(dnn_layer['save_file_name'],
model_path + '/dnn_plugin/' +
dnn_layer['save_file_name'])
fleet._role_maker._barrier_worker()
cost_printer.done()
xbox_done_info = {
"id": xbox_patch_id,
"key": xbox_base_key,
"ins_path": "",
"ins_tag": "feasign",
"partition_type": "2",
"record_count": "111111",
"monitor_data": monitor_data,
"mpi_size": str(fleet.worker_num()),
"input": model_path.rstrip("/") + "/000",
"job_id": util.get_env_value("JOB_ID"),
"job_name": util.get_env_value("JOB_NAME")
}
if fleet._role_maker.is_first_worker():
model_file_handler.write(
json.dumps(xbox_done_info) + "\n", xbox_model_donefile, 'a')
if pass_index > 0:
self._train_pass.save_train_progress(
day,
pass_index,
xbox_base_key,
model_path,
is_checkpoint=False)
fleet._role_maker._barrier_worker()
return stdout_str
def run_executor(self, executor_config, dataset, stdout_str):
"""R
"""
day = self._train_pass.date()
pass_id = self._train_pass._pass_id
xbox_base_key = self._train_pass._base_key
executor_name = executor_config['name']
scope = self._exector_context[executor_name]['scope']
model = self._exector_context[executor_name]['model']
with fluid.scope_guard(scope):
util.rank0_print("Begin " + executor_name + " pass")
begin = time.time()
program = model._build_param['model']['train_program']
self._exe.train_from_dataset(
program,
dataset,
scope,
thread=executor_config['train_thread_num'],
debug=self.global_config['debug'])
end = time.time()
local_cost = (end - begin) / 60.0
avg_cost = worker_numric_avg(local_cost)
min_cost = worker_numric_min(local_cost)
max_cost = worker_numric_max(local_cost)
util.rank0_print("avg train time %s mins, min %s mins, max %s mins"
% (avg_cost, min_cost, max_cost))
self._exector_context[executor_name]['cost'] = max_cost
monitor_data = ""
self.print_global_metrics(scope, model, monitor_data, stdout_str)
util.rank0_print("End " + executor_name + " pass")
if self._train_pass.need_dump_inference(
pass_id) and executor_config['dump_inference_model']:
stdout_str += self.save_xbox_model(day, pass_id, xbox_base_key,
monitor_data)
fleet._role_maker._barrier_worker()
def startup(self, context):
"""R
"""
if fleet.is_server():
fleet.run_server()
context['status'] = 'wait'
return
stdout_str = ""
self._train_pass = util.TimeTrainPass(self.global_config)
if not self.global_config['cold_start']:
cost_printer = util.CostPrinter(util.print_cost, {
'master': True,
'log_format': 'load model cost %s sec',
'stdout': stdout_str
})
self.print_log("going to load model %s" %
self._train_pass._checkpoint_model_path,
{'master': True})
# if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= self._train_pass.date()
# and config.reqi_dnn_plugin_pass >= self._pass_id:
# fleet.load_one_table(0, self._train_pass._checkpoint_model_path)
# else:
fleet.init_server(self._train_pass._checkpoint_model_path, mode=0)
cost_printer.done()
if self.global_config['save_first_base']:
self.print_log("save_first_base=True", {'master': True})
self.print_log("going to save xbox base model",
{'master': True,
'stdout': stdout_str})
self._train_pass._base_key = int(time.time())
stdout_str += self.save_xbox_model(self._train_pass.date(), 0,
self._train_pass._base_key, "")
context['status'] = 'begin_day'
def begin_day(self, context):
"""R
"""
stdout_str = ""
if not self._train_pass.next():
context['is_exit'] = True
day = self._train_pass.date()
pass_id = self._train_pass._pass_id
self.print_log("======== BEGIN DAY:%s ========" % day,
{'master': True,
'stdout': stdout_str})
if pass_id == self._train_pass.max_pass_num_day():
context['status'] = 'end_day'
else:
context['status'] = 'train_pass'
def end_day(self, context):
"""R
"""
day = self._train_pass.date()
pass_id = self._train_pass._pass_id
xbox_base_key = int(time.time())
context['status'] = 'begin_day'
util.rank0_print("shrink table")
cost_printer = util.CostPrinter(util.print_cost, {
'master': True,
'log_format': 'shrink table done, cost %s sec'
})
fleet.shrink_sparse_table()
for executor in self._exector_context:
self._exector_context[executor]['model'].shrink({
'scope': self._exector_context[executor]['scope'],
'decay': self.global_config['optimizer']['dense_decay_rate']
})
cost_printer.done()
next_date = self._train_pass.date(delta_day=1)
util.rank0_print("going to save xbox base model")
self.save_xbox_model(next_date, 0, xbox_base_key, "")
util.rank0_print("going to save batch model")
self.save_model(next_date, 0, xbox_base_key)
self._train_pass._base_key = xbox_base_key
fleet._role_maker._barrier_worker()
def train_pass(self, context):
"""R
"""
stdout_str = ""
day = self._train_pass.date()
pass_id = self._train_pass._pass_id
base_key = self._train_pass._base_key
pass_time = self._train_pass._current_train_time.strftime("%Y%m%d%H%M")
self.print_log(" ==== begin delta:%s ========" % pass_id,
{'master': True,
'stdout': stdout_str})
train_begin_time = time.time()
cost_printer = util.CostPrinter(util.print_cost, \
{'master': True, 'log_format': 'load into memory done, cost %s sec',
'stdout': stdout_str})
current_dataset = {}
for name in self._dataset:
current_dataset[name] = self._dataset[name].load_dataset({
'node_num': fleet.worker_num(),
'node_idx': fleet.worker_index(),
'begin_time': pass_time,
'time_window_min': self._train_pass._interval_per_pass
})
fleet._role_maker._barrier_worker()
cost_printer.done()
util.rank0_print("going to global shuffle")
cost_printer = util.CostPrinter(util.print_cost, {
'master': True,
'stdout': stdout_str,
'log_format': 'global shuffle done, cost %s sec'
})
for name in current_dataset:
current_dataset[name].global_shuffle(
fleet, self.global_config['dataset']['shuffle_thread'])
cost_printer.done()
# str(dataset.get_shuffle_data_size(fleet))
fleet._role_maker._barrier_worker()
if self.global_config['prefetch_data']:
next_pass_time = (
self._train_pass._current_train_time + datetime.timedelta(
minutes=self._train_pass._interval_per_pass)
).strftime("%Y%m%d%H%M")
for name in self._dataset:
self._dataset[name].preload_dataset({
'node_num': fleet.worker_num(),
'node_idx': fleet.worker_index(),
'begin_time': next_pass_time,
'time_window_min': self._train_pass._interval_per_pass
})
fleet._role_maker._barrier_worker()
pure_train_begin = time.time()
for executor in self.global_config['executor']:
self.run_executor(executor,
current_dataset[executor['dataset_name']],
stdout_str)
cost_printer = util.CostPrinter(util.print_cost, \
{'master': True, 'log_format': 'release_memory cost %s sec'})
for name in current_dataset:
current_dataset[name].release_memory()
pure_train_cost = time.time() - pure_train_begin
if self._train_pass.is_checkpoint_pass(pass_id):
self.save_model(day, pass_id, base_key)
train_end_time = time.time()
train_cost = train_end_time - train_begin_time
other_cost = train_cost - pure_train_cost
log_str = "finished train day %s pass %s time cost:%s sec job time cost:" % (
day, pass_id, train_cost)
for executor in self._exector_context:
log_str += '[' + executor + ':' + str(self._exector_context[
executor]['cost']) + ']'
log_str += '[other_cost:' + str(other_cost) + ']'
util.rank0_print(log_str)
stdout_str += util.now_time_str() + log_str
sys.stdout.write(stdout_str)
fleet._role_maker._barrier_worker()
stdout_str = ""
if pass_id == self._train_pass.max_pass_num_day():
context['status'] = 'end_day'
return
elif not self._train_pass.next():
context['is_exit'] = True
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -44,11 +43,14 @@ class OnlineLearningTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
if envs.get_platform() == "LINUX" and envs.get_global_env(
"dataset_class", None, "train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train)
else:
self.regist_context_processor(
'train_pass', self.dataloader_train)
self.regist_context_processor('train_pass',
self.dataloader_train)
self.regist_context_processor('infer_pass', self.infer)
self.regist_context_processor('terminal_pass', self.terminal)
......@@ -110,27 +112,27 @@ class OnlineLearningTrainer(TranspileTrainer):
if state == "TRAIN":
inputs = self.model.get_inputs()
namespace = "train.reader"
train_data_path = envs.get_global_env(
"train_data_path", None, namespace)
train_data_path = envs.get_global_env("train_data_path", None,
namespace)
else:
inputs = self.model.get_infer_inputs()
namespace = "evaluate.reader"
train_data_path = envs.get_global_env(
"test_data_path", None, namespace)
train_data_path = envs.get_global_env("test_data_path", None,
namespace)
threads = int(envs.get_runtime_environ("train.trainer.threads"))
batch_size = envs.get_global_env("batch_size", None, namespace)
reader_class = envs.get_global_env("class", None, namespace)
abs_dir = os.path.dirname(os.path.abspath(__file__))
reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
pipe_cmd = "python {} {} {} {}".format(
reader, reader_class, state, self._config_yaml)
pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state,
self._config_yaml)
if train_data_path.startswith("paddlerec::"):
package_base = envs.get_runtime_environ("PACKAGE_BASE")
assert package_base is not None
train_data_path = os.path.join(
package_base, train_data_path.split("::")[1])
train_data_path = os.path.join(package_base,
train_data_path.split("::")[1])
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(inputs)
......@@ -166,14 +168,16 @@ class OnlineLearningTrainer(TranspileTrainer):
ins = self._get_dataset_ins()
begin_time = time.time()
self._exe.train_from_dataset(program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
self._exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time()
times = end_time-begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times))
times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(
i, times, ins / times))
self.save(i, "train", is_fleet=True)
fleet.stop_worker()
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -36,8 +35,9 @@ class SingleTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None,
"train.reader") != "DataLoader":
if envs.get_platform() == "LINUX" and envs.get_global_env(
"dataset_class", None, "train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train)
else:
self.regist_context_processor('train_pass', self.dataloader_train)
......@@ -73,9 +73,8 @@ class SingleTrainer(TranspileTrainer):
reader = self._get_dataloader("TRAIN")
epochs = envs.get_global_env("train.epochs")
program = fluid.compiler.CompiledProgram(
fluid.default_main_program()).with_data_parallel(
loss_name=self.model.get_avg_cost().name)
program = fluid.compiler.CompiledProgram(fluid.default_main_program(
)).with_data_parallel(loss_name=self.model.get_avg_cost().name)
metrics_varnames = []
metrics_format = []
......@@ -94,9 +93,8 @@ class SingleTrainer(TranspileTrainer):
batch_id = 0
try:
while True:
metrics_rets = self._exe.run(
program=program,
fetch_list=metrics_varnames)
metrics_rets = self._exe.run(program=program,
fetch_list=metrics_varnames)
metrics = [epoch, batch_id]
metrics.extend(metrics_rets)
......@@ -117,14 +115,16 @@ class SingleTrainer(TranspileTrainer):
epochs = envs.get_global_env("train.epochs")
for i in range(epochs):
begin_time = time.time()
self._exe.train_from_dataset(program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
self._exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time()
times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times))
print("epoch {} using time {}, speed {:.2f} lines/s".format(
i, times, ins / times))
self.save(i, "train", is_fleet=False)
context['status'] = 'infer_pass'
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -36,8 +35,8 @@ special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info"]
class TDMClusterTrainer(ClusterTrainer):
def server(self, context):
namespace = "train.startup"
init_model_path = envs.get_global_env(
"cluster.init_model_path", "", namespace)
init_model_path = envs.get_global_env("cluster.init_model_path", "",
namespace)
assert init_model_path != "", "Cluster train must has init_model for TDM"
fleet.init_server(init_model_path)
logger.info("TDM: load model from {}".format(init_model_path))
......@@ -48,24 +47,27 @@ class TDMClusterTrainer(ClusterTrainer):
self._exe.run(fleet.startup_program)
namespace = "train.startup"
load_tree = envs.get_global_env(
"tree.load_tree", True, namespace)
self.tree_layer_path = envs.get_global_env(
"tree.tree_layer_path", "", namespace)
self.tree_travel_path = envs.get_global_env(
"tree.tree_travel_path", "", namespace)
self.tree_info_path = envs.get_global_env(
"tree.tree_info_path", "", namespace)
save_init_model = envs.get_global_env(
"cluster.save_init_model", False, namespace)
init_model_path = envs.get_global_env(
"cluster.init_model_path", "", namespace)
load_tree = envs.get_global_env("tree.load_tree", True, namespace)
self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "",
namespace)
self.tree_travel_path = envs.get_global_env("tree.tree_travel_path",
"", namespace)
self.tree_info_path = envs.get_global_env("tree.tree_info_path", "",
namespace)
save_init_model = envs.get_global_env("cluster.save_init_model", False,
namespace)
init_model_path = envs.get_global_env("cluster.init_model_path", "",
namespace)
if load_tree:
# covert tree to tensor, set it into Fluid's variable.
for param_name in special_param:
param_t = fluid.global_scope().find_var(param_name).get_tensor()
param_t = fluid.global_scope().find_var(param_name).get_tensor(
)
param_array = self._tdm_prepare(param_name)
param_t.set(param_array.astype('int32'), self._place)
......@@ -93,8 +95,8 @@ class TDMClusterTrainer(ClusterTrainer):
def _tdm_travel_prepare(self):
"""load tdm tree param from npy/list file"""
travel_array = np.load(self.tree_travel_path)
logger.info("TDM Tree leaf node nums: {}".format(
travel_array.shape[0]))
logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[
0]))
return travel_array
def _tdm_layer_prepare(self):
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -27,33 +26,38 @@ from paddlerec.core.utils import envs
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer",
"TDM_Tree_Info", "TDM_Tree_Emb"]
special_param = [
"TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info", "TDM_Tree_Emb"
]
class TDMSingleTrainer(SingleTrainer):
def startup(self, context):
namespace = "train.startup"
load_persistables = envs.get_global_env(
"single.load_persistables", False, namespace)
load_persistables = envs.get_global_env("single.load_persistables",
False, namespace)
persistables_model_path = envs.get_global_env(
"single.persistables_model_path", "", namespace)
load_tree = envs.get_global_env(
"tree.load_tree", False, namespace)
self.tree_layer_path = envs.get_global_env(
"tree.tree_layer_path", "", namespace)
self.tree_travel_path = envs.get_global_env(
"tree.tree_travel_path", "", namespace)
self.tree_info_path = envs.get_global_env(
"tree.tree_info_path", "", namespace)
self.tree_emb_path = envs.get_global_env(
"tree.tree_emb_path", "", namespace)
save_init_model = envs.get_global_env(
"single.save_init_model", False, namespace)
init_model_path = envs.get_global_env(
"single.init_model_path", "", namespace)
load_tree = envs.get_global_env("tree.load_tree", False, namespace)
self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "",
namespace)
self.tree_travel_path = envs.get_global_env("tree.tree_travel_path",
"", namespace)
self.tree_info_path = envs.get_global_env("tree.tree_info_path", "",
namespace)
self.tree_emb_path = envs.get_global_env("tree.tree_emb_path", "",
namespace)
save_init_model = envs.get_global_env("single.save_init_model", False,
namespace)
init_model_path = envs.get_global_env("single.init_model_path", "",
namespace)
self._exe.run(fluid.default_startup_program())
if load_persistables:
......@@ -68,7 +72,8 @@ class TDMSingleTrainer(SingleTrainer):
if load_tree:
# covert tree to tensor, set it into Fluid's variable.
for param_name in special_param:
param_t = fluid.global_scope().find_var(param_name).get_tensor()
param_t = fluid.global_scope().find_var(param_name).get_tensor(
)
param_array = self._tdm_prepare(param_name)
if param_name == 'TDM_Tree_Emb':
param_t.set(param_array.astype('float32'), self._place)
......@@ -102,15 +107,15 @@ class TDMSingleTrainer(SingleTrainer):
def _tdm_travel_prepare(self):
"""load tdm tree param from npy/list file"""
travel_array = np.load(self.tree_travel_path)
logger.info("TDM Tree leaf node nums: {}".format(
travel_array.shape[0]))
logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[
0]))
return travel_array
def _tdm_emb_prepare(self):
"""load tdm tree param from npy/list file"""
emb_array = np.load(self.tree_emb_path)
logger.info("TDM Tree node nums from emb: {}".format(
emb_array.shape[0]))
logger.info("TDM Tree node nums from emb: {}".format(emb_array.shape[
0]))
return emb_array
def _tdm_layer_prepare(self):
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with DistributeTranspiler
"""
......@@ -39,9 +38,12 @@ class TranspileTrainer(Trainer):
self.increment_models = []
def processor_register(self):
print("Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first")
print(
"Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first"
)
def _get_dataloader(self, state="TRAIN"):
if state == "TRAIN":
dataloader = self.model._data_loader
namespace = "train.reader"
......@@ -59,12 +61,14 @@ class TranspileTrainer(Trainer):
if sparse_slots is None and dense_slots is None:
reader_class = envs.get_global_env("class", None, namespace)
reader = dataloader_instance.dataloader(
reader_class, state, self._config_yaml)
reader_class = envs.lazy_instance_by_fliename(reader_class, class_name)
reader = dataloader_instance.dataloader(reader_class, state,
self._config_yaml)
reader_class = envs.lazy_instance_by_fliename(reader_class,
class_name)
reader_ins = reader_class(self._config_yaml)
else:
reader = dataloader_instance.slotdataloader("", state, self._config_yaml)
reader = dataloader_instance.slotdataloader("", state,
self._config_yaml)
reader_ins = SlotReader(self._config_yaml)
if hasattr(reader_ins, 'generate_batch_from_trainfiles'):
......@@ -94,13 +98,13 @@ class TranspileTrainer(Trainer):
if state == "TRAIN":
inputs = self.model.get_inputs()
namespace = "train.reader"
train_data_path = envs.get_global_env(
"train_data_path", None, namespace)
train_data_path = envs.get_global_env("train_data_path", None,
namespace)
else:
inputs = self.model.get_infer_inputs()
namespace = "evaluate.reader"
train_data_path = envs.get_global_env(
"test_data_path", None, namespace)
train_data_path = envs.get_global_env("test_data_path", None,
namespace)
sparse_slots = envs.get_global_env("sparse_slots", None, namespace)
dense_slots = envs.get_global_env("dense_slots", None, namespace)
......@@ -112,8 +116,8 @@ class TranspileTrainer(Trainer):
reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
if sparse_slots is None and dense_slots is None:
pipe_cmd = "python {} {} {} {}".format(
reader, reader_class, state, self._config_yaml)
pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state,
self._config_yaml)
else:
padding = envs.get_global_env("padding", 0, namespace)
pipe_cmd = "python {} {} {} {} {} {} {} {}".format(
......@@ -123,8 +127,8 @@ class TranspileTrainer(Trainer):
if train_data_path.startswith("paddlerec::"):
package_base = envs.get_runtime_environ("PACKAGE_BASE")
assert package_base is not None
train_data_path = os.path.join(
package_base, train_data_path.split("::")[1])
train_data_path = os.path.join(package_base,
train_data_path.split("::")[1])
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(inputs)
......@@ -140,11 +144,11 @@ class TranspileTrainer(Trainer):
debug_mode = envs.get_global_env("reader_debug_mode", False, namespace)
if debug_mode:
print(
"--- Dataset Debug Mode Begin , show pre 10 data of {}---".format(file_list[0]))
print("--- Dataset Debug Mode Begin , show pre 10 data of {}---".
format(file_list[0]))
os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd))
print(
"--- Dataset Debug Mode End , show pre 10 data of {}---".format(file_list[0]))
print("--- Dataset Debug Mode End , show pre 10 data of {}---".
format(file_list[0]))
exit(0)
return dataset
......@@ -166,27 +170,29 @@ class TranspileTrainer(Trainer):
if not need_save(epoch_id, save_interval, False):
return
feed_varnames = envs.get_global_env(
"save.inference.feed_varnames", None, namespace)
feed_varnames = envs.get_global_env("save.inference.feed_varnames",
None, namespace)
fetch_varnames = envs.get_global_env(
"save.inference.fetch_varnames", None, namespace)
if feed_varnames is None or fetch_varnames is None:
return
fetch_vars = [fluid.default_main_program().global_block().vars[varname]
for varname in fetch_varnames]
dirname = envs.get_global_env(
"save.inference.dirname", None, namespace)
fetch_vars = [
fluid.default_main_program().global_block().vars[varname]
for varname in fetch_varnames
]
dirname = envs.get_global_env("save.inference.dirname", None,
namespace)
assert dirname is not None
dirname = os.path.join(dirname, str(epoch_id))
if is_fleet:
fleet.save_inference_model(
self._exe, dirname, feed_varnames, fetch_vars)
fleet.save_inference_model(self._exe, dirname, feed_varnames,
fetch_vars)
else:
fluid.io.save_inference_model(
dirname, feed_varnames, fetch_vars, self._exe)
fluid.io.save_inference_model(dirname, feed_varnames,
fetch_vars, self._exe)
self.inference_models.append((epoch_id, dirname))
def save_persistables():
......@@ -196,8 +202,8 @@ class TranspileTrainer(Trainer):
if not need_save(epoch_id, save_interval, False):
return
dirname = envs.get_global_env(
"save.increment.dirname", None, namespace)
dirname = envs.get_global_env("save.increment.dirname", None,
namespace)
assert dirname is not None
dirname = os.path.join(dirname, str(epoch_id))
......@@ -275,10 +281,9 @@ class TranspileTrainer(Trainer):
batch_id = 0
try:
while True:
metrics_rets = self._exe.run(
program=program,
fetch_list=metrics_varnames,
return_numpy=is_return_numpy)
metrics_rets = self._exe.run(program=program,
fetch_list=metrics_varnames,
return_numpy=is_return_numpy)
metrics = [epoch, batch_id]
metrics.extend(metrics_rets)
......
......@@ -24,7 +24,7 @@ from paddlerec.core.utils import util as util
class DatasetHolder(object):
"""
Dataset Base
Dataset Holder
"""
__metaclass__ = abc.ABCMeta
......@@ -74,11 +74,17 @@ class TimeSplitDatasetHolder(DatasetHolder):
Dataset.__init__(self, config)
if 'data_donefile' not in config or config['data_donefile'] is None:
config['data_donefile'] = config['data_path'] + "/to.hadoop.done"
self._path_generator = util.PathGenerator({'templates': [
{'name': 'data_path', 'template': config['data_path']},
{'name': 'donefile_path', 'template': config['data_donefile']}
]})
self._split_interval = config['split_interval'] # data split N mins per dir
self._path_generator = util.PathGenerator({
'templates': [{
'name': 'data_path',
'template': config['data_path']
}, {
'name': 'donefile_path',
'template': config['data_donefile']
}]
})
self._split_interval = config[
'split_interval'] # data split N mins per dir
self._data_file_handler = fs.FileHandler(config)
def _format_data_time(self, daytime_str, time_window_mins):
......@@ -91,7 +97,8 @@ class TimeSplitDatasetHolder(DatasetHolder):
return None, 0
if mins_of_day % self._split_interval != 0:
skip_mins = self._split_interval - (mins_of_day % self._split_interval)
skip_mins = self._split_interval - (mins_of_day %
self._split_interval)
data_time = data_time + datetime.timedelta(minutes=skip_mins)
time_window_mins = time_window_mins - skip_mins
return data_time, time_window_mins
......@@ -106,17 +113,24 @@ class TimeSplitDatasetHolder(DatasetHolder):
True/False
"""
is_ready = True
data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins)
data_time, windows_mins = self._format_data_time(daytime_str,
time_window_mins)
while time_window_mins > 0:
file_path = self._path_generator.generate_path('donefile_path', {'time_format': data_time})
file_path = self._path_generator.generate_path(
'donefile_path', {'time_format': data_time})
if not self._data_file_handler.is_exist(file_path):
is_ready = False
break
time_window_mins = time_window_mins - self._split_interval
data_time = data_time + datetime.timedelta(minutes=self._split_interval)
data_time = data_time + datetime.timedelta(
minutes=self._split_interval)
return is_ready
def get_file_list(self, daytime_str, time_window_mins, node_num=1, node_idx=0):
def get_file_list(self,
daytime_str,
time_window_mins,
node_num=1,
node_idx=0):
"""
data in [daytime_str, daytime_str + time_window_mins], random shard to node_num, return shard[node_idx]
Args:
......@@ -128,26 +142,32 @@ class TimeSplitDatasetHolder(DatasetHolder):
list, data_shard[node_idx]
"""
data_file_list = []
data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins)
data_time, windows_mins = self._format_data_time(daytime_str,
time_window_mins)
while time_window_mins > 0:
file_path = self._path_generator.generate_path('data_path', {'time_format': data_time})
file_path = self._path_generator.generate_path(
'data_path', {'time_format': data_time})
sub_file_list = self._data_file_handler.ls(file_path)
for sub_file in sub_file_list:
sub_file_name = self._data_file_handler.get_file_name(sub_file)
if not sub_file_name.startswith(self._config['filename_prefix']):
if not sub_file_name.startswith(self._config[
'filename_prefix']):
continue
if hash(sub_file_name) % node_num == node_idx:
data_file_list.append(sub_file)
time_window_mins = time_window_mins - self._split_interval
data_time = data_time + datetime.timedelta(minutes=self._split_interval)
data_time = data_time + datetime.timedelta(
minutes=self._split_interval)
return data_file_list
def _alloc_dataset(self, file_list):
""" """
dataset = fluid.DatasetFactory().create_dataset(self._config['dataset_type'])
dataset = fluid.DatasetFactory().create_dataset(self._config[
'dataset_type'])
dataset.set_batch_size(self._config['batch_size'])
dataset.set_thread(self._config['load_thread'])
dataset.set_hdfs_config(self._config['fs_name'], self._config['fs_ugi'])
dataset.set_hdfs_config(self._config['fs_name'],
self._config['fs_ugi'])
dataset.set_pipe_command(self._config['data_converter'])
dataset.set_filelist(file_list)
dataset.set_use_var(self._config['data_vars'])
......@@ -163,7 +183,9 @@ class TimeSplitDatasetHolder(DatasetHolder):
while self.check_ready(begin_time, windown_min) == False:
print("dataset not ready, time:" + begin_time)
time.sleep(30)
file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx'])
file_list = self.get_file_list(begin_time, windown_min,
params['node_num'],
params['node_idx'])
self._datasets[begin_time] = self._alloc_dataset(file_list)
self._datasets[begin_time].load_into_memory()
else:
......@@ -176,9 +198,12 @@ class TimeSplitDatasetHolder(DatasetHolder):
windown_min = params['time_window_min']
if begin_time not in self._datasets:
if self.check_ready(begin_time, windown_min):
file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx'])
file_list = self.get_file_list(begin_time, windown_min,
params['node_num'],
params['node_idx'])
self._datasets[begin_time] = self._alloc_dataset(file_list)
self._datasets[begin_time].preload_into_memory(self._config['preload_thread'])
self._datasets[begin_time].preload_into_memory(self._config[
'preload_thread'])
return True
return False
......
......@@ -17,10 +17,11 @@ import sys
from paddlerec.core.utils.envs import lazy_instance_by_fliename
from paddlerec.core.reader import SlotReader
from paddlerec.core.utils import envs
if len(sys.argv) < 4:
raise ValueError("reader only accept 3 argument: 1. reader_class 2.train/evaluate/slotreader 3.yaml_abs_path")
raise ValueError(
"reader only accept 3 argument: 1. reader_class 2.train/evaluate/slotreader 3.yaml_abs_path"
)
reader_package = sys.argv[1]
......
......@@ -95,7 +95,7 @@ def path_adapter(path):
l_p = path.split("paddlerec.")[1].replace(".", "/")
return os.path.join(package, l_p)
else:
return path
return path
def windows_path_converter(path):
......@@ -159,8 +159,8 @@ def pretty_print_envs(envs, header=None):
def lazy_instance_by_package(package, class_name):
models = get_global_env("train.model.models")
model_package = __import__(
package, globals(), locals(), package.split("."))
model_package = __import__(package,
globals(), locals(), package.split("."))
instance = getattr(model_package, class_name)
return instance
......@@ -170,8 +170,8 @@ def lazy_instance_by_fliename(abs, class_name):
sys.path.append(dirname)
package = os.path.splitext(os.path.basename(abs))[0]
model_package = __import__(
package, globals(), locals(), package.split("."))
model_package = __import__(package,
globals(), locals(), package.split("."))
instance = getattr(model_package, class_name)
return instance
......@@ -189,8 +189,7 @@ def get_platform():
def find_free_port():
def __free_port():
with closing(socket.socket(socket.AF_INET,
socket.SOCK_STREAM)) as s:
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
return s.getsockname()[1]
......
......@@ -18,7 +18,7 @@ from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
def is_afs_path(path):
"""R
"""is_afs_path
"""
if path.startswith("afs") or path.startswith("hdfs"):
return True
......@@ -133,8 +133,9 @@ class FileHandler(object):
if mode.find('a') >= 0:
org_content = self._hdfs_client.cat(dest_path)
content = content + org_content
self._local_fs_client.write(content, temp_local_file,
mode) # fleet hdfs_client only support upload, so write tmp file
self._local_fs_client.write(
content, temp_local_file, mode
) # fleet hdfs_client only support upload, so write tmp file
self._hdfs_client.delete(dest_path + ".tmp")
self._hdfs_client.upload(dest_path + ".tmp", temp_local_file)
self._hdfs_client.delete(dest_path + ".bak")
......@@ -158,7 +159,8 @@ class FileHandler(object):
files = []
if is_afs_path(path):
files = self._hdfs_client.ls(path)
files = [path + '/' + self.get_file_name(fi) for fi in files] # absulte path
files = [path + '/' + self.get_file_name(fi)
for fi in files] # absulte path
else:
files = self._local_fs_client.ls(path)
files = [path + '/' + fi for fi in files] # absulte path
......
......@@ -22,6 +22,7 @@ from paddlerec.core.utils import fs as fs
def save_program_proto(path, program=None):
if program is None:
_program = fluid.default_main_program()
else:
......@@ -175,7 +176,8 @@ class PathGenerator(object):
"""
if template_name in self._templates:
if 'time_format' in param:
str = param['time_format'].strftime(self._templates[template_name])
str = param['time_format'].strftime(self._templates[
template_name])
return str.format(**param)
return self._templates[template_name].format(**param)
else:
......@@ -198,31 +200,39 @@ class TimeTrainPass(object):
self._begin_day = make_datetime(day_fields[0].strip())
if len(day_fields) == 1 or len(day_fields[1]) == 0:
# 100 years, meaning to continuous running
self._end_day = self._begin_day + datetime.timedelta(days=36500)
self._end_day = self._begin_day + datetime.timedelta(
days=36500)
else:
# example: 2020212+10
run_day = int(day_fields[1].strip())
self._end_day = self._begin_day + datetime.timedelta(days=run_day)
self._end_day = self._begin_day + datetime.timedelta(
days=run_day)
else:
# example: {20191001..20191031}
days = os.popen("echo -n " + self._config['days']).read().split(" ")
days = os.popen("echo -n " + self._config['days']).read().split(
" ")
self._begin_day = make_datetime(days[0])
self._end_day = make_datetime(days[len(days) - 1])
self._checkpoint_interval = self._config['checkpoint_interval']
self._dump_inference_interval = self._config['dump_inference_interval']
self._interval_per_pass = self._config['train_time_interval'] # train N min data per pass
self._interval_per_pass = self._config[
'train_time_interval'] # train N min data per pass
self._pass_id = 0
self._inference_pass_id = 0
self._pass_donefile_handler = None
if 'pass_donefile_name' in self._config:
self._train_pass_donefile = global_config['output_path'] + '/' + self._config['pass_donefile_name']
self._train_pass_donefile = global_config[
'output_path'] + '/' + self._config['pass_donefile_name']
if fs.is_afs_path(self._train_pass_donefile):
self._pass_donefile_handler = fs.FileHandler(global_config['io']['afs'])
self._pass_donefile_handler = fs.FileHandler(global_config[
'io']['afs'])
else:
self._pass_donefile_handler = fs.FileHandler(global_config['io']['local_fs'])
self._pass_donefile_handler = fs.FileHandler(global_config[
'io']['local_fs'])
last_done = self._pass_donefile_handler.cat(self._train_pass_donefile).strip().split('\n')[-1]
last_done = self._pass_donefile_handler.cat(
self._train_pass_donefile).strip().split('\n')[-1]
done_fileds = last_done.split('\t')
if len(done_fileds) > 4:
self._base_key = done_fileds[1]
......@@ -236,15 +246,18 @@ class TimeTrainPass(object):
"""
return 24 * 60 / self._interval_per_pass
def save_train_progress(self, day, pass_id, base_key, model_path, is_checkpoint):
def save_train_progress(self, day, pass_id, base_key, model_path,
is_checkpoint):
"""R
"""
if is_checkpoint:
self._checkpoint_pass_id = pass_id
self._checkpoint_model_path = model_path
done_content = "%s\t%s\t%s\t%s\t%d\n" % (day, base_key,
self._checkpoint_model_path, self._checkpoint_pass_id, pass_id)
self._pass_donefile_handler.write(done_content, self._train_pass_donefile, 'a')
done_content = "%s\t%s\t%s\t%s\t%d\n" % (
day, base_key, self._checkpoint_model_path,
self._checkpoint_pass_id, pass_id)
self._pass_donefile_handler.write(done_content,
self._train_pass_donefile, 'a')
pass
def init_pass_by_id(self, date_str, pass_id):
......@@ -286,12 +299,14 @@ class TimeTrainPass(object):
if self._pass_id < 1:
self.init_pass_by_time(self._begin_day.strftime("%Y%m%d%H%M"))
else:
next_time = self._current_train_time + datetime.timedelta(minutes=self._interval_per_pass)
next_time = self._current_train_time + datetime.timedelta(
minutes=self._interval_per_pass)
if (next_time - self._end_day).total_seconds() > 0:
has_next = False
else:
self.init_pass_by_time(next_time.strftime("%Y%m%d%H%M"))
if has_next and (self._inference_pass_id < self._pass_id or self._pass_id < old_pass_id):
if has_next and (self._inference_pass_id < self._pass_id or
self._pass_id < old_pass_id):
self._inference_pass_id = self._pass_id - 1
return has_next
......@@ -319,9 +334,11 @@ class TimeTrainPass(object):
Return:
date(current_train_time + delta_day)
"""
return (self._current_train_time + datetime.timedelta(days=delta_day)).strftime("%Y%m%d")
return (self._current_train_time + datetime.timedelta(days=delta_day)
).strftime("%Y%m%d")
def timestamp(self, delta_day=0):
"""R
"""
return (self._current_train_time + datetime.timedelta(days=delta_day)).timestamp()
return (self._current_train_time + datetime.timedelta(days=delta_day)
).timestamp()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# PaddleRec Benchmark
> 占位
\ No newline at end of file
> 占位
# PaddleRec 贡献代码
> 占位
\ No newline at end of file
> 占位
......@@ -279,4 +279,4 @@ class Metric(object):
pass
```
全局指标的计算及输出,需要分别继承并实现以上四个成员函数。具体实现的例子,可以参考[auc_metric.py](../core/metrics/auc_metrics.py)
\ No newline at end of file
全局指标的计算及输出,需要分别继承并实现以上四个成员函数。具体实现的例子,可以参考[auc_metric.py](../core/metrics/auc_metrics.py)
......@@ -7,5 +7,3 @@
### K8S集群运行分布式
> 占位
# 常见问题FAQ
> 占位
\ No newline at end of file
> 占位
# PaddleRec 单机训练
> 占位
\ No newline at end of file
> 占位
......@@ -12,4 +12,3 @@
| 多任务 | [ESMM]() | ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 | [DSSM]() | ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 | [Multiview-Simnet]() | ✓ | x | ✓ | x | ✓ | ✓ |
# PaddleRec 模型调参
> 占位
\ No newline at end of file
> 占位
# PaddleRec 离线预测
\ No newline at end of file
# PaddleRec 离线预测
......@@ -5,4 +5,3 @@
## [参数服务器训练](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -37,4 +37,3 @@ train:
dirname: "inference"
epoch_interval: 100
save_last: True
......@@ -31,7 +31,8 @@ class Model(ModelBase):
def train_net(self):
""" network definition """
data = fluid.data(name="input", shape=[None, self.max_len], dtype='int64')
data = fluid.data(
name="input", shape=[None, self.max_len], dtype='int64')
label = fluid.data(name="label", shape=[None, 1], dtype='int64')
seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
......@@ -51,7 +52,9 @@ class Model(ModelBase):
# full connect layer
fc_1 = fluid.layers.fc(input=[conv], size=self.hid_dim)
# softmax layer
prediction = fluid.layers.fc(input=[fc_1], size=self.class_dim, act="softmax")
prediction = fluid.layers.fc(input=[fc_1],
size=self.class_dim,
act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from paddlerec.core.reader import Reader
......@@ -38,7 +37,8 @@ class TrainReader(Reader):
data = [int(i) for i in data]
label = [int(i) for i in label]
seq_len = [int(i) for i in seq_len]
print >> sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)])
print >> sys.stderr, str(
[('data', data), ('label', label), ('seq_len', seq_len)])
yield [('data', data), ('label', label), ('seq_len', seq_len)]
return data_iter
......@@ -87,4 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
| :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: |
| ag news dataset | TagSpace | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- |
......@@ -47,4 +47,3 @@ train:
dirname: "inference"
epoch_interval: 100
save_last: True
......@@ -26,8 +26,10 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
self.cost = None
self.metrics = {}
self.vocab_text_size = envs.get_global_env("vocab_text_size", None, self._namespace)
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, self._namespace)
self.vocab_text_size = envs.get_global_env("vocab_text_size", None,
self._namespace)
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None,
self._namespace)
self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace)
self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace)
self.win_size = envs.get_global_env("win_size", None, self._namespace)
......@@ -35,8 +37,9 @@ class Model(ModelBase):
self.neg_size = envs.get_global_env("neg_size", None, self._namespace)
def train_net(self):
""" network definition """
text = fluid.data(name="text", shape=[None, 1], lod_level=1, dtype='int64')
""" network"""
text = fluid.data(
name="text", shape=[None, 1], lod_level=1, dtype='int64')
pos_tag = fluid.data(
name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64')
neg_tag = fluid.data(
......@@ -45,13 +48,19 @@ class Model(ModelBase):
self._data_var = [text, pos_tag, neg_tag]
text_emb = fluid.embedding(
input=text, size=[self.vocab_text_size, self.emb_dim], param_attr="text_emb")
input=text,
size=[self.vocab_text_size, self.emb_dim],
param_attr="text_emb")
text_emb = fluid.layers.squeeze(input=text_emb, axes=[1])
pos_tag_emb = fluid.embedding(
input=pos_tag, size=[self.vocab_tag_size, self.emb_dim], param_attr="tag_emb")
input=pos_tag,
size=[self.vocab_tag_size, self.emb_dim],
param_attr="tag_emb")
pos_tag_emb = fluid.layers.squeeze(input=pos_tag_emb, axes=[1])
neg_tag_emb = fluid.embedding(
input=neg_tag, size=[self.vocab_tag_size, self.emb_dim], param_attr="tag_emb")
input=neg_tag,
size=[self.vocab_tag_size, self.emb_dim],
param_attr="tag_emb")
neg_tag_emb = fluid.layers.squeeze(input=neg_tag_emb, axes=[1])
conv_1d = fluid.nets.sequence_conv_pool(
......@@ -65,7 +74,8 @@ class Model(ModelBase):
size=self.emb_dim,
param_attr="text_hid")
cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb)
mul_text_hid = fluid.layers.sequence_expand_as(
x=text_hid, y=neg_tag_emb)
mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid)
cos_neg_all = fluid.layers.sequence_reshape(
input=mul_cos_neg, new_dim=self.neg_size)
......@@ -74,7 +84,10 @@ class Model(ModelBase):
#calculate hinge loss
loss_part1 = nn.elementwise_sub(
tensor.fill_constant_batch_size_like(
input=cos_pos, shape=[-1, 1], value=self.margin, dtype='float32'),
input=cos_pos,
shape=[-1, 1],
value=self.margin,
dtype='float32'),
cos_pos)
loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
loss_part3 = nn.elementwise_max(
......@@ -85,7 +98,7 @@ class Model(ModelBase):
less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
correct = nn.reduce_sum(less)
self.cost = avg_cost
self.metrics["correct"] = correct
self.metrics["cos_pos"] = cos_pos
......@@ -96,7 +109,8 @@ class Model(ModelBase):
return self.metrics
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.base_lr", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.base_lr", None,
self._namespace)
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
return sgd_optimizer
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import numpy as np
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -23,13 +23,26 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def input(self):
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace)
Neg = envs.get_global_env("hyper_parameters.NEG", None, self._namespace)
self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i
in range(Neg)]
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
self._namespace)
Neg = envs.get_global_env("hyper_parameters.NEG", None,
self._namespace)
self.query = fluid.data(
name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(
name="doc_pos",
shape=[-1, TRIGRAM_D],
dtype='float32',
lod_level=0)
self.doc_negs = [
fluid.data(
name="doc_neg_" + str(i),
shape=[-1, TRIGRAM_D],
dtype="float32",
lod_level=0) for i in range(Neg)
]
self._data_var.append(self.query)
self._data_var.append(self.doc_pos)
for input in self.doc_negs:
......@@ -37,16 +50,24 @@ class Model(ModelBase):
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def net(self, is_infer=False):
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None, self._namespace)
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
self._namespace)
hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None,
self._namespace)
def fc(data, hidden_layers, hidden_acts, names):
fc_inputs = [data]
for i in range(len(hidden_layers)):
xavier = fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i])
xavier = fluid.initializer.Xavier(
uniform=True,
fan_in=fc_inputs[-1].shape[1],
fan_out=hidden_layers[i])
out = fluid.layers.fc(input=fc_inputs[-1],
size=hidden_layers[i],
act=hidden_acts[i],
......@@ -56,8 +77,10 @@ class Model(ModelBase):
fc_inputs.append(out)
return fc_inputs[-1]
query_fc = fc(self.query, hidden_layers, hidden_acts, ['query_l1', 'query_l2', 'query_l3'])
doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts, ['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
query_fc = fc(self.query, hidden_layers, hidden_acts,
['query_l1', 'query_l2', 'query_l3'])
doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts,
['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)
if is_infer:
......@@ -65,13 +88,17 @@ class Model(ModelBase):
R_Q_D_ns = []
for i, doc_neg in enumerate(self.doc_negs):
doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts,
['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)])
doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, [
'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
'doc_neg_l3_' + str(i)
])
R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
concat_Rs = fluid.layers.concat(input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
concat_Rs = fluid.layers.concat(
input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
hit_prob = fluid.layers.slice(
prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
self.avg_cost = fluid.layers.mean(x=loss)
......@@ -91,18 +118,28 @@ class Model(ModelBase):
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.SGD(learning_rate)
return optimizer
def infer_input(self):
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace)
self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
self._namespace)
self.query = fluid.data(
name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(
name="doc_pos",
shape=[-1, TRIGRAM_D],
dtype='float32',
lod_level=0)
self._infer_data_var = [self.query, self.doc_pos]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
......
......@@ -22,4 +22,3 @@ mkdir -p data/train
mkdir -p data/test
python generate_synthetic_data.py
......@@ -18,8 +18,10 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
def init(self):
self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model")
self.query_slots = envs.get_global_env("hyper_parameters.query_slots",
None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots",
None, "train.model")
self.all_slots = []
for i in range(self.query_slots):
......
......@@ -21,7 +21,11 @@ class Dataset:
class SyntheticDataset(Dataset):
def __init__(self, sparse_feature_dim, query_slot_num, title_slot_num, dataset_size=10000):
def __init__(self,
sparse_feature_dim,
query_slot_num,
title_slot_num,
dataset_size=10000):
# ids are randomly generated
self.ids_per_slot = 10
self.sparse_feature_dim = sparse_feature_dim
......@@ -46,14 +50,20 @@ class SyntheticDataset(Dataset):
for i in range(self.title_slot_num):
pt_slot = generate_ids(self.ids_per_slot,
self.sparse_feature_dim)
pt_slot = [str(fea) + ':' + str(i + self.query_slot_num) for fea in pt_slot]
pt_slot = [
str(fea) + ':' + str(i + self.query_slot_num)
for fea in pt_slot
]
pos_title_slots += pt_slot
if is_train:
for i in range(self.title_slot_num):
nt_slot = generate_ids(self.ids_per_slot,
self.sparse_feature_dim)
nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in
nt_slot]
nt_slot = [
str(fea) + ':' +
str(i + self.query_slot_num + self.title_slot_num)
for fea in nt_slot
]
neg_title_slots += nt_slot
yield query_slots + pos_title_slots + neg_title_slots
else:
......@@ -76,7 +86,8 @@ if __name__ == '__main__':
query_slots = 1
title_slots = 1
dataset_size = 10
dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots, dataset_size)
dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots,
dataset_size)
train_reader = dataset.train()
test_reader = dataset.test()
......
......@@ -103,12 +103,18 @@ class Model(ModelBase):
def init_config(self):
self._fetch_interval = 1
query_encoder = envs.get_global_env("hyper_parameters.query_encoder", None, self._namespace)
title_encoder = envs.get_global_env("hyper_parameters.title_encoder", None, self._namespace)
query_encode_dim = envs.get_global_env("hyper_parameters.query_encode_dim", None, self._namespace)
title_encode_dim = envs.get_global_env("hyper_parameters.title_encode_dim", None, self._namespace)
query_slots = envs.get_global_env("hyper_parameters.query_slots", None, self._namespace)
title_slots = envs.get_global_env("hyper_parameters.title_slots", None, self._namespace)
query_encoder = envs.get_global_env("hyper_parameters.query_encoder",
None, self._namespace)
title_encoder = envs.get_global_env("hyper_parameters.title_encoder",
None, self._namespace)
query_encode_dim = envs.get_global_env(
"hyper_parameters.query_encode_dim", None, self._namespace)
title_encode_dim = envs.get_global_env(
"hyper_parameters.title_encode_dim", None, self._namespace)
query_slots = envs.get_global_env("hyper_parameters.query_slots", None,
self._namespace)
title_slots = envs.get_global_env("hyper_parameters.title_slots", None,
self._namespace)
factory = SimpleEncoderFactory()
self.query_encoders = [
factory.create(query_encoder, query_encode_dim)
......@@ -119,10 +125,13 @@ class Model(ModelBase):
for i in range(title_slots)
]
self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim", None, self._namespace)
self.emb_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim",
None, self._namespace)
self.emb_shape = [self.emb_size, self.emb_dim]
self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace)
self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size",
None, self._namespace)
self.margin = 0.1
def input(self, is_train=True):
......@@ -133,8 +142,10 @@ class Model(ModelBase):
]
self.pt_slots = [
fluid.data(
name="%d" % (i + len(self.query_encoders)), shape=[None, 1], lod_level=1, dtype='int64')
for i in range(len(self.title_encoders))
name="%d" % (i + len(self.query_encoders)),
shape=[None, 1],
lod_level=1,
dtype='int64') for i in range(len(self.title_encoders))
]
if is_train == False:
......@@ -142,9 +153,11 @@ class Model(ModelBase):
self.nt_slots = [
fluid.data(
name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1,
dtype='int64')
for i in range(len(self.title_encoders))
name="%d" %
(i + len(self.query_encoders) + len(self.title_encoders)),
shape=[None, 1],
lod_level=1,
dtype='int64') for i in range(len(self.title_encoders))
]
return self.q_slots + self.pt_slots + self.nt_slots
......@@ -153,11 +166,15 @@ class Model(ModelBase):
res = self.input()
self._data_var = res
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace)
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
False, self._namespace)
if self._platform != "LINUX" or use_dataloader:
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=256, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=256,
use_double_buffer=False,
iterable=False)
def get_acc(self, x, y):
less = tensor.cast(cf.less_than(x, y), dtype='float32')
......@@ -190,10 +207,12 @@ class Model(ModelBase):
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
]
pt_encodes = [
self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
]
nt_encodes = [
self.title_encoders[i].forward(emb) for i, emb in enumerate(nt_embs)
self.title_encoders[i].forward(emb)
for i, emb in enumerate(nt_embs)
]
# concat multi view for query, pos_title, neg_title
......@@ -252,7 +271,8 @@ class Model(ModelBase):
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
return optimizer
......@@ -261,7 +281,10 @@ class Model(ModelBase):
self._infer_data_var = res
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
......@@ -281,7 +304,8 @@ class Model(ModelBase):
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
]
pt_encodes = [
self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
]
# concat multi view for query, pos_title, neg_title
q_concat = fluid.layers.concat(q_encodes)
......
......@@ -18,8 +18,10 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model")
self.query_slots = envs.get_global_env("hyper_parameters.query_slots",
None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots",
None, "train.model")
self.all_slots = []
for i in range(self.query_slots):
......
......@@ -37,4 +37,3 @@
python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -20,9 +20,11 @@ from paddlerec.core.reader import Reader
class EvaluateReader(Reader):
def init(self):
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128',
'129',
'205', '206', '207', '210', '216', '508', '509', '702', '853', '301']
all_field_id = [
'101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
'125', '126', '127', '128', '129', '205', '206', '207', '210',
'216', '508', '509', '702', '853', '301'
]
self.all_field_id_dict = defaultdict(int)
for i, field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False, i]
......
......@@ -21,9 +21,11 @@ from paddlerec.core.reader import Reader
class TrainReader(Reader):
def init(self):
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128',
'129',
'205', '206', '207', '210', '216', '508', '509', '702', '853', '301']
all_field_id = [
'101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
'125', '126', '127', '128', '129', '205', '206', '207', '210',
'216', '508', '509', '702', '853', '301'
]
self.all_field_id_dict = defaultdict(int)
for i, field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False, i]
......
......@@ -28,11 +28,13 @@ class Model(ModelBase):
init_stddev = 1.0
scales = 1.0 / np.sqrt(data.shape[1])
p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag,
initializer=fluid.initializer.NormalInitializer(loc=0.0,
scale=init_stddev * scales))
p_attr = fluid.param_attr.ParamAttr(
name='%s_weight' % tag,
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=init_stddev * scales))
b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
b_attr = fluid.ParamAttr(
name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
out = fluid.layers.fc(input=data,
size=out_dim,
......@@ -44,7 +46,11 @@ class Model(ModelBase):
def input_data(self):
sparse_input_ids = [
fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1) for i in range(0, 23)
fluid.data(
name="field_" + str(i),
shape=[-1, 1],
dtype="int64",
lod_level=1) for i in range(0, 23)
]
label_ctr = fluid.data(name="ctr", shape=[-1, 1], dtype="int64")
label_cvr = fluid.data(name="cvr", shape=[-1, 1], dtype="int64")
......@@ -55,19 +61,23 @@ class Model(ModelBase):
def net(self, inputs, is_infer=False):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
embed_size = envs.get_global_env("hyper_parameters.embed_size", None, self._namespace)
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
self._namespace)
embed_size = envs.get_global_env("hyper_parameters.embed_size", None,
self._namespace)
emb = []
for data in inputs[0:-2]:
feat_emb = fluid.embedding(input=data,
size=[vocab_size, embed_size],
param_attr=fluid.ParamAttr(name='dis_emb',
learning_rate=5,
initializer=fluid.initializer.Xavier(
fan_in=embed_size, fan_out=embed_size)
),
is_sparse=True)
field_emb = fluid.layers.sequence_pool(input=feat_emb, pool_type='sum')
feat_emb = fluid.embedding(
input=data,
size=[vocab_size, embed_size],
param_attr=fluid.ParamAttr(
name='dis_emb',
learning_rate=5,
initializer=fluid.initializer.Xavier(
fan_in=embed_size, fan_out=embed_size)),
is_sparse=True)
field_emb = fluid.layers.sequence_pool(
input=feat_emb, pool_type='sum')
emb.append(field_emb)
concat_emb = fluid.layers.concat(emb, axis=1)
......@@ -85,14 +95,20 @@ class Model(ModelBase):
ctr_clk = inputs[-2]
ctcvr_buy = inputs[-1]
ctr_prop_one = fluid.layers.slice(ctr_out, axes=[1], starts=[1], ends=[2])
cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2])
ctr_prop_one = fluid.layers.slice(
ctr_out, axes=[1], starts=[1], ends=[2])
cvr_prop_one = fluid.layers.slice(
cvr_out, axes=[1], starts=[1], ends=[2])
ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one)
ctcvr_prop = fluid.layers.concat(input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)
ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one,
cvr_prop_one)
ctcvr_prop = fluid.layers.concat(
input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)
auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk)
auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy)
auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(
input=ctr_out, label=ctr_clk)
auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(
input=ctcvr_prop, label=ctcvr_buy)
if is_infer:
self._infer_results["AUC_ctr"] = auc_ctr
......@@ -100,7 +116,8 @@ class Model(ModelBase):
return
loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk)
loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
loss_ctcvr = fluid.layers.cross_entropy(
input=ctcvr_prop, label=ctcvr_buy)
cost = loss_ctr + loss_ctcvr
avg_cost = fluid.layers.mean(cost)
......@@ -117,5 +134,8 @@ class Model(ModelBase):
def infer_net(self):
self._infer_data_var = self.input_data()
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self.net(self._infer_data_var, is_infer=True)
......@@ -19,6 +19,7 @@ from paddlerec.core.reader import Reader
class EvaluateReader(Reader):
def init(self):
pass
def generate_sample(self, line):
......
......@@ -24,6 +24,7 @@ class TrainReader(Reader):
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
"""
def reader():
......
......@@ -23,44 +23,58 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def MMOE(self, is_infer=False):
feature_size = envs.get_global_env("hyper_parameters.feature_size", None, self._namespace)
expert_num = envs.get_global_env("hyper_parameters.expert_num", None, self._namespace)
gate_num = envs.get_global_env("hyper_parameters.gate_num", None, self._namespace)
expert_size = envs.get_global_env("hyper_parameters.expert_size", None, self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None, self._namespace)
input_data = fluid.data(name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
feature_size = envs.get_global_env("hyper_parameters.feature_size",
None, self._namespace)
expert_num = envs.get_global_env("hyper_parameters.expert_num", None,
self._namespace)
gate_num = envs.get_global_env("hyper_parameters.gate_num", None,
self._namespace)
expert_size = envs.get_global_env("hyper_parameters.expert_size", None,
self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None,
self._namespace)
input_data = fluid.data(
name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(
name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(
name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
if is_infer:
self._infer_data_var = [input_data, label_income, label_marital]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self._data_var.extend([input_data, label_income, label_marital])
# f_{i}(x) = activation(W_{i} * x + b), where activation is ReLU according to the paper
expert_outputs = []
for i in range(0, expert_num):
expert_output = fluid.layers.fc(input=input_data,
size=expert_size,
act='relu',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='expert_' + str(i))
expert_output = fluid.layers.fc(
input=input_data,
size=expert_size,
act='relu',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='expert_' + str(i))
expert_outputs.append(expert_output)
expert_concat = fluid.layers.concat(expert_outputs, axis=1)
expert_concat = fluid.layers.reshape(expert_concat, [-1, expert_num, expert_size])
expert_concat = fluid.layers.reshape(expert_concat,
[-1, expert_num, expert_size])
# g^{k}(x) = activation(W_{gk} * x + b), where activation is softmax according to the paper
output_layers = []
for i in range(0, gate_num):
cur_gate = fluid.layers.fc(input=input_data,
size=expert_num,
act='softmax',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='gate_' + str(i))
cur_gate = fluid.layers.fc(
input=input_data,
size=expert_num,
act='softmax',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='gate_' + str(i))
# f^{k}(x) = sum_{i=1}^{n}(g^{k}(x)_{i} * f_{i}(x))
cur_gate_expert = fluid.layers.elementwise_mul(expert_concat, cur_gate, axis=0)
cur_gate_expert = fluid.layers.elementwise_mul(
expert_concat, cur_gate, axis=0)
cur_gate_expert = fluid.layers.reduce_sum(cur_gate_expert, dim=1)
# Build tower layer
cur_tower = fluid.layers.fc(input=cur_gate_expert,
......@@ -74,25 +88,33 @@ class Model(ModelBase):
output_layers.append(out)
pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income,
label=fluid.layers.cast(x=label_income_1,
dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
label=fluid.layers.cast(x=label_marital_1,
dtype='int64'))
pred_income = fluid.layers.clip(
output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(
output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(
label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(
label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
input=pred_income,
label=fluid.layers.cast(
x=label_income_1, dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
input=pred_marital,
label=fluid.layers.cast(
x=label_marital_1, dtype='int64'))
if is_infer:
self._infer_results["AUC_income"] = auc_income
self._infer_results["AUC_marital"] = auc_marital
return
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)
cost_income = fluid.layers.cross_entropy(
input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(
input=pred_marital, label=label_marital, soft_label=True)
avg_cost_income = fluid.layers.mean(x=cost_income)
avg_cost_marital = fluid.layers.mean(x=cost_marital)
......
......@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
| Census-income Data | Share-Bottom | -- | 0.93120/0.99256 |
| Census-income Data | MMoE | -- | 0.94465/0.99324 |
| Ali-CCP | ESMM | -- | 0.97181/0.49967 |
......@@ -24,27 +24,38 @@ class Model(ModelBase):
def model(self, is_infer=False):
feature_size = envs.get_global_env("hyper_parameters.feature_size", None, self._namespace)
bottom_size = envs.get_global_env("hyper_parameters.bottom_size", None, self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None, self._namespace)
tower_nums = envs.get_global_env("hyper_parameters.tower_nums", None, self._namespace)
input_data = fluid.data(name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
feature_size = envs.get_global_env("hyper_parameters.feature_size",
None, self._namespace)
bottom_size = envs.get_global_env("hyper_parameters.bottom_size", None,
self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None,
self._namespace)
tower_nums = envs.get_global_env("hyper_parameters.tower_nums", None,
self._namespace)
input_data = fluid.data(
name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(
name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(
name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
if is_infer:
self._infer_data_var = [input_data, label_income, label_marital]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self._data_var.extend([input_data, label_income, label_marital])
bottom_output = fluid.layers.fc(input=input_data,
size=bottom_size,
act='relu',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='bottom_output')
bottom_output = fluid.layers.fc(
input=input_data,
size=bottom_size,
act='relu',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='bottom_output')
# Build tower layer from bottom layer
output_layers = []
......@@ -59,26 +70,34 @@ class Model(ModelBase):
name='output_layer_' + str(index))
output_layers.append(output_layer)
pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income,
label=fluid.layers.cast(x=label_income_1,
dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
label=fluid.layers.cast(x=label_marital_1,
dtype='int64'))
pred_income = fluid.layers.clip(
output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(
output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(
label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(
label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
input=pred_income,
label=fluid.layers.cast(
x=label_income_1, dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
input=pred_marital,
label=fluid.layers.cast(
x=label_marital_1, dtype='int64'))
if is_infer:
self._infer_results["AUC_income"] = auc_income
self._infer_results["AUC_marital"] = auc_marital
return
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)
cost_income = fluid.layers.cross_entropy(
input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(
input=pred_marital, label=label_marital, soft_label=True)
cost = fluid.layers.elementwise_add(cost_income, cost_marital, axis=1)
avg_cost = fluid.layers.mean(x=cost)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import io
......
......@@ -26,8 +26,8 @@ from collections import Counter
import os
import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator):
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
......@@ -83,11 +83,11 @@ class TrainReader(dg.MultiSlotDataGenerator):
if idx == 2 else math.log(1 + float(features[idx])))
for idx in self.cat_idx_:
if features[idx] == '' or features[
idx] not in self.cat_feat_idx_dict_list[idx - 14]:
idx] not in self.cat_feat_idx_dict_list[idx - 14]:
label_feat_list[idx].append(0)
else:
label_feat_list[idx].append(self.cat_feat_idx_dict_list[
idx - 14][features[idx]])
idx - 14][features[idx]])
label_feat_list[0].append(int(features[0]))
return label_feat_list
......@@ -109,6 +109,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, absolute_import, division
import os
......
......@@ -25,12 +25,18 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def init_network(self):
self.cross_num = envs.get_global_env("hyper_parameters.cross_num", None, self._namespace)
self.dnn_hidden_units = envs.get_global_env("hyper_parameters.dnn_hidden_units", None, self._namespace)
self.l2_reg_cross = envs.get_global_env("hyper_parameters.l2_reg_cross", None, self._namespace)
self.dnn_use_bn = envs.get_global_env("hyper_parameters.dnn_use_bn", None, self._namespace)
self.clip_by_norm = envs.get_global_env("hyper_parameters.clip_by_norm", None, self._namespace)
cat_feat_num = envs.get_global_env("hyper_parameters.cat_feat_num", None, self._namespace)
self.cross_num = envs.get_global_env("hyper_parameters.cross_num",
None, self._namespace)
self.dnn_hidden_units = envs.get_global_env(
"hyper_parameters.dnn_hidden_units", None, self._namespace)
self.l2_reg_cross = envs.get_global_env(
"hyper_parameters.l2_reg_cross", None, self._namespace)
self.dnn_use_bn = envs.get_global_env("hyper_parameters.dnn_use_bn",
None, self._namespace)
self.clip_by_norm = envs.get_global_env(
"hyper_parameters.clip_by_norm", None, self._namespace)
cat_feat_num = envs.get_global_env("hyper_parameters.cat_feat_num",
None, self._namespace)
self.sparse_inputs = self._sparse_data_var[1:]
self.dense_inputs = self._dense_data_var
......@@ -43,7 +49,8 @@ class Model(ModelBase):
cat_feat_dims_dict[spls[0]] = int(spls[1])
self.cat_feat_dims_dict = cat_feat_dims_dict if cat_feat_dims_dict else OrderedDict(
)
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", None, self._namespace)
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
None, self._namespace)
self.dense_feat_names = [i.name for i in self.dense_inputs]
self.sparse_feat_names = [i.name for i in self.sparse_inputs]
......@@ -55,16 +62,19 @@ class Model(ModelBase):
self.net_input = None
self.loss = None
def _create_embedding_input(self):
# sparse embedding
sparse_emb_dict = OrderedDict()
for var in self.sparse_inputs:
sparse_emb_dict[var.name] = fluid.embedding(input=var,
size=[self.feat_dims_dict[var.name] + 1,
6 * int(pow(self.feat_dims_dict[var.name], 0.25))
],is_sparse=self.is_sparse)
sparse_emb_dict[var.name] = fluid.embedding(
input=var,
size=[
self.feat_dims_dict[var.name] + 1,
6 * int(pow(self.feat_dims_dict[var.name], 0.25))
],
is_sparse=self.is_sparse)
# combine dense and sparse_emb
dense_input_list = self.dense_inputs
sparse_emb_list = list(sparse_emb_dict.values())
......@@ -114,10 +124,11 @@ class Model(ModelBase):
def train_net(self):
self.model._init_slots()
self.init_network()
self.net_input = self._create_embedding_input()
deep_out = self._deep_net(self.net_input, self.dnn_hidden_units, self.dnn_use_bn, False)
deep_out = self._deep_net(self.net_input, self.dnn_hidden_units,
self.dnn_use_bn, False)
cross_out, l2_reg_cross_loss = self._cross_net(self.net_input,
self.cross_num)
......@@ -134,9 +145,11 @@ class Model(ModelBase):
input=prob_2d, label=label_int, slide_steps=0)
self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc_var
# logloss
logloss = fluid.layers.log_loss(self.prob, fluid.layers.cast(self.target_input, dtype='float32'))
logloss = fluid.layers.log_loss(
self.prob, fluid.layers.cast(
self.target_input, dtype='float32'))
self.avg_logloss = fluid.layers.reduce_mean(logloss)
# reg_coeff * l2_reg_cross
......@@ -145,7 +158,8 @@ class Model(ModelBase):
self._cost = self.loss
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import sys
......
......@@ -19,8 +19,9 @@ try:
import cPickle as pickle
except ImportError:
import pickle
class TrainReader(dg.MultiSlotDataGenerator):
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
......@@ -44,7 +45,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
self.categorical_range_ = range(14, 40)
# load preprocessed feature dict
self.feat_dict_name = "aid_data/feat_dict_10.pkl2"
self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
def _process_line(self, line):
features = line.rstrip('\n').split('\t')
......@@ -77,15 +78,18 @@ class TrainReader(dg.MultiSlotDataGenerator):
def data_iter():
feat_idx, feat_value, label = self._process_line(line)
s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]:
for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
s += " " + k + ":" + str(j)
print s.strip()
yield None
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy
from collections import Counter
......
......@@ -27,21 +27,26 @@ class Model(ModelBase):
def deepfm_net(self):
init_value_ = 0.1
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
# ------------------------- network input --------------------------
num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace)
num_field = envs.get_global_env("hyper_parameters.num_field", None,
self._namespace)
raw_feat_idx = self._sparse_data_var[1]
raw_feat_value = self._dense_data_var[0]
self.label = self._sparse_data_var[0]
feat_idx = raw_feat_idx
feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
reg = envs.get_global_env("hyper_parameters.reg", 1e-4, self._namespace)
feat_value = fluid.layers.reshape(
raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
reg = envs.get_global_env("hyper_parameters.reg", 1e-4,
self._namespace)
first_weights_re = fluid.embedding(
input=feat_idx,
is_sparse=True,
......@@ -55,7 +60,8 @@ class Model(ModelBase):
regularizer=fluid.regularizer.L1DecayRegularizer(reg)))
first_weights = fluid.layers.reshape(
first_weights_re, shape=[-1, num_field, 1]) # None * num_field * 1
y_first_order = fluid.layers.reduce_sum((first_weights * feat_value), 1)
y_first_order = fluid.layers.reduce_sum((first_weights * feat_value),
1)
# ------------------------- second order term --------------------------
......@@ -68,7 +74,8 @@ class Model(ModelBase):
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormalInitializer(
loc=0.0, scale=init_value_ / math.sqrt(float(sparse_feature_dim)))))
loc=0.0,
scale=init_value_ / math.sqrt(float(sparse_feature_dim)))))
feat_embeddings = fluid.layers.reshape(
feat_embeddings_re,
shape=[-1, num_field,
......@@ -76,8 +83,8 @@ class Model(ModelBase):
feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size
# sum_square part
summed_features_emb = fluid.layers.reduce_sum(feat_embeddings,
1) # None * embedding_size
summed_features_emb = fluid.layers.reduce_sum(
feat_embeddings, 1) # None * embedding_size
summed_features_emb_square = fluid.layers.square(
summed_features_emb) # None * embedding_size
......@@ -88,13 +95,16 @@ class Model(ModelBase):
squared_features_emb, 1) # None * embedding_size
y_second_order = 0.5 * fluid.layers.reduce_sum(
summed_features_emb_square - squared_sum_features_emb, 1,
summed_features_emb_square - squared_sum_features_emb,
1,
keep_dim=True) # None * 1
# ------------------------- DNN --------------------------
layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
act = envs.get_global_env("hyper_parameters.act", None, self._namespace)
layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None,
self._namespace)
act = envs.get_global_env("hyper_parameters.act", None,
self._namespace)
y_dnn = fluid.layers.reshape(feat_embeddings,
[-1, num_field * sparse_feature_dim])
for s in layer_sizes:
......@@ -121,7 +131,8 @@ class Model(ModelBase):
# ------------------------- DeepFM --------------------------
self.predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn)
self.predict = fluid.layers.sigmoid(y_first_order + y_second_order +
y_dnn)
def train_net(self):
self.model._init_slots()
......@@ -129,7 +140,8 @@ class Model(ModelBase):
# ------------------------- Cost(logloss) --------------------------
cost = fluid.layers.log_loss(input=self.predict, label=fluid.layers.cast(self.label, "float32"))
cost = fluid.layers.log_loss(
input=self.predict, label=fluid.layers.cast(self.label, "float32"))
avg_cost = fluid.layers.reduce_sum(cost)
self._cost = avg_cost
......@@ -145,7 +157,8 @@ class Model(ModelBase):
self._metrics["BATCH_AUC"] = batch_auc_var
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import random
import pickle
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import pickle
import pandas as pd
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import random
import pickle
......
......@@ -21,14 +21,14 @@ from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def config_read(self, config_path):
with open(config_path, "r") as fin:
user_count = int(fin.readline().strip())
item_count = int(fin.readline().strip())
cat_count = int(fin.readline().strip())
return user_count, item_count, cat_count
def din_attention(self, hist, target_expand, mask):
"""activation weight"""
......@@ -58,56 +58,66 @@ class Model(ModelBase):
out = fluid.layers.matmul(weight, hist)
out = fluid.layers.reshape(x=out, shape=[0, hidden_size])
return out
def train_net(self):
seq_len = -1
self.item_emb_size = envs.get_global_env("hyper_parameters.item_emb_size", 64, self._namespace)
self.cat_emb_size = envs.get_global_env("hyper_parameters.cat_emb_size", 64, self._namespace)
self.act = envs.get_global_env("hyper_parameters.act", "sigmoid", self._namespace)
self.item_emb_size = envs.get_global_env(
"hyper_parameters.item_emb_size", 64, self._namespace)
self.cat_emb_size = envs.get_global_env(
"hyper_parameters.cat_emb_size", 64, self._namespace)
self.act = envs.get_global_env("hyper_parameters.act", "sigmoid",
self._namespace)
#item_emb_size = 64
#cat_emb_size = 64
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", False, self._namespace)
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
False, self._namespace)
#significant for speeding up the training process
self.config_path = envs.get_global_env("hyper_parameters.config_path", "data/config.txt", self._namespace)
self.use_DataLoader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace)
self.config_path = envs.get_global_env(
"hyper_parameters.config_path", "data/config.txt", self._namespace)
self.use_DataLoader = envs.get_global_env(
"hyper_parameters.use_DataLoader", False, self._namespace)
user_count, item_count, cat_count = self.config_read(self.config_path)
item_emb_attr = fluid.ParamAttr(name="item_emb")
cat_emb_attr = fluid.ParamAttr(name="cat_emb")
hist_item_seq = fluid.data(
name="hist_item_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(hist_item_seq)
hist_cat_seq = fluid.data(
name="hist_cat_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(hist_cat_seq)
target_item = fluid.data(name="target_item", shape=[None], dtype="int64")
target_item = fluid.data(
name="target_item", shape=[None], dtype="int64")
self._data_var.append(target_item)
target_cat = fluid.data(name="target_cat", shape=[None], dtype="int64")
self._data_var.append(target_cat)
label = fluid.data(name="label", shape=[None, 1], dtype="float32")
self._data_var.append(label)
mask = fluid.data(name="mask", shape=[None, seq_len, 1], dtype="float32")
mask = fluid.data(
name="mask", shape=[None, seq_len, 1], dtype="float32")
self._data_var.append(mask)
target_item_seq = fluid.data(
name="target_item_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(target_item_seq)
target_cat_seq = fluid.data(
name="target_cat_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(target_cat_seq)
if self.use_DataLoader:
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=10000, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=10000,
use_double_buffer=False,
iterable=False)
hist_item_emb = fluid.embedding(
input=hist_item_seq,
size=[item_count, self.item_emb_size],
......@@ -149,7 +159,8 @@ class Model(ModelBase):
size=[item_count, 1],
param_attr=fluid.initializer.Constant(value=0.0))
hist_seq_concat = fluid.layers.concat([hist_item_emb, hist_cat_emb], axis=2)
hist_seq_concat = fluid.layers.concat(
[hist_item_emb, hist_cat_emb], axis=2)
target_seq_concat = fluid.layers.concat(
[target_item_seq_emb, target_cat_seq_emb], axis=2)
target_concat = fluid.layers.concat(
......@@ -157,21 +168,22 @@ class Model(ModelBase):
out = self.din_attention(hist_seq_concat, target_seq_concat, mask)
out_fc = fluid.layers.fc(name="out_fc",
input=out,
size=self.item_emb_size + self.cat_emb_size,
num_flatten_dims=1)
input=out,
size=self.item_emb_size + self.cat_emb_size,
num_flatten_dims=1)
embedding_concat = fluid.layers.concat([out_fc, target_concat], axis=1)
fc1 = fluid.layers.fc(name="fc1",
input=embedding_concat,
size=80,
act=self.act)
input=embedding_concat,
size=80,
act=self.act)
fc2 = fluid.layers.fc(name="fc2", input=fc1, size=40, act=self.act)
fc3 = fluid.layers.fc(name="fc3", input=fc2, size=1)
logit = fc3 + item_b
loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logit, label=label)
loss = fluid.layers.sigmoid_cross_entropy_with_logits(
x=logit, label=label)
avg_loss = fluid.layers.mean(loss)
self._cost = avg_loss
......@@ -179,14 +191,14 @@ class Model(ModelBase):
predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
label_int = fluid.layers.cast(label, 'int64')
auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d,
label=label_int,
slide_steps=0)
label=label_int,
slide_steps=0)
self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc_var
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
......
......@@ -29,13 +29,15 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.train_data_path = envs.get_global_env("train_data_path", None, "train.reader")
self.train_data_path = envs.get_global_env("train_data_path", None,
"train.reader")
self.res = []
self.max_len = 0
data_file_list = os.listdir(self.train_data_path)
for i in range(0, len(data_file_list)):
train_data_file = os.path.join(self.train_data_path, data_file_list[i])
train_data_file = os.path.join(self.train_data_path,
data_file_list[i])
with open(train_data_file, "r") as fin:
for line in fin:
line = line.strip().split(';')
......@@ -78,11 +80,13 @@ class TrainReader(Reader):
len_array = [len(x[0]) for x in b]
mask = np.array(
[[0] * x + [-1e9] * (max_len - x) for x in len_array]).reshape(
[-1, max_len, 1])
[-1, max_len, 1])
target_item_seq = np.array(
[[x[2]] * max_len for x in b]).astype("int64").reshape([-1, max_len])
[[x[2]] * max_len for x in b]).astype("int64").reshape(
[-1, max_len])
target_cat_seq = np.array(
[[x[3]] * max_len for x in b]).astype("int64").reshape([-1, max_len])
[[x[3]] * max_len for x in b]).astype("int64").reshape(
[-1, max_len])
res = []
for i in range(len(b)):
res.append([
......@@ -127,4 +131,5 @@ class TrainReader(Reader):
def generate_batch_from_trainfiles(self, files):
data_set = self.base_read(files)
random.shuffle(data_set)
return self.batch_reader(data_set, self.batch_size, self.batch_size * 20)
return self.batch_reader(data_set, self.batch_size,
self.batch_size * 20)
......@@ -32,6 +32,7 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
"""
Read the data line by line and process it as a dictionary
"""
def reader():
"""
This function needs to be implemented by the user, based on data format
......@@ -57,11 +58,12 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
feature_name.append("label")
s = "click:" + str(label[0])
for i in dense_feature:
s += " dense_feature:" + str(i)
s += " dense_feature:" + str(i)
for i in range(1, 1 + len(categorical_range_)):
s += " " + str(i) + ":" + str(sparse_feature[i-1][0])
s += " " + str(i) + ":" + str(sparse_feature[i - 1][0])
print s.strip()
yield None
return reader
......
......@@ -31,8 +31,10 @@ class Model(ModelBase):
def net(self):
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
def embedding_layer(input):
emb = fluid.layers.embedding(
......@@ -42,25 +44,27 @@ class Model(ModelBase):
size=[sparse_feature_number, sparse_feature_dim],
param_attr=fluid.ParamAttr(
name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()),
)
emb_sum = fluid.layers.sequence_pool(
input=emb, pool_type='sum')
initializer=fluid.initializer.Uniform()), )
emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
return emb_sum
def fc(input, output_size):
output = fluid.layers.fc(
input=input, size=output_size,
act='relu', param_attr=fluid.ParamAttr(
input=input,
size=output_size,
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1.0 / math.sqrt(input.shape[1]))))
return output
sparse_embed_seq = list(map(embedding_layer, self.sparse_inputs))
concated = fluid.layers.concat(sparse_embed_seq + [self.dense_input], axis=1)
concated = fluid.layers.concat(
sparse_embed_seq + [self.dense_input], axis=1)
fcs = [concated]
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
self._namespace)
for size in hidden_layers:
fcs.append(fc(fcs[-1], size))
......@@ -75,14 +79,15 @@ class Model(ModelBase):
self.predict = predict
def avg_loss(self):
cost = fluid.layers.cross_entropy(input=self.predict, label=self.label_input)
cost = fluid.layers.cross_entropy(
input=self.predict, label=self.label_input)
avg_cost = fluid.layers.reduce_mean(cost)
self._cost = avg_cost
def metrics(self):
auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
label=self.label_input,
num_thresholds=2 ** 12,
num_thresholds=2**12,
slide_steps=20)
self._metrics["AUC"] = auc
self._metrics["BATCH_AUC"] = batch_auc
......@@ -95,7 +100,8 @@ class Model(ModelBase):
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import io
import args
import pandas as pd
from sklearn import preprocessing
from sklearn import preprocessing
def _clean_file(source_path,target_path):
def _clean_file(source_path, target_path):
"""makes changes to match the CSV format."""
with io.open(source_path, 'r') as temp_eval_file:
with io.open(target_path, 'w') as eval_file:
......@@ -17,15 +32,16 @@ def _clean_file(source_path,target_path):
line = line[:-1]
line += '\n'
eval_file.write(line)
def build_model_columns(train_data_path, test_data_path):
# The column names are from
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
column_names = [
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_bracket'
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_bracket'
]
# Load the dataset in Pandas
......@@ -44,61 +60,92 @@ def build_model_columns(train_data_path, test_data_path):
# First group of tasks according to the paper
#label_columns = ['income_50k', 'marital_stat']
categorical_columns = ['education','marital_status','relationship','workclass','occupation']
categorical_columns = [
'education', 'marital_status', 'relationship', 'workclass',
'occupation'
]
for col in categorical_columns:
label_train = preprocessing.LabelEncoder()
train_df[col]= label_train.fit_transform(train_df[col])
train_df[col] = label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder()
test_df[col]= label_test.fit_transform(test_df[col])
bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(), bins,labels=False)
test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(), bins,labels=False)
base_columns = ['education', 'marital_status', 'relationship', 'workclass', 'occupation', 'age_buckets']
train_df['education_occupation'] = train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)
test_df['education_occupation'] = test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
train_df['age_buckets_education_occupation'] = train_df['age_buckets'].astype(str) + '_' + train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)
test_df['age_buckets_education_occupation'] = test_df['age_buckets'].astype(str) + '_' + test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
crossed_columns = ['education_occupation','age_buckets_education_occupation']
test_df[col] = label_test.fit_transform(test_df[col])
bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(),
bins,
labels=False)
test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(),
bins,
labels=False)
base_columns = [
'education', 'marital_status', 'relationship', 'workclass',
'occupation', 'age_buckets'
]
train_df['education_occupation'] = train_df['education'].astype(
str) + '_' + train_df['occupation'].astype(str)
test_df['education_occupation'] = test_df['education'].astype(
str) + '_' + test_df['occupation'].astype(str)
train_df['age_buckets_education_occupation'] = train_df[
'age_buckets'].astype(str) + '_' + train_df['education'].astype(
str) + '_' + train_df['occupation'].astype(str)
test_df['age_buckets_education_occupation'] = test_df[
'age_buckets'].astype(str) + '_' + test_df['education'].astype(
str) + '_' + test_df['occupation'].astype(str)
crossed_columns = [
'education_occupation', 'age_buckets_education_occupation'
]
for col in crossed_columns:
label_train = preprocessing.LabelEncoder()
train_df[col]= label_train.fit_transform(train_df[col])
train_df[col] = label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder()
test_df[col]= label_test.fit_transform(test_df[col])
test_df[col] = label_test.fit_transform(test_df[col])
wide_columns = base_columns + crossed_columns
train_df_temp = pd.get_dummies(train_df[categorical_columns],columns=categorical_columns)
test_df_temp = pd.get_dummies(test_df[categorical_columns], columns=categorical_columns)
train_df_temp = pd.get_dummies(
train_df[categorical_columns], columns=categorical_columns)
test_df_temp = pd.get_dummies(
test_df[categorical_columns], columns=categorical_columns)
train_df = train_df.join(train_df_temp)
test_df = test_df.join(test_df_temp)
deep_columns = list(train_df_temp.columns)+ ['age','education_num','capital_gain','capital_loss','hours_per_week']
train_df['label'] = train_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
test_df['label'] = test_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
with io.open('train_data/columns.txt','w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
deep_columns = list(train_df_temp.columns) + [
'age', 'education_num', 'capital_gain', 'capital_loss',
'hours_per_week'
]
train_df['label'] = train_df['income_bracket'].apply(
lambda x: 1 if x == '>50K' else 0)
test_df['label'] = test_df['income_bracket'].apply(
lambda x: 1 if x == '>50K' else 0)
with io.open('train_data/columns.txt', 'w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(
deep_columns)) + '\n'
f.write(write_str)
f.close()
with io.open('test_data/columns.txt','w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
with io.open('test_data/columns.txt', 'w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(
deep_columns)) + '\n'
f.write(write_str)
f.close()
train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(train_data_path,index=False)
test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(test_data_path,index=False)
train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
train_data_path, index=False)
test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
test_data_path, index=False)
def clean_file(train_path, test_path, train_data_path, test_data_path):
_clean_file(train_path, train_data_path)
_clean_file(test_path, test_data_path)
if __name__ == '__main__':
args = args.parse_args()
clean_file(args.train_path, args.test_path, args.train_data_path, args.test_data_path)
clean_file(args.train_path, args.test_path, args.train_data_path,
args.test_data_path)
build_model_columns(args.train_data_path, args.test_data_path)
......@@ -20,6 +20,7 @@ except ImportError:
import pickle
import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
......@@ -50,7 +51,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
wide_feat, deep_deat, label = self._process_line(line)
s = ""
for i in [('wide_input', wide_feat), ('deep_input', deep_deat), ('label', label)]:
for i in [('wide_input', wide_feat), ('deep_input', deep_deat),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
......@@ -60,6 +62,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
......@@ -25,27 +25,27 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def wide_part(self, data):
out = fluid.layers.fc(input=data,
size=1,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0,
scale=1.0 / math.sqrt(
data.shape[
1])),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
act=None,
name='wide')
out = fluid.layers.fc(
input=data,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0 / math.sqrt(data.shape[1])),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
act=None,
name='wide')
return out
def fc(self, data, hidden_units, active, tag):
output = fluid.layers.fc(input=data,
size=hidden_units,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0,
scale=1.0 / math.sqrt(
data.shape[
1]))),
act=active,
name=tag)
output = fluid.layers.fc(
input=data,
size=hidden_units,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))),
act=active,
name=tag)
return output
......@@ -62,43 +62,63 @@ class Model(ModelBase):
deep_input = self._dense_data_var[1]
label = self._sparse_data_var[0]
hidden1_units = envs.get_global_env("hyper_parameters.hidden1_units", 75, self._namespace)
hidden2_units = envs.get_global_env("hyper_parameters.hidden2_units", 50, self._namespace)
hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units", 25, self._namespace)
hidden1_units = envs.get_global_env("hyper_parameters.hidden1_units",
75, self._namespace)
hidden2_units = envs.get_global_env("hyper_parameters.hidden2_units",
50, self._namespace)
hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units",
25, self._namespace)
wide_output = self.wide_part(wide_input)
deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units, hidden3_units)
wide_model = fluid.layers.fc(input=wide_output,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
act=None,
name='w_wide')
deep_model = fluid.layers.fc(input=deep_output,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
act=None,
name='w_deep')
deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units,
hidden3_units)
wide_model = fluid.layers.fc(
input=wide_output,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0)),
act=None,
name='w_wide')
deep_model = fluid.layers.fc(
input=deep_output,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0)),
act=None,
name='w_deep')
prediction = fluid.layers.elementwise_add(wide_model, deep_model)
pred = fluid.layers.sigmoid(fluid.layers.clip(prediction, min=-15.0, max=15.0), name="prediction")
pred = fluid.layers.sigmoid(
fluid.layers.clip(
prediction, min=-15.0, max=15.0),
name="prediction")
num_seqs = fluid.layers.create_tensor(dtype='int64')
acc = fluid.layers.accuracy(input=pred, label=fluid.layers.cast(x=label, dtype='int64'), total=num_seqs)
auc_var, batch_auc, auc_states = fluid.layers.auc(input=pred, label=fluid.layers.cast(x=label, dtype='int64'))
acc = fluid.layers.accuracy(
input=pred,
label=fluid.layers.cast(
x=label, dtype='int64'),
total=num_seqs)
auc_var, batch_auc, auc_states = fluid.layers.auc(
input=pred, label=fluid.layers.cast(
x=label, dtype='int64'))
self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc
self._metrics["ACC"] = acc
cost = fluid.layers.sigmoid_cross_entropy_with_logits(x=prediction, label=fluid.layers.cast(label, dtype='float32'))
cost = fluid.layers.sigmoid_cross_entropy_with_logits(
x=prediction, label=fluid.layers.cast(
label, dtype='float32'))
avg_cost = fluid.layers.mean(cost)
self._cost = avg_cost
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import sys
......
......@@ -21,6 +21,7 @@ except ImportError:
import pickle
import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
......@@ -48,7 +49,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
feat_idx, feat_value, label = self._process_line(line)
s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]:
for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
......@@ -58,6 +60,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
......@@ -28,18 +28,22 @@ class Model(ModelBase):
loc=0.0, scale=init_value_)
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
# ------------------------- network input --------------------------
num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace)
num_field = envs.get_global_env("hyper_parameters.num_field", None,
self._namespace)
raw_feat_idx = self._sparse_data_var[1]
raw_feat_value = self._dense_data_var[0]
self.label = self._sparse_data_var[0]
feat_idx = raw_feat_idx
feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
feat_value = fluid.layers.reshape(
raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
feat_embeddings = fluid.embedding(
input=feat_idx,
......@@ -48,9 +52,9 @@ class Model(ModelBase):
size=[sparse_feature_number + 1, sparse_feature_dim],
padding_idx=0,
param_attr=fluid.ParamAttr(initializer=initer))
feat_embeddings = fluid.layers.reshape(
feat_embeddings,
[-1, num_field, sparse_feature_dim]) # None * num_field * embedding_size
feat_embeddings = fluid.layers.reshape(feat_embeddings, [
-1, num_field, sparse_feature_dim
]) # None * num_field * embedding_size
feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size
# -------------------- linear --------------------
......@@ -73,7 +77,8 @@ class Model(ModelBase):
# -------------------- CIN --------------------
layer_sizes_cin = envs.get_global_env("hyper_parameters.layer_sizes_cin", None, self._namespace)
layer_sizes_cin = envs.get_global_env(
"hyper_parameters.layer_sizes_cin", None, self._namespace)
Xs = [feat_embeddings]
last_s = num_field
for s in layer_sizes_cin:
......@@ -84,7 +89,8 @@ class Model(ModelBase):
1]) # None, embedding_size, num_field, 1
X_k = fluid.layers.reshape(
fluid.layers.transpose(Xs[-1], [0, 2, 1]),
[-1, sparse_feature_dim, 1, last_s]) # None, embedding_size, 1, last_s
[-1, sparse_feature_dim, 1,
last_s]) # None, embedding_size, 1, last_s
Z_k_1 = fluid.layers.matmul(
X_0, X_k) # None, embedding_size, num_field, last_s
......@@ -124,16 +130,19 @@ class Model(ModelBase):
# -------------------- DNN --------------------
layer_sizes_dnn = envs.get_global_env("hyper_parameters.layer_sizes_dnn", None, self._namespace)
act = envs.get_global_env("hyper_parameters.act", None, self._namespace)
layer_sizes_dnn = envs.get_global_env(
"hyper_parameters.layer_sizes_dnn", None, self._namespace)
act = envs.get_global_env("hyper_parameters.act", None,
self._namespace)
y_dnn = fluid.layers.reshape(feat_embeddings,
[-1, num_field * sparse_feature_dim])
for s in layer_sizes_dnn:
y_dnn = fluid.layers.fc(input=y_dnn,
size=s,
act=act,
param_attr=fluid.ParamAttr(initializer=initer),
bias_attr=None)
y_dnn = fluid.layers.fc(
input=y_dnn,
size=s,
act=act,
param_attr=fluid.ParamAttr(initializer=initer),
bias_attr=None)
y_dnn = fluid.layers.fc(input=y_dnn,
size=1,
act=None,
......@@ -148,7 +157,10 @@ class Model(ModelBase):
self.model._init_slots()
self.xdeepfm_net()
cost = fluid.layers.log_loss(input=self.predict, label=fluid.layers.cast(self.label, "float32"), epsilon=0.0000001)
cost = fluid.layers.log_loss(
input=self.predict,
label=fluid.layers.cast(self.label, "float32"),
epsilon=0.0000001)
batch_cost = fluid.layers.reduce_mean(cost)
self._cost = batch_cost
......@@ -162,7 +174,8 @@ class Model(ModelBase):
self._metrics["BATCH_AUC"] = batch_auc_var
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -31,5 +31,3 @@ mv diginetica/train.txt train_data
mkdir test_data
mv diginetica/test.txt test_data
......@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
def init(self):
self.batch_size = envs.get_global_env("batch_size", None, "evaluate.reader")
self.batch_size = envs.get_global_env("batch_size", None,
"evaluate.reader")
self.input = []
self.length = None
......@@ -34,7 +35,8 @@ class EvaluateReader(Reader):
with open(f, "r") as fin:
for line in fin:
line = line.strip().split('\t')
res.append(tuple([map(int, line[0].split(',')), int(line[1])]))
res.append(
tuple([map(int, line[0].split(',')), int(line[1])]))
return res
def make_data(self, cur_batch, batch_size):
......@@ -75,10 +77,8 @@ class EvaluateReader(Reader):
u_deg_out[np.where(u_deg_out == 0)] = 1
adj_out.append(np.divide(adj.transpose(), u_deg_out).transpose())
seq_index.append(
[[id, np.where(node == i)[0][0]] for i in e[0]])
last_index.append(
[id, np.where(node == e[0][last_id[id]])[0][0]])
seq_index.append([[id, np.where(node == i)[0][0]] for i in e[0]])
last_index.append([id, np.where(node == e[0][last_id[id]])[0][0]])
label.append(e[1] - 1)
mask.append([[1] * (last_id[id] + 1) + [0] *
(max_seq_len - last_id[id] - 1)])
......@@ -101,10 +101,13 @@ class EvaluateReader(Reader):
def _reader():
random.shuffle(self.input)
group_remain = self.length % batch_group_size
for bg_id in range(0, self.length - group_remain, batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id + batch_group_size])
for bg_id in range(0, self.length - group_remain,
batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id +
batch_group_size])
if train:
cur_bg = sorted(cur_bg, key=lambda x: len(x[0]), reverse=True)
cur_bg = sorted(
cur_bg, key=lambda x: len(x[0]), reverse=True)
for i in range(0, batch_group_size, batch_size):
cur_batch = cur_bg[i:i + batch_size]
yield self.make_data(cur_batch, batch_size)
......
......@@ -30,15 +30,21 @@ class Model(ModelBase):
def init_config(self):
self._fetch_interval = 1
self.items_num, self.ins_num = self.config_read(
envs.get_global_env("hyper_parameters.config_path", None, self._namespace))
self.train_batch_size = envs.get_global_env("batch_size", None, "train.reader")
self.evaluate_batch_size = envs.get_global_env("batch_size", None, "evaluate.reader")
self.hidden_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
self.step = envs.get_global_env("hyper_parameters.gnn_propogation_steps", None, self._namespace)
envs.get_global_env("hyper_parameters.config_path", None,
self._namespace))
self.train_batch_size = envs.get_global_env("batch_size", None,
"train.reader")
self.evaluate_batch_size = envs.get_global_env("batch_size", None,
"evaluate.reader")
self.hidden_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
self.step = envs.get_global_env(
"hyper_parameters.gnn_propogation_steps", None, self._namespace)
def config_read(self, config_path=None):
if config_path is None:
raise ValueError("please set train.model.hyper_parameters.config_path at first")
raise ValueError(
"please set train.model.hyper_parameters.config_path at first")
with open(config_path, "r") as fin:
item_nums = int(fin.readline().strip())
ins_nums = int(fin.readline().strip())
......@@ -46,100 +52,108 @@ class Model(ModelBase):
def input(self, bs):
self.items = fluid.data(
name="items",
shape=[bs, -1],
name="items", shape=[bs, -1],
dtype="int64") # [batch_size, uniq_max]
self.seq_index = fluid.data(
name="seq_index",
shape=[bs, -1, 2],
name="seq_index", shape=[bs, -1, 2],
dtype="int32") # [batch_size, seq_max, 2]
self.last_index = fluid.data(
name="last_index",
shape=[bs, 2],
dtype="int32") # [batch_size, 2]
name="last_index", shape=[bs, 2], dtype="int32") # [batch_size, 2]
self.adj_in = fluid.data(
name="adj_in",
shape=[bs, -1, -1],
name="adj_in", shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max]
self.adj_out = fluid.data(
name="adj_out",
shape=[bs, -1, -1],
name="adj_out", shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max]
self.mask = fluid.data(
name="mask",
shape=[bs, -1, 1],
name="mask", shape=[bs, -1, 1],
dtype="float32") # [batch_size, seq_max, 1]
self.label = fluid.data(
name="label",
shape=[bs, 1],
dtype="int64") # [batch_size, 1]
name="label", shape=[bs, 1], dtype="int64") # [batch_size, 1]
res = [self.items, self.seq_index, self.last_index, self.adj_in, self.adj_out, self.mask, self.label]
res = [
self.items, self.seq_index, self.last_index, self.adj_in,
self.adj_out, self.mask, self.label
]
return res
def train_input(self):
res = self.input(self.train_batch_size)
self._data_var = res
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace)
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
False, self._namespace)
if self._platform != "LINUX" or use_dataloader:
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=256, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=256,
use_double_buffer=False,
iterable=False)
def net(self, items_num, hidden_size, step, bs):
stdv = 1.0 / math.sqrt(hidden_size)
def embedding_layer(input, table_name, emb_dim, initializer_instance=None):
def embedding_layer(input,
table_name,
emb_dim,
initializer_instance=None):
emb = fluid.embedding(
input=input,
size=[items_num, emb_dim],
param_attr=fluid.ParamAttr(
name=table_name,
initializer=initializer_instance),
)
name=table_name, initializer=initializer_instance), )
return emb
sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv)
items_emb = embedding_layer(self.items, "emb", hidden_size, sparse_initializer)
items_emb = embedding_layer(self.items, "emb", hidden_size,
sparse_initializer)
pre_state = items_emb
for i in range(step):
pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size])
pre_state = layers.reshape(
x=pre_state, shape=[bs, -1, hidden_size])
state_in = layers.fc(
input=pre_state,
name="state_in",
size=hidden_size,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_out = layers.fc(
input=pre_state,
name="state_out",
size=hidden_size,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_adj_in = layers.matmul(self.adj_in, state_in) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(self.adj_out, state_out) # [batch_size, uniq_max, h]
state_adj_in = layers.matmul(self.adj_in,
state_in) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(
self.adj_out, state_out) # [batch_size, uniq_max, h]
gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
gru_fc = layers.fc(
input=gru_input,
name="gru_fc",
size=3 * hidden_size,
bias_attr=False)
gru_input = layers.reshape(
x=gru_input, shape=[-1, hidden_size * 2])
gru_fc = layers.fc(input=gru_input,
name="gru_fc",
size=3 * hidden_size,
bias_attr=False)
pre_state, _, _ = fluid.layers.gru_unit(
input=gru_fc,
hidden=layers.reshape(x=pre_state, shape=[-1, hidden_size]),
hidden=layers.reshape(
x=pre_state, shape=[-1, hidden_size]),
size=3 * hidden_size)
final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
......@@ -153,24 +167,22 @@ class Model(ModelBase):
bias_attr=False,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, h]
last_fc = layers.fc(
input=last,
name="last_fc",
size=hidden_size,
bias_attr=False,
act=None,
num_flatten_dims=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [bathc_size, h]
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, h]
last_fc = layers.fc(input=last,
name="last_fc",
size=hidden_size,
bias_attr=False,
act=None,
num_flatten_dims=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [bathc_size, h]
seq_fc_t = layers.transpose(
seq_fc, perm=[1, 0, 2]) # [seq_max, batch_size, h]
add = layers.elementwise_add(
seq_fc_t, last_fc) # [seq_max, batch_size, h]
add = layers.elementwise_add(seq_fc_t,
last_fc) # [seq_max, batch_size, h]
b = layers.create_parameter(
shape=[hidden_size],
dtype='float32',
......@@ -188,12 +200,13 @@ class Model(ModelBase):
act=None,
num_flatten_dims=2,
bias_attr=False,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, 1]
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, 1]
weight *= self.mask
weight_mask = layers.elementwise_mul(seq, weight, axis=0) # [batch_size, seq_max, h]
global_attention = layers.reduce_sum(weight_mask, dim=1) # [batch_size, h]
weight_mask = layers.elementwise_mul(
seq, weight, axis=0) # [batch_size, seq_max, h]
global_attention = layers.reduce_sum(
weight_mask, dim=1) # [batch_size, h]
final_attention = layers.concat(
[global_attention, last], axis=1) # [batch_size, 2*h]
......@@ -213,7 +226,8 @@ class Model(ModelBase):
# persistable=True,
# name="all_vocab")
all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32')
all_vocab = fluid.layers.cast(x=fluid.layers.assign(all_vocab), dtype='int64')
all_vocab = fluid.layers.cast(
x=fluid.layers.assign(all_vocab), dtype='int64')
all_emb = fluid.embedding(
input=all_vocab,
......@@ -240,15 +254,19 @@ class Model(ModelBase):
def train_net(self):
self.train_input()
self.net(self.items_num, self.hidden_size, self.step, self.train_batch_size)
self.net(self.items_num, self.hidden_size, self.step,
self.train_batch_size)
self.avg_loss()
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
step_per_epoch = self.ins_num // self.train_batch_size
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None, self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None, self._namespace)
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
self._namespace)
l2 = envs.get_global_env("hyper_parameters.l2", None, self._namespace)
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
......@@ -266,10 +284,14 @@ class Model(ModelBase):
self._infer_data_var = res
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
self.net(self.items_num, self.hidden_size, self.step, self.evaluate_batch_size)
self.net(self.items_num, self.hidden_size, self.step,
self.evaluate_batch_size)
self._infer_results['acc'] = self.acc
self._infer_results['loss'] = self.loss
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import time
import pickle
......@@ -10,6 +24,7 @@ parser.add_argument(
help='dataset dir: diginetica/yoochoose1_4/yoochoose1_64/sample')
opt = parser.parse_args()
def process_data(file_type):
path = os.path.join(opt.data_dir, file_type)
output_path = os.path.splitext(path)[0] + ".txt"
......@@ -23,6 +38,7 @@ def process_data(file_type):
fout.write(str(data[i][1]))
fout.write("\n")
process_data("train")
process_data("test")
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import requests
import sys
import time
......
......@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.batch_size = envs.get_global_env("batch_size", None, "train.reader")
self.batch_size = envs.get_global_env("batch_size", None,
"train.reader")
self.input = []
self.length = None
......@@ -34,7 +35,8 @@ class TrainReader(Reader):
with open(f, "r") as fin:
for line in fin:
line = line.strip().split('\t')
res.append(tuple([map(int, line[0].split(',')), int(line[1])]))
res.append(
tuple([map(int, line[0].split(',')), int(line[1])]))
return res
def make_data(self, cur_batch, batch_size):
......@@ -75,10 +77,8 @@ class TrainReader(Reader):
u_deg_out[np.where(u_deg_out == 0)] = 1
adj_out.append(np.divide(adj.transpose(), u_deg_out).transpose())
seq_index.append(
[[id, np.where(node == i)[0][0]] for i in e[0]])
last_index.append(
[id, np.where(node == e[0][last_id[id]])[0][0]])
seq_index.append([[id, np.where(node == i)[0][0]] for i in e[0]])
last_index.append([id, np.where(node == e[0][last_id[id]])[0][0]])
label.append(e[1] - 1)
mask.append([[1] * (last_id[id] + 1) + [0] *
(max_seq_len - last_id[id] - 1)])
......@@ -101,10 +101,13 @@ class TrainReader(Reader):
def _reader():
random.shuffle(self.input)
group_remain = self.length % batch_group_size
for bg_id in range(0, self.length - group_remain, batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id + batch_group_size])
for bg_id in range(0, self.length - group_remain,
batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id +
batch_group_size])
if train:
cur_bg = sorted(cur_bg, key=lambda x: len(x[0]), reverse=True)
cur_bg = sorted(
cur_bg, key=lambda x: len(x[0]), reverse=True)
for i in range(0, batch_group_size, batch_size):
cur_batch = cur_bg[i:i + batch_size]
yield self.make_data(cur_batch, batch_size)
......
......@@ -24,14 +24,22 @@ class Model(ModelBase):
def all_vocab_network(self, is_infer=False):
""" network definition """
recall_k = envs.get_global_env("hyper_parameters.recall_k", None, self._namespace)
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
hid_size = envs.get_global_env("hyper_parameters.hid_size", None, self._namespace)
init_low_bound = envs.get_global_env("hyper_parameters.init_low_bound", None, self._namespace)
init_high_bound = envs.get_global_env("hyper_parameters.init_high_bound", None, self._namespace)
emb_lr_x = envs.get_global_env("hyper_parameters.emb_lr_x", None, self._namespace)
gru_lr_x = envs.get_global_env("hyper_parameters.gru_lr_x", None, self._namespace)
fc_lr_x = envs.get_global_env("hyper_parameters.fc_lr_x", None, self._namespace)
recall_k = envs.get_global_env("hyper_parameters.recall_k", None,
self._namespace)
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
self._namespace)
hid_size = envs.get_global_env("hyper_parameters.hid_size", None,
self._namespace)
init_low_bound = envs.get_global_env("hyper_parameters.init_low_bound",
None, self._namespace)
init_high_bound = envs.get_global_env(
"hyper_parameters.init_high_bound", None, self._namespace)
emb_lr_x = envs.get_global_env("hyper_parameters.emb_lr_x", None,
self._namespace)
gru_lr_x = envs.get_global_env("hyper_parameters.gru_lr_x", None,
self._namespace)
fc_lr_x = envs.get_global_env("hyper_parameters.fc_lr_x", None,
self._namespace)
# Input data
src_wordseq = fluid.data(
name="src_wordseq", shape=[None, 1], dtype="int64", lod_level=1)
......@@ -41,7 +49,10 @@ class Model(ModelBase):
if is_infer:
self._infer_data_var = [src_wordseq, dst_wordseq]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
emb = fluid.embedding(
input=src_wordseq,
......@@ -56,7 +67,8 @@ class Model(ModelBase):
size=hid_size * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
low=init_low_bound,
high=init_high_bound),
learning_rate=gru_lr_x))
gru_h0 = fluid.layers.dynamic_gru(
input=fc0,
......
......@@ -25,9 +25,12 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def input_data(self, is_infer=False):
user_input = fluid.data(name="user_input", shape=[-1, 1], dtype="int64", lod_level=0)
item_input = fluid.data(name="item_input", shape=[-1, 1], dtype="int64", lod_level=0)
label = fluid.data(name="label", shape=[-1, 1], dtype="int64", lod_level=0)
user_input = fluid.data(
name="user_input", shape=[-1, 1], dtype="int64", lod_level=0)
item_input = fluid.data(
name="item_input", shape=[-1, 1], dtype="int64", lod_level=0)
label = fluid.data(
name="label", shape=[-1, 1], dtype="int64", lod_level=0)
if is_infer:
inputs = [user_input] + [item_input]
else:
......@@ -35,81 +38,104 @@ class Model(ModelBase):
self._data_var = inputs
return inputs
def net(self, inputs, is_infer=False):
num_users = envs.get_global_env("hyper_parameters.num_users", None, self._namespace)
num_items = envs.get_global_env("hyper_parameters.num_items", None, self._namespace)
latent_dim = envs.get_global_env("hyper_parameters.latent_dim", None, self._namespace)
layers = envs.get_global_env("hyper_parameters.layers", None, self._namespace)
num_layer = len(layers) #Number of layers in the MLP
MF_Embedding_User = fluid.embedding(input=inputs[0],
size=[num_users, latent_dim],
param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
is_sparse=True)
MF_Embedding_Item = fluid.embedding(input=inputs[1],
size=[num_items, latent_dim],
param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
is_sparse=True)
MLP_Embedding_User = fluid.embedding(input=inputs[0],
size=[num_users, int(layers[0] / 2)],
param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
is_sparse=True)
MLP_Embedding_Item = fluid.embedding(input=inputs[1],
size=[num_items, int(layers[0] / 2)],
param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
is_sparse=True)
num_users = envs.get_global_env("hyper_parameters.num_users", None,
self._namespace)
num_items = envs.get_global_env("hyper_parameters.num_items", None,
self._namespace)
latent_dim = envs.get_global_env("hyper_parameters.latent_dim", None,
self._namespace)
layers = envs.get_global_env("hyper_parameters.layers", None,
self._namespace)
num_layer = len(layers) #Number of layers in the MLP
MF_Embedding_User = fluid.embedding(
input=inputs[0],
size=[num_users, latent_dim],
param_attr=fluid.initializer.Normal(
loc=0.0, scale=0.01),
is_sparse=True)
MF_Embedding_Item = fluid.embedding(
input=inputs[1],
size=[num_items, latent_dim],
param_attr=fluid.initializer.Normal(
loc=0.0, scale=0.01),
is_sparse=True)
MLP_Embedding_User = fluid.embedding(
input=inputs[0],
size=[num_users, int(layers[0] / 2)],
param_attr=fluid.initializer.Normal(
loc=0.0, scale=0.01),
is_sparse=True)
MLP_Embedding_Item = fluid.embedding(
input=inputs[1],
size=[num_items, int(layers[0] / 2)],
param_attr=fluid.initializer.Normal(
loc=0.0, scale=0.01),
is_sparse=True)
# MF part
mf_user_latent = fluid.layers.flatten(x=MF_Embedding_User, axis=1)
mf_item_latent = fluid.layers.flatten(x=MF_Embedding_Item, axis=1)
mf_vector = fluid.layers.elementwise_mul(mf_user_latent, mf_item_latent)
mf_vector = fluid.layers.elementwise_mul(mf_user_latent,
mf_item_latent)
# MLP part
# The 0-th layer is the concatenation of embedding layers
mlp_user_latent = fluid.layers.flatten(x=MLP_Embedding_User, axis=1)
mlp_item_latent = fluid.layers.flatten(x=MLP_Embedding_Item, axis=1)
mlp_vector = fluid.layers.concat(input=[mlp_user_latent, mlp_item_latent], axis=-1)
mlp_vector = fluid.layers.concat(
input=[mlp_user_latent, mlp_item_latent], axis=-1)
for i in range(1, num_layer):
mlp_vector = fluid.layers.fc(input=mlp_vector,
size=layers[i],
act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(mlp_vector.shape[1])),
regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)),
name='layer_' + str(i))
mlp_vector = fluid.layers.fc(
input=mlp_vector,
size=layers[i],
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0 / math.sqrt(mlp_vector.shape[1])),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
name='layer_' + str(i))
# Concatenate MF and MLP parts
predict_vector = fluid.layers.concat(input=[mf_vector, mlp_vector], axis=-1)
predict_vector = fluid.layers.concat(
input=[mf_vector, mlp_vector], axis=-1)
# Final prediction layer
prediction = fluid.layers.fc(input=predict_vector,
size=1,
act='sigmoid',
param_attr=fluid.initializer.MSRAInitializer(uniform=True),
name='prediction')
prediction = fluid.layers.fc(
input=predict_vector,
size=1,
act='sigmoid',
param_attr=fluid.initializer.MSRAInitializer(uniform=True),
name='prediction')
if is_infer:
self._infer_results["prediction"] = prediction
return
cost = fluid.layers.log_loss(input=prediction, label=fluid.layers.cast(x=inputs[2], dtype='float32'))
cost = fluid.layers.log_loss(
input=prediction,
label=fluid.layers.cast(
x=inputs[2], dtype='float32'))
avg_cost = fluid.layers.mean(cost)
self._cost = avg_cost
self._metrics["cost"] = avg_cost
def train_net(self):
input_data = self.input_data()
self.net(input_data)
def infer_net(self):
self._infer_data_var = self.input_data(is_infer=True)
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self.net(self._infer_data_var, is_infer=True)
......@@ -33,7 +33,9 @@ class EvaluateReader(Reader):
This function needs to be implemented by the user, based on data format
"""
features = line.strip().split(',')
feature_name = ["user_input", "item_input"]
yield zip(feature_name, [[int(features[0])]] + [[int(features[1])]])
yield zip(feature_name,
[[int(features[0])]] + [[int(features[1])]])
return reader
......@@ -33,10 +33,9 @@ class TrainReader(Reader):
This function needs to be implemented by the user, based on data format
"""
features = line.strip().split(',')
feature_name = ["user_input", "item_input", "label"]
yield zip(feature_name, [[int(features[0])]] + [[int(features[1])]] + [[int(features[2])]])
yield zip(feature_name, [[int(features[0])]] +
[[int(features[1])]] + [[int(features[2])]])
return reader
......@@ -78,4 +78,3 @@ python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
| MOVIELENS | NCF | 0.688 | -- |
| -- | Youtube | -- | -- |
| 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 |
......@@ -79,9 +79,12 @@ class Model(ModelBase):
return correct
def train(self):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace)
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
self._namespace)
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None,
self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None,
self._namespace)
emb_shape = [vocab_size, emb_dim]
self.user_encoder = GrnnEncoder()
......@@ -131,24 +134,34 @@ class Model(ModelBase):
self.train()
def infer(self):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace)
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
self._namespace)
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None,
self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None,
self._namespace)
user_data = fluid.data(
name="user", shape=[None, 1], dtype="int64", lod_level=1)
all_item_data = fluid.data(
name="all_item", shape=[None, vocab_size], dtype="int64")
pos_label = fluid.data(name="pos_label", shape=[None, 1], dtype="int64")
pos_label = fluid.data(
name="pos_label", shape=[None, 1], dtype="int64")
self._infer_data_var = [user_data, all_item_data, pos_label]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
user_emb = fluid.embedding(
input=user_data, size=[vocab_size, emb_dim], param_attr="emb.item")
all_item_emb = fluid.embedding(
input=all_item_data, size=[vocab_size, emb_dim], param_attr="emb.item")
all_item_emb_re = fluid.layers.reshape(x=all_item_emb, shape=[-1, emb_dim])
input=all_item_data,
size=[vocab_size, emb_dim],
param_attr="emb.item")
all_item_emb_re = fluid.layers.reshape(
x=all_item_emb, shape=[-1, emb_dim])
user_encoder = GrnnEncoder()
user_enc = user_encoder.forward(user_emb)
......@@ -156,7 +169,8 @@ class Model(ModelBase):
size=hidden_size,
param_attr='user.w',
bias_attr="user.b")
user_exp = fluid.layers.expand(x=user_hid, expand_times=[1, vocab_size])
user_exp = fluid.layers.expand(
x=user_hid, expand_times=[1, vocab_size])
user_re = fluid.layers.reshape(x=user_exp, shape=[-1, hidden_size])
all_item_hid = fluid.layers.fc(input=all_item_emb_re,
......
......@@ -22,7 +22,8 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
def init(self):
self.vocab_size = envs.get_global_env("vocab_size", 10, "train.model.hyper_parameters")
self.vocab_size = envs.get_global_env("vocab_size", 10,
"train.model.hyper_parameters")
def generate_sample(self, line):
"""
......@@ -39,6 +40,9 @@ class EvaluateReader(Reader):
src = conv_ids[:boundary]
pos_tgt = [conv_ids[boundary]]
feature_name = ["user", "all_item", "p_item"]
yield zip(feature_name, [src] + [np.arange(self.vocab_size).astype("int64").tolist()] + [pos_tgt])
yield zip(
feature_name,
[src] + [np.arange(self.vocab_size).astype("int64").tolist()] +
[pos_tgt])
return reader
......@@ -24,46 +24,57 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def input(self):
neg_num = int(envs.get_global_env(
"hyper_parameters.neg_num", None, self._namespace))
self.input_word = fluid.data(name="input_word", shape=[
None, 1], dtype='int64')
self.true_word = fluid.data(name='true_label', shape=[
None, 1], dtype='int64')
neg_num = int(
envs.get_global_env("hyper_parameters.neg_num", None,
self._namespace))
self.input_word = fluid.data(
name="input_word", shape=[None, 1], dtype='int64')
self.true_word = fluid.data(
name='true_label', shape=[None, 1], dtype='int64')
self._data_var.append(self.input_word)
self._data_var.append(self.true_word)
with_shuffle_batch = bool(int(envs.get_global_env(
"hyper_parameters.with_shuffle_batch", None, self._namespace)))
with_shuffle_batch = bool(
int(
envs.get_global_env("hyper_parameters.with_shuffle_batch",
None, self._namespace)))
if not with_shuffle_batch:
self.neg_word = fluid.data(name="neg_label", shape=[
None, neg_num], dtype='int64')
self.neg_word = fluid.data(
name="neg_label", shape=[None, neg_num], dtype='int64')
self._data_var.append(self.neg_word)
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def net(self):
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
neg_num = int(envs.get_global_env(
"hyper_parameters.neg_num", None, self._namespace))
neg_num = int(
envs.get_global_env("hyper_parameters.neg_num", None,
self._namespace))
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
with_shuffle_batch = bool(int(envs.get_global_env(
"hyper_parameters.with_shuffle_batch", None, self._namespace)))
with_shuffle_batch = bool(
int(
envs.get_global_env("hyper_parameters.with_shuffle_batch",
None, self._namespace)))
def embedding_layer(input, table_name, emb_dim, initializer_instance=None, squeeze=False):
def embedding_layer(input,
table_name,
emb_dim,
initializer_instance=None,
squeeze=False):
emb = fluid.embedding(
input=input,
is_sparse=True,
is_distributed=is_distributed,
size=[sparse_feature_number, emb_dim],
param_attr=fluid.ParamAttr(
name=table_name,
initializer=initializer_instance),
)
name=table_name, initializer=initializer_instance), )
if squeeze:
return fluid.layers.squeeze(input=emb, axes=[1])
else:
......@@ -73,35 +84,38 @@ class Model(ModelBase):
emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
emb_w_initializer = fluid.initializer.Constant(value=0.0)
input_emb = embedding_layer(
self.input_word, "emb", sparse_feature_dim, emb_initializer, True)
true_emb_w = embedding_layer(
self.true_word, "emb_w", sparse_feature_dim, emb_w_initializer, True)
true_emb_b = embedding_layer(
self.true_word, "emb_b", 1, emb_w_initializer, True)
input_emb = embedding_layer(self.input_word, "emb", sparse_feature_dim,
emb_initializer, True)
true_emb_w = embedding_layer(self.true_word, "emb_w",
sparse_feature_dim, emb_w_initializer,
True)
true_emb_b = embedding_layer(self.true_word, "emb_b", 1,
emb_w_initializer, True)
if with_shuffle_batch:
neg_emb_w_list = []
for i in range(neg_num):
neg_emb_w_list.append(fluid.contrib.layers.shuffle_batch(
true_emb_w)) # shuffle true_word
neg_emb_w_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_w)) # shuffle true_word
neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
neg_emb_w = fluid.layers.reshape(
neg_emb_w_concat, shape=[-1, neg_num, sparse_feature_dim])
neg_emb_b_list = []
for i in range(neg_num):
neg_emb_b_list.append(fluid.contrib.layers.shuffle_batch(
true_emb_b)) # shuffle true_word
neg_emb_b_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_b)) # shuffle true_word
neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0)
neg_emb_b_vec = fluid.layers.reshape(
neg_emb_b, shape=[-1, neg_num])
else:
neg_emb_w = embedding_layer(
self.neg_word, "emb_w", sparse_feature_dim, emb_w_initializer)
neg_emb_b = embedding_layer(
self.neg_word, "emb_b", 1, emb_w_initializer)
neg_emb_w = embedding_layer(self.neg_word, "emb_w",
sparse_feature_dim, emb_w_initializer)
neg_emb_b = embedding_layer(self.neg_word, "emb_b", 1,
emb_w_initializer)
neg_emb_b_vec = fluid.layers.reshape(
neg_emb_b, shape=[-1, neg_num])
......@@ -117,7 +131,8 @@ class Model(ModelBase):
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w, transpose_y=True)
neg_logits = fluid.layers.elementwise_add(
fluid.layers.reshape(neg_matmul, shape=[-1, neg_num]),
fluid.layers.reshape(
neg_matmul, shape=[-1, neg_num]),
neg_emb_b_vec)
label_ones = fluid.layers.fill_constant_batch_size_like(
......@@ -136,9 +151,17 @@ class Model(ModelBase):
neg_xent, dim=1))
self.avg_cost = fluid.layers.reduce_mean(cost)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt", persistable=True, dtype='float32', shape=[1], value=0)
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt", persistable=True, dtype='float32', shape=[1], value=0)
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
......@@ -155,12 +178,12 @@ class Model(ModelBase):
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env(
"hyper_parameters.learning_rate", None, self._namespace)
decay_steps = envs.get_global_env(
"hyper_parameters.decay_steps", None, self._namespace)
decay_rate = envs.get_global_env(
"hyper_parameters.decay_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
self._namespace)
optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=learning_rate,
......@@ -180,11 +203,15 @@ class Model(ModelBase):
name="analogy_c", shape=[None], dtype='int64')
self.analogy_d = fluid.data(
name="analogy_d", shape=[None], dtype='int64')
self._infer_data_var = [self.analogy_a,
self.analogy_b, self.analogy_c, self.analogy_d]
self._infer_data_var = [
self.analogy_a, self.analogy_b, self.analogy_c, self.analogy_d
]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
sparse_feature_dim = envs.get_global_env(
......@@ -216,18 +243,28 @@ class Model(ModelBase):
dist = fluid.layers.matmul(
x=target, y=emb_all_label_l2, transpose_y=True)
values, pred_idx = fluid.layers.topk(input=dist, k=4)
label = fluid.layers.expand(fluid.layers.unsqueeze(
self.analogy_d, axes=[1]), expand_times=[1, 4])
label = fluid.layers.expand(
fluid.layers.unsqueeze(
self.analogy_d, axes=[1]),
expand_times=[1, 4])
label_ones = fluid.layers.fill_constant_batch_size_like(
label, shape=[-1, 1], value=1.0, dtype='float32')
right_cnt = fluid.layers.reduce_sum(
input=fluid.layers.cast(fluid.layers.equal(pred_idx, label), dtype='float32'))
right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
fluid.layers.equal(pred_idx, label), dtype='float32'))
total_cnt = fluid.layers.reduce_sum(label_ones)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt", persistable=True, dtype='float32', shape=[1], value=0)
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt", persistable=True, dtype='float32', shape=[1], value=0)
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
......
......@@ -35,6 +35,3 @@ wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.ta
tar xzvf test_dir.tar -C raw_data
mv raw_data/data/test_dir test_data/
rm -rf raw_data
......@@ -49,8 +49,7 @@ def parse_args():
'--file_nums',
type=int,
default=1024,
help="re-split input corpus file nums"
)
help="re-split input corpus file nums")
parser.add_argument(
'--downsample',
type=float,
......@@ -137,9 +136,11 @@ def filter_corpus(args):
if not os.path.exists(args.output_corpus_dir):
os.makedirs(args.output_corpus_dir)
for file in os.listdir(args.input_corpus_dir):
with io.open(args.output_corpus_dir + '/convert_' + file + '.csv', "w") as wf:
with io.open(args.output_corpus_dir + '/convert_' + file + '.csv',
"w") as wf:
with io.open(
args.input_corpus_dir + '/' + file, encoding='utf-8') as rf:
args.input_corpus_dir + '/' + file,
encoding='utf-8') as rf:
print(args.input_corpus_dir + '/' + file)
for line in rf:
signal = False
......@@ -154,9 +155,9 @@ def filter_corpus(args):
count_w = id_counts[idx]
corpus_size = word_all_count
keep_prob = (
math.sqrt(count_w /
(args.downsample * corpus_size)) + 1
) * (args.downsample * corpus_size) / count_w
math.sqrt(count_w /
(args.downsample * corpus_size)) + 1
) * (args.downsample * corpus_size) / count_w
r_value = random.random()
if r_value > keep_prob:
continue
......@@ -182,7 +183,8 @@ def build_dict(args):
for file in os.listdir(args.build_dict_corpus_dir):
with io.open(
args.build_dict_corpus_dir + "/" + file, encoding='utf-8') as f:
args.build_dict_corpus_dir + "/" + file,
encoding='utf-8') as f:
print("build dict : ", args.build_dict_corpus_dir + "/" + file)
for line in f:
line = text_strip(line)
......@@ -232,7 +234,8 @@ def data_split(args):
for i in range(1, num + 1):
with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout:
data = contents[(i - 1) * lines_per_file:min(i * lines_per_file, len(contents))]
data = contents[(i - 1) * lines_per_file:min(i * lines_per_file,
len(contents))]
for line in data:
fout.write(line)
......
......@@ -22,7 +22,8 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
def init(self):
dict_path = envs.get_global_env("word_id_dict_path", None, "evaluate.reader")
dict_path = envs.get_global_env("word_id_dict_path", None,
"evaluate.reader")
self.word_to_id = dict()
self.id_to_word = dict()
with io.open(dict_path, 'r', encoding='utf-8') as f:
......@@ -68,14 +69,17 @@ class EvaluateReader(Reader):
a unicode string - a space-delimited sequence of words.
"""
return u" ".join([
word if word in original_vocab else u"<UNK>" for word in line.split()
word if word in original_vocab else u"<UNK>"
for word in line.split()
])
def generate_sample(self, line):
def reader():
features = self.strip_lines(line.lower(), self.word_to_id)
features = features.split()
yield [('analogy_a', [self.word_to_id[features[0]]]), ('analogy_b', [self.word_to_id[features[1]]]),
('analogy_c', [self.word_to_id[features[2]]]), ('analogy_d', [self.word_to_id[features[3]]])]
yield [('analogy_a', [self.word_to_id[features[0]]]),
('analogy_b', [self.word_to_id[features[1]]]),
('analogy_c', [self.word_to_id[features[2]]]),
('analogy_d', [self.word_to_id[features[3]]])]
return reader
......@@ -40,10 +40,14 @@ class NumpyRandomInt(object):
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env("word_count_dict_path", None, "train.reader")
self.window_size = envs.get_global_env("hyper_parameters.window_size", None, "train.model")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None, "train.model")
self.with_shuffle_batch = envs.get_global_env("hyper_parameters.with_shuffle_batch", None, "train.model")
dict_path = envs.get_global_env("word_count_dict_path", None,
"train.reader")
self.window_size = envs.get_global_env("hyper_parameters.window_size",
None, "train.model")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None,
"train.model")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch", None, "train.model")
self.random_generator = NumpyRandomInt(1, self.window_size + 1)
self.cs = None
......@@ -81,13 +85,15 @@ class TrainReader(Reader):
def reader():
word_ids = [w for w in line.split()]
for idx, target_id in enumerate(word_ids):
context_word_ids = self.get_context_words(
word_ids, idx)
context_word_ids = self.get_context_words(word_ids, idx)
for context_id in context_word_ids:
output = [('input_word', [int(target_id)]), ('true_label', [int(context_id)])]
output = [('input_word', [int(target_id)]),
('true_label', [int(context_id)])]
if not self.with_shuffle_batch:
neg_array = self.cs.searchsorted(np.random.sample(self.neg_num))
output += [('neg_label', [int(str(i)) for i in neg_array])]
neg_array = self.cs.searchsorted(
np.random.sample(self.neg_num))
output += [('neg_label',
[int(str(i)) for i in neg_array])]
yield output
return reader
......@@ -25,14 +25,20 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def input_data(self, is_infer=False):
watch_vec_size = envs.get_global_env("hyper_parameters.watch_vec_size", None, self._namespace)
search_vec_size = envs.get_global_env("hyper_parameters.search_vec_size", None, self._namespace)
other_feat_size = envs.get_global_env("hyper_parameters.other_feat_size", None, self._namespace)
watch_vec = fluid.data(name="watch_vec", shape=[None, watch_vec_size], dtype="float32")
search_vec = fluid.data(name="search_vec", shape=[None, search_vec_size], dtype="float32")
other_feat = fluid.data(name="other_feat", shape=[None, other_feat_size], dtype="float32")
watch_vec_size = envs.get_global_env("hyper_parameters.watch_vec_size",
None, self._namespace)
search_vec_size = envs.get_global_env(
"hyper_parameters.search_vec_size", None, self._namespace)
other_feat_size = envs.get_global_env(
"hyper_parameters.other_feat_size", None, self._namespace)
watch_vec = fluid.data(
name="watch_vec", shape=[None, watch_vec_size], dtype="float32")
search_vec = fluid.data(
name="search_vec", shape=[None, search_vec_size], dtype="float32")
other_feat = fluid.data(
name="other_feat", shape=[None, other_feat_size], dtype="float32")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
inputs = [watch_vec] + [search_vec] + [other_feat] + [label]
self._data_var = inputs
......@@ -41,27 +47,32 @@ class Model(ModelBase):
def fc(self, tag, data, out_dim, active='relu'):
init_stddev = 1.0
scales = 1.0 / np.sqrt(data.shape[1])
scales = 1.0 / np.sqrt(data.shape[1])
if tag == 'l4':
p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag,
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=init_stddev * scales))
p_attr = fluid.param_attr.ParamAttr(
name='%s_weight' % tag,
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=init_stddev * scales))
else:
p_attr = None
b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
b_attr = fluid.ParamAttr(
name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
out = fluid.layers.fc(input=data,
size=out_dim,
act=active,
param_attr=p_attr,
bias_attr =b_attr,
name=tag)
size=out_dim,
act=active,
param_attr=p_attr,
bias_attr=b_attr,
name=tag)
return out
def net(self, inputs):
output_size = envs.get_global_env("hyper_parameters.output_size", None, self._namespace)
layers = envs.get_global_env("hyper_parameters.layers", None, self._namespace)
output_size = envs.get_global_env("hyper_parameters.output_size", None,
self._namespace)
layers = envs.get_global_env("hyper_parameters.layers", None,
self._namespace)
concat_feats = fluid.layers.concat(input=inputs[:-1], axis=-1)
l1 = self.fc('l1', concat_feats, layers[0], 'relu')
......
......@@ -21,10 +21,14 @@ import numpy as np
class TrainReader(Reader):
def init(self):
self.watch_vec_size = envs.get_global_env("hyper_parameters.watch_vec_size", None, "train.model")
self.search_vec_size = envs.get_global_env("hyper_parameters.search_vec_size", None, "train.model")
self.other_feat_size = envs.get_global_env("hyper_parameters.other_feat_size", None, "train.model")
self.output_size = envs.get_global_env("hyper_parameters.output_size", None, "train.model")
self.watch_vec_size = envs.get_global_env(
"hyper_parameters.watch_vec_size", None, "train.model")
self.search_vec_size = envs.get_global_env(
"hyper_parameters.search_vec_size", None, "train.model")
self.other_feat_size = envs.get_global_env(
"hyper_parameters.other_feat_size", None, "train.model")
self.output_size = envs.get_global_env("hyper_parameters.output_size",
None, "train.model")
def generate_sample(self, line):
"""
......@@ -35,13 +39,12 @@ class TrainReader(Reader):
"""
This function needs to be implemented by the user, based on data format
"""
feature_name = ["watch_vec", "search_vec", "other_feat", "label"]
yield zip(feature_name, [np.random.rand(self.watch_vec_size).tolist()] +
[np.random.rand(self.search_vec_size).tolist()] +
[np.random.rand(self.other_feat_size).tolist()] +
[[np.random.randint(self.output_size)]] )
yield zip(feature_name,
[np.random.rand(self.watch_vec_size).tolist()] +
[np.random.rand(self.search_vec_size).tolist()] +
[np.random.rand(self.other_feat_size).tolist()] +
[[np.random.randint(self.output_size)]])
return reader
......@@ -24,4 +24,4 @@ TDM是为大规模推荐系统设计的、能承载任意先进模型来高效
- 如何组网?答:paddle封装了大量的深度学习OP,用户可以根据需求设计自己的网络结构。
- 训练数据如何组织?答:tdm的训练数据主要为:`user/query emb``item`的正样本,`item`需要映射到树的某个叶子节点。用户只需准备符合该构成的数据即可。负样本的生成,会基于用户提供的树结构,以及paddle提供的`tdm-sampler op`完成高效的负采样,并自动添加相应的label,参与tdm中深度学习模型的训练。
- 大规模的数据与模型训练如何实现?答:基于paddle优秀的大规模参数服务器分布式能力,可以实现高效的分布式训练。基于paddle-fleet api,学习门槛极低,且可以灵活的支持增量训练,流式训练等业务需求。
3. 训练好模型后,可以基于paddle,将检索与打分等流程都融入paddle的组网中,生成inference_model与参数文件,基于PaddlePaddle的预测库或者PaddleLite进行快速部署与高效检索。
\ No newline at end of file
3. 训练好模型后,可以基于paddle,将检索与打分等流程都融入paddle的组网中,生成inference_model与参数文件,基于PaddlePaddle的预测库或者PaddleLite进行快速部署与高效检索。
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -25,38 +25,38 @@ class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
# tree meta hyper parameters
self.max_layers = envs.get_global_env(
"tree_parameters.max_layers", 4, self._namespace)
self.node_nums = envs.get_global_env(
"tree_parameters.node_nums", 26, self._namespace)
self.max_layers = envs.get_global_env("tree_parameters.max_layers", 4,
self._namespace)
self.node_nums = envs.get_global_env("tree_parameters.node_nums", 26,
self._namespace)
self.leaf_node_nums = envs.get_global_env(
"tree_parameters.leaf_node_nums", 13, self._namespace)
self.output_positive = envs.get_global_env(
"tree_parameters.output_positive", True, self._namespace)
self.layer_node_num_list = envs.get_global_env(
"tree_parameters.layer_node_num_list", [
2, 4, 7, 12], self._namespace)
self.child_nums = envs.get_global_env(
"tree_parameters.child_nums", 2, self._namespace)
self.tree_layer_path = envs.get_global_env(
"tree.tree_layer_path", None, "train.startup")
"tree_parameters.layer_node_num_list", [2, 4, 7,
12], self._namespace)
self.child_nums = envs.get_global_env("tree_parameters.child_nums", 2,
self._namespace)
self.tree_layer_path = envs.get_global_env("tree.tree_layer_path",
None, "train.startup")
# model training hyper parameter
self.node_emb_size = envs.get_global_env(
"hyper_parameters.node_emb_size", 64, self._namespace)
self.input_emb_size = envs.get_global_env(
"hyper_parameters.input_emb_size", 768, self._namespace)
self.act = envs.get_global_env(
"hyper_parameters.act", "tanh", self._namespace)
self.act = envs.get_global_env("hyper_parameters.act", "tanh",
self._namespace)
self.neg_sampling_list = envs.get_global_env(
"hyper_parameters.neg_sampling_list", [
1, 2, 3, 4], self._namespace)
"hyper_parameters.neg_sampling_list", [1, 2, 3,
4], self._namespace)
# model infer hyper parameter
self.topK = envs.get_global_env(
"hyper_parameters.node_nums", 1, self._namespace)
self.batch_size = envs.get_global_env(
"batch_size", 1, "evaluate.reader")
self.topK = envs.get_global_env("hyper_parameters.node_nums", 1,
self._namespace)
self.batch_size = envs.get_global_env("batch_size", 1,
"evaluate.reader")
def train_net(self):
self.train_input()
......@@ -76,21 +76,22 @@ class Model(ModelBase):
input_emb = fluid.data(
name="input_emb",
shape=[None, self.input_emb_size],
dtype="float32",
)
dtype="float32", )
self._data_var.append(input_emb)
item_label = fluid.data(
name="item_label",
shape=[None, 1],
dtype="int64",
)
dtype="int64", )
self._data_var.append(item_label)
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def tdm_net(self):
"""
......@@ -116,8 +117,7 @@ class Model(ModelBase):
output_list=True,
seed=0,
tree_dtype='int64',
dtype='int64'
)
dtype='int64')
# 查表得到每个节点的Embedding
sample_nodes_emb = [
......@@ -125,35 +125,34 @@ class Model(ModelBase):
input=sample_nodes[i],
is_sparse=True,
size=[self.node_nums, self.node_emb_size],
param_attr=fluid.ParamAttr(
name="TDM_Tree_Emb")
) for i in range(self.max_layers)
param_attr=fluid.ParamAttr(name="TDM_Tree_Emb"))
for i in range(self.max_layers)
]
# 此处进行Reshape是为了之后层次化的分类器训练
sample_nodes_emb = [
fluid.layers.reshape(sample_nodes_emb[i],
[-1, self.neg_sampling_list[i] +
self.output_positive, self.node_emb_size]
) for i in range(self.max_layers)
fluid.layers.reshape(sample_nodes_emb[i], [
-1, self.neg_sampling_list[i] + self.output_positive,
self.node_emb_size
]) for i in range(self.max_layers)
]
# 对输入的input_emb进行转换,使其维度与node_emb维度一致
input_trans_emb = self.input_trans_layer(input_emb)
# 分类器的主体网络,分别训练不同层次的分类器
layer_classifier_res = self.classifier_layer(
input_trans_emb, sample_nodes_emb)
layer_classifier_res = self.classifier_layer(input_trans_emb,
sample_nodes_emb)
# 最后的概率判别FC,将所有层次的node分类结果放到一起以相同的标准进行判别
# 考虑到树极大可能不平衡,有些item不在最后一层,所以需要这样的机制保证每个item都有机会被召回
tdm_fc = fluid.layers.fc(input=layer_classifier_res,
size=2,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="tdm.cls_fc.weight"),
bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
tdm_fc = fluid.layers.fc(
input=layer_classifier_res,
size=2,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name="tdm.cls_fc.weight"),
bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
# 将loss打平,放到一起计算整体网络的loss
tdm_fc_re = fluid.layers.reshape(tdm_fc, [-1, 2])
......@@ -202,7 +201,7 @@ class Model(ModelBase):
def metrics(self):
auc, batch_auc, _ = fluid.layers.auc(input=self._predict,
label=self.mask_label,
num_thresholds=2 ** 12,
num_thresholds=2**12,
slide_steps=20)
self._metrics["AUC"] = auc
self._metrics["BATCH_AUC"] = batch_auc
......@@ -218,8 +217,7 @@ class Model(ModelBase):
size=self.node_emb_size,
act=None,
param_attr=fluid.ParamAttr(name="trans.input_fc.weight"),
bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"),
)
bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), )
# 将input_emb映射到各个不同层次的向量表示空间
input_layer_fc_out = [
......@@ -229,8 +227,9 @@ class Model(ModelBase):
act=self.act,
param_attr=fluid.ParamAttr(
name="trans.layer_fc.weight." + str(i)),
bias_attr=fluid.ParamAttr(name="trans.layer_fc.bias." + str(i)),
) for i in range(self.max_layers)
bias_attr=fluid.ParamAttr(
name="trans.layer_fc.bias." + str(i)), )
for i in range(self.max_layers)
]
return input_layer_fc_out
......@@ -246,20 +245,22 @@ class Model(ModelBase):
input_layer_unsequeeze, expand_times=[1, node.shape[1], 1])
else:
input_layer_expand = fluid.layers.expand(
input_layer_unsequeeze, expand_times=[1, node[layer_idx].shape[1], 1])
input_layer_unsequeeze,
expand_times=[1, node[layer_idx].shape[1], 1])
return input_layer_expand
def classifier_layer(self, input, node):
# 扩展input,使维度与node匹配
input_expand = [
self._expand_layer(input[i], node, i) for i in range(self.max_layers)
self._expand_layer(input[i], node, i)
for i in range(self.max_layers)
]
# 将input_emb与node_emb concat到一起过分类器FC
input_node_concat = [
fluid.layers.concat(
input=[input_expand[i], node[i]],
axis=2) for i in range(self.max_layers)
input=[input_expand[i], node[i]], axis=2)
for i in range(self.max_layers)
]
hidden_states_fc = [
fluid.layers.fc(
......@@ -269,8 +270,8 @@ class Model(ModelBase):
act=self.act,
param_attr=fluid.ParamAttr(
name="cls.concat_fc.weight." + str(i)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i))
) for i in range(self.max_layers)
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i)))
for i in range(self.max_layers)
]
# 如果将所有层次的node放到一起计算loss,则需要在此处concat
......@@ -285,12 +286,14 @@ class Model(ModelBase):
input_emb = fluid.layers.data(
name="input_emb",
shape=[self.input_emb_size],
dtype="float32",
)
dtype="float32", )
self._infer_data_var.append(input_emb)
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def get_layer_list(self):
"""get layer list from layer_list.txt"""
......@@ -318,10 +321,12 @@ class Model(ModelBase):
node_list = []
mask_list = []
for id in first_layer_node:
node_list.append(fluid.layers.fill_constant(
[self.batch_size, 1], value=int(id), dtype='int64'))
mask_list.append(fluid.layers.fill_constant(
[self.batch_size, 1], value=0, dtype='int64'))
node_list.append(
fluid.layers.fill_constant(
[self.batch_size, 1], value=int(id), dtype='int64'))
mask_list.append(
fluid.layers.fill_constant(
[self.batch_size, 1], value=0, dtype='int64'))
self.first_layer_node = fluid.layers.concat(node_list, axis=1)
self.first_layer_node_mask = fluid.layers.concat(mask_list, axis=1)
......@@ -359,28 +364,26 @@ class Model(ModelBase):
size=[self.node_nums, self.node_emb_size],
param_attr=fluid.ParamAttr(name="TDM_Tree_Emb"))
input_fc_out = self.layer_fc_infer(
input_trans_emb, layer_idx)
input_fc_out = self.layer_fc_infer(input_trans_emb, layer_idx)
# 过每一层的分类器
layer_classifier_res = self.classifier_layer_infer(input_fc_out,
node_emb,
layer_idx)
layer_classifier_res = self.classifier_layer_infer(
input_fc_out, node_emb, layer_idx)
# 过最终的判别分类器
tdm_fc = fluid.layers.fc(input=layer_classifier_res,
size=2,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="tdm.cls_fc.weight"),
bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
tdm_fc = fluid.layers.fc(
input=layer_classifier_res,
size=2,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name="tdm.cls_fc.weight"),
bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
prob = fluid.layers.softmax(tdm_fc)
positive_prob = fluid.layers.slice(
prob, axes=[2], starts=[1], ends=[2])
prob_re = fluid.layers.reshape(
positive_prob, [-1, current_layer_node_num])
prob_re = fluid.layers.reshape(positive_prob,
[-1, current_layer_node_num])
# 过滤掉padding产生的无效节点(node_id=0)
node_zero_mask = fluid.layers.cast(current_layer_node, 'bool')
......@@ -395,11 +398,11 @@ class Model(ModelBase):
# index_sample op根据下标索引tensor对应位置的值
# 若paddle版本>2.0,调用方式为paddle.index_sample
top_node = fluid.contrib.layers.index_sample(
current_layer_node, topk_i)
top_node = fluid.contrib.layers.index_sample(current_layer_node,
topk_i)
prob_re_mask = prob_re * current_layer_node_mask # 过滤掉非叶子节点
topk_value = fluid.contrib.layers.index_sample(
prob_re_mask, topk_i)
topk_value = fluid.contrib.layers.index_sample(prob_re_mask,
topk_i)
node_score.append(topk_value)
node_list.append(top_node)
......@@ -424,7 +427,8 @@ class Model(ModelBase):
res_node = fluid.layers.reshape(res_layer_node, [-1, self.topK, 1])
# 利用Tree_info信息,将node_id转换为item_id
tree_info = fluid.default_main_program().global_block().var("TDM_Tree_Info")
tree_info = fluid.default_main_program().global_block().var(
"TDM_Tree_Info")
res_node_emb = fluid.layers.gather_nd(tree_info, res_node)
res_item = fluid.layers.slice(
......@@ -442,8 +446,7 @@ class Model(ModelBase):
size=self.node_emb_size,
act=None,
param_attr=fluid.ParamAttr(name="trans.input_fc.weight"),
bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"),
)
bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), )
return input_fc_out
def layer_fc_infer(self, input_fc_out, layer_idx):
......@@ -458,8 +461,7 @@ class Model(ModelBase):
param_attr=fluid.ParamAttr(
name="trans.layer_fc.weight." + str(layer_idx)),
bias_attr=fluid.ParamAttr(
name="trans.layer_fc.bias." + str(layer_idx)),
)
name="trans.layer_fc.bias." + str(layer_idx)), )
return input_layer_fc_out
def classifier_layer_infer(self, input, node, layer_idx):
......@@ -480,5 +482,6 @@ class Model(ModelBase):
act=self.act,
param_attr=fluid.ParamAttr(
name="cls.concat_fc.weight." + str(layer_idx)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(layer_idx)))
bias_attr=fluid.ParamAttr(
name="cls.concat_fc.bias." + str(layer_idx)))
return hidden_states_fc
1,2
3,4,5,6
7,8,9,10,11,12,13
14,15,16,17,18,19,20,21,22,23,24,25
\ No newline at end of file
14,15,16,17,18,19,20,21,22,23,24,25
......@@ -26,8 +26,10 @@ from paddlerec.core.utils import util
engines = {}
device = ["CPU", "GPU"]
clusters = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER"]
engine_choices = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER",
"TDM_SINGLE", "TDM_LOCAL_CLUSTER", "TDM_CLUSTER"]
engine_choices = [
"SINGLE", "LOCAL_CLUSTER", "CLUSTER", "TDM_SINGLE", "TDM_LOCAL_CLUSTER",
"TDM_CLUSTER"
]
custom_model = ['TDM']
model_name = ""
......@@ -66,7 +68,8 @@ def get_engine(args):
engine = engine.upper()
if engine not in engine_choices:
raise ValueError("train.engin can not be chosen in {}".format(engine_choices))
raise ValueError("train.engin can not be chosen in {}".format(
engine_choices))
print("engines: \n{}".format(engines))
......@@ -77,8 +80,10 @@ def get_engine(args):
def get_transpiler():
FNULL = open(os.devnull, 'w')
cmd = ["python", "-c",
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"]
cmd = [
"python", "-c",
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"
]
proc = subprocess.Popen(cmd, stdout=FNULL, stderr=FNULL, cwd=os.getcwd())
ret = proc.wait()
if ret == -11:
......@@ -152,7 +157,8 @@ def cluster_engine(args):
update_workspace(flattens)
envs.set_runtime_environs(flattens)
print(envs.pretty_print_envs(flattens, ("Submit Runtime Envs", "Value")))
print(envs.pretty_print_envs(flattens, ("Submit Runtime Envs", "Value"
)))
launch = ClusterEngine(None, args.model)
return launch
......@@ -163,7 +169,8 @@ def cluster_engine(args):
cluster_envs = {}
cluster_envs["train.trainer.trainer"] = trainer
cluster_envs["train.trainer.engine"] = "cluster"
cluster_envs["train.trainer.threads"] = envs.get_runtime_environ("CPU_NUM")
cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
"CPU_NUM")
cluster_envs["train.trainer.platform"] = envs.get_platform()
print("launch {} engine with cluster to with model: {}".format(
trainer, args.model))
......@@ -181,7 +188,8 @@ def cluster_engine(args):
def cluster_mpi_engine(args):
print("launch cluster engine with cluster to run model: {}".format(args.model))
print("launch cluster engine with cluster to run model: {}".format(
args.model))
cluster_envs = {}
cluster_envs["train.trainer.trainer"] = "CtrCodingTrainer"
......@@ -209,7 +217,8 @@ def local_cluster_engine(args):
cluster_envs["train.trainer.platform"] = envs.get_platform()
cluster_envs["CPU_NUM"] = "2"
print("launch {} engine with cluster to run model: {}".format(trainer, args.model))
print("launch {} engine with cluster to run model: {}".format(trainer,
args.model))
set_runtime_envs(cluster_envs, args.model)
launch = LocalClusterEngine(cluster_envs, args.model)
......@@ -217,10 +226,12 @@ def local_cluster_engine(args):
def local_mpi_engine(args):
print("launch cluster engine with cluster to run model: {}".format(args.model))
print("launch cluster engine with cluster to run model: {}".format(
args.model))
from paddlerec.core.engine.local_mpi import LocalMPIEngine
print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(args.model))
print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(
args.model))
mpi = util.run_which("mpirun")
if not mpi:
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
setup for paddle-rec.
"""
......@@ -22,11 +21,7 @@ from setuptools import setup, find_packages
import shutil
import tempfile
requires = [
"paddlepaddle == 1.7.2",
"pyyaml >= 5.1.1"
]
requires = ["paddlepaddle == 1.7.2", "pyyaml >= 5.1.1"]
about = {}
about["__title__"] = "paddle-rec"
......@@ -48,18 +43,27 @@ def build(dirname):
package_dir = os.path.dirname(os.path.abspath(__file__))
run_cmd("cp -r {}/* {}".format(package_dir, dirname))
run_cmd("mkdir {}".format(os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "core"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "doc"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "models"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "tests"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "tools"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "*.py"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "core"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "doc"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "models"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "tests"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "tools"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "*.py"), os.path.join(dirname, "paddlerec")))
packages = find_packages(dirname, include=('paddlerec.*'))
package_dir = {'': dirname}
package_data = {}
models_copy = ['data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy', 'tree/*.txt']
models_copy = [
'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy',
'tree/*.txt'
]
engine_copy = ['*/*.sh']
for package in packages:
if package.startswith("paddlerec.models."):
......@@ -80,8 +84,7 @@ def build(dirname):
package_data=package_data,
python_requires=">=2.7",
install_requires=requires,
zip_safe=False
)
zip_safe=False)
dirname = tempfile.mkdtemp()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/env bash
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#=================================================
# Utils
#=================================================
set -ex
function init() {
RED='\033[0;31m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NONE='\033[0m'
ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
}
function check_style() {
set -e
export PATH=/usr/bin:$PATH
pre-commit install
if ! pre-commit run -a; then
git diff
exit 1
fi
exit 0
}
function main() {
local CMD=$1
init
case $CMD in
check_style)
check_style
;;
*)
echo "build failed"
exit 1
;;
esac
echo "check_style finished as expected"
}
main $@
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import io, re
import sys, os
import subprocess
import platform
COPYRIGHT = '''
Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
LANG_COMMENT_MARK = None
NEW_LINE_MARK = None
COPYRIGHT_HEADER = None
if platform.system() == "Windows":
NEW_LINE_MARK = "\r\n"
else:
NEW_LINE_MARK = '\n'
COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
date, err = process.communicate()
date = date.decode("utf-8").rstrip("\n")
COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
def generate_copyright(template, lang='C'):
if lang == 'Python':
LANG_COMMENT_MARK = '#'
else:
LANG_COMMENT_MARK = "//"
lines = template.split(NEW_LINE_MARK)
BLANK = " "
ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
for lino, line in enumerate(lines):
if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
if len(line) == 0:
BLANK = ""
else:
BLANK = " "
ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
return ans + "\n"
def lang_type(filename):
if filename.endswith(".py"):
return "Python"
elif filename.endswith(".h"):
return "C"
elif filename.endswith(".c"):
return "C"
elif filename.endswith(".hpp"):
return "C"
elif filename.endswith(".cc"):
return "C"
elif filename.endswith(".cpp"):
return "C"
elif filename.endswith(".cu"):
return "C"
elif filename.endswith(".cuh"):
return "C"
elif filename.endswith(".go"):
return "C"
elif filename.endswith(".proto"):
return "C"
else:
print("Unsupported filetype %s", filename)
exit(0)
PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
def main(argv=None):
parser = argparse.ArgumentParser(
description='Checker for copyright declaration.')
parser.add_argument('filenames', nargs='*', help='Filenames to check')
args = parser.parse_args(argv)
retv = 0
for filename in args.filenames:
fd = io.open(filename, encoding="utf-8")
first_line = fd.readline()
second_line = fd.readline()
if "COPYRIGHT (C)" in first_line.upper(): continue
if first_line.startswith("#!") or PYTHON_ENCODE.match(
second_line) != None or PYTHON_ENCODE.match(first_line) != None:
continue
original_contents = io.open(filename, encoding="utf-8").read()
new_contents = generate_copyright(
COPYRIGHT, lang_type(filename)) + original_contents
print('Auto Insert Copyright Header {}'.format(filename))
retv = 1
with io.open(filename, 'w') as output_file:
output_file.write(new_contents)
return retv
if __name__ == '__main__':
exit(main())
#!/bin/bash
TOTAL_ERRORS=0
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
export PYTHONPATH=$DIR:$PYTHONPATH
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do
pylint --disable=all --load-plugins=docstring_checker \
--enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
done
exit $TOTAL_ERRORS
#For now, just warning:
#exit 0
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册