Unverified commit 633ecc2c authored by Dong Daxiang and committed by GitHub

Merge pull request #9 from seiriosPlus/travis

Travis
repos:
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
sha: v1.0.1
hooks:
- id: remove-crlf
files: (?!.*third_party)^.*$ | (?!.*book)^.*$
- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
hooks:
- id: yapf
files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
- repo: https://github.com/pre-commit/pre-commit-hooks
sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
hooks:
- id: check-added-large-files
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
files: (?!.*third_party)^.*$ | (?!.*book)^.*$
- id: end-of-file-fixer
- repo: local
hooks:
- id: copyright_checker
name: copyright_checker
entry: python ./tools/codestyle/copyright.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
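The pre-commit configuration above wires CRLF, yapf, large-file, merge-conflict, symlink, private-key and copyright checks into git hooks. A minimal sketch of running the same hooks locally from Python, assuming the pre-commit CLI (installed in the CI step further below) is on PATH:

# Illustrative sketch only: invoke the configured hooks over the whole tree.
import subprocess

result = subprocess.run(["pre-commit", "run", "--all-files"])
if result.returncode != 0:
    raise SystemExit("pre-commit hooks reported problems")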
language: generic
sudo: required
dist: trusty
services:
- docker
os:
- linux
env:
- JOB=check_style
before_install:
# For pylint docstring checker
- sudo pip install pylint pytest astroid isort pre-commit
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
- "travis_wait 30 sleep 1800 &"
- |
# 43min timeout
tools/build_script.sh ${JOB}
if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
notifications:
email:
on_success: change
on_failure: always
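The script section above starts a background `travis_wait 30 sleep 1800` to keep the job alive, runs `tools/build_script.sh ${JOB}`, and treats exit code 0 or 142 as success; 142 is presumably 128 + SIGALRM (14), i.e. the build being killed by the perl-based timeout() helper. A minimal sketch of that pass/fail rule, with the script path taken from above and the invocation otherwise hypothetical:

# Illustrative sketch: exit code 0 passes, and a SIGALRM timeout (142) is tolerated.
import subprocess

proc = subprocess.run(["tools/build_script.sh", "check_style"])
if proc.returncode not in (0, 142):
    raise SystemExit(proc.returncode)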
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -27,6 +27,7 @@ from paddlerec.core.utils import envs
class ClusterEngine(Engine):
def __init_impl__(self):
abs_dir = os.path.dirname(os.path.abspath(__file__))
backend = envs.get_runtime_environ("engine_backend")
if backend == "PaddleCloud":
self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
......@@ -57,4 +58,5 @@ class ClusterEngine(Engine):
self.start_worker_procs()
else:
raise ValueError("role {} error, must in MASTER/WORKER".format(role))
raise ValueError("role {} error, must in MASTER/WORKER".format(
role))
......@@ -46,10 +46,13 @@ class LocalClusterEngine(Engine):
ports.append(new_port)
break
user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
user_endpoints_ips = [x.split(":")[0]
for x in user_endpoints.split(",")]
user_endpoints_port = [x.split(":")[1]
for x in user_endpoints.split(",")]
user_endpoints_ips = [
x.split(":")[0] for x in user_endpoints.split(",")
]
user_endpoints_port = [
x.split(":")[1] for x in user_endpoints.split(",")
]
factory = "paddlerec.core.factory"
cmd = [sys.executable, "-u", "-m", factory, self.trainer]
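The endpoint bookkeeping in LocalClusterEngine above joins the chosen ports into a comma-separated endpoint string and then splits it back into IP and port lists. A self-contained sketch of that round trip, with made-up port numbers:

# Illustrative sketch of the endpoint round trip; the ports are invented.
ports = [36001, 36002]
user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
user_endpoints_ips = [x.split(":")[0] for x in user_endpoints.split(",")]
user_endpoints_port = [x.split(":")[1] for x in user_endpoints.split(",")]
assert user_endpoints_ips == ["127.0.0.1", "127.0.0.1"]
assert user_endpoints_port == ["36001", "36002"]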
......@@ -97,8 +100,10 @@ class LocalClusterEngine(Engine):
if len(log_fns) > 0:
log_fns[i].close()
procs[i].terminate()
print("all workers already completed, you can view logs under the `{}` directory".format(logs_dir),
file=sys.stderr)
print(
"all workers already completed, you can view logs under the `{}` directory".
format(logs_dir),
file=sys.stderr)
def run(self):
self.start_procs()
......@@ -26,7 +26,6 @@ from paddlerec.core.engine.engine import Engine
class LocalMPIEngine(Engine):
def start_procs(self):
logs_dir = self.envs["log_dir"]
default_env = os.environ.copy()
current_env = copy.copy(default_env)
current_env.pop("http_proxy", None)
......@@ -42,7 +41,8 @@ class LocalMPIEngine(Engine):
os.system("mkdir -p {}".format(logs_dir))
fn = open("%s/job.log" % logs_dir, "w")
log_fns.append(fn)
proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd())
proc = subprocess.Popen(
cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd())
else:
proc = subprocess.Popen(cmd, env=current_env, cwd=os.getcwd())
procs.append(proc)
......@@ -51,7 +51,9 @@ class LocalMPIEngine(Engine):
if len(log_fns) > 0:
log_fns[i].close()
procs[i].wait()
print("all workers and parameter servers already completed", file=sys.stderr)
print(
"all workers and parameter servers already completed",
file=sys.stderr)
def run(self):
self.start_procs()
......@@ -19,24 +19,23 @@ import yaml
from paddlerec.core.utils import envs
trainer_abs = os.path.join(os.path.dirname(
os.path.abspath(__file__)), "trainers")
trainer_abs = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "trainers")
trainers = {}
def trainer_registry():
trainers["SingleTrainer"] = os.path.join(
trainer_abs, "single_trainer.py")
trainers["ClusterTrainer"] = os.path.join(
trainer_abs, "cluster_trainer.py")
trainers["CtrCodingTrainer"] = os.path.join(
trainer_abs, "ctr_coding_trainer.py")
trainers["CtrModulTrainer"] = os.path.join(
trainer_abs, "ctr_modul_trainer.py")
trainers["TDMSingleTrainer"] = os.path.join(
trainer_abs, "tdm_single_trainer.py")
trainers["TDMClusterTrainer"] = os.path.join(
trainer_abs, "tdm_cluster_trainer.py")
trainers["SingleTrainer"] = os.path.join(trainer_abs, "single_trainer.py")
trainers["ClusterTrainer"] = os.path.join(trainer_abs,
"cluster_trainer.py")
trainers["CtrCodingTrainer"] = os.path.join(trainer_abs,
"ctr_coding_trainer.py")
trainers["CtrModulTrainer"] = os.path.join(trainer_abs,
"ctr_modul_trainer.py")
trainers["TDMSingleTrainer"] = os.path.join(trainer_abs,
"tdm_single_trainer.py")
trainers["TDMClusterTrainer"] = os.path.join(trainer_abs,
"tdm_cluster_trainer.py")
trainer_registry()
......@@ -55,8 +54,8 @@ class TrainerFactory(object):
if trainer_abs is None:
if not os.path.isfile(train_mode):
raise IOError(
"trainer {} can not be recognized".format(train_mode))
raise IOError("trainer {} can not be recognized".format(
train_mode))
trainer_abs = train_mode
train_mode = "UserDefineTrainer"
......
......@@ -22,7 +22,7 @@ from paddlerec.core.metric import Metric
class AUCMetric(Metric):
"""
Metric For Paddle Model
Metric For Fluid Model
"""
def __init__(self, config, fleet):
......@@ -83,7 +83,8 @@ class AUCMetric(Metric):
if scope.find_var(metric_item['var'].name) is None:
result[metric_name] = None
continue
result[metric_name] = self.get_metric(scope, metric_item['var'].name)
result[metric_name] = self.get_metric(scope,
metric_item['var'].name)
return result
def calculate_auc(self, global_pos, global_neg):
......@@ -178,14 +179,18 @@ class AUCMetric(Metric):
self._result['mean_q'] = 0
return self._result
if 'stat_pos' in result and 'stat_neg' in result:
result['auc'] = self.calculate_auc(result['stat_pos'], result['stat_neg'])
result['bucket_error'] = self.calculate_auc(result['stat_pos'], result['stat_neg'])
result['auc'] = self.calculate_auc(result['stat_pos'],
result['stat_neg'])
result['bucket_error'] = self.calculate_auc(result['stat_pos'],
result['stat_neg'])
if 'pos_ins_num' in result:
result['actual_ctr'] = result['pos_ins_num'] / result['total_ins_num']
result['actual_ctr'] = result['pos_ins_num'] / result[
'total_ins_num']
if 'abserr' in result:
result['mae'] = result['abserr'] / result['total_ins_num']
if 'sqrerr' in result:
result['rmse'] = math.sqrt(result['sqrerr'] / result['total_ins_num'])
result['rmse'] = math.sqrt(result['sqrerr'] /
result['total_ins_num'])
if 'prob' in result:
result['predict_ctr'] = result['prob'] / result['total_ins_num']
if abs(result['predict_ctr']) > 1e-6:
......
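The branch above derives actual_ctr, mae, rmse and predict_ctr from the accumulated counters. A worked sketch with invented accumulator values:

# Illustrative sketch of the derived metrics; the counter values are invented.
import math

result = {'pos_ins_num': 25.0, 'total_ins_num': 100.0,
          'abserr': 30.0, 'sqrerr': 16.0, 'prob': 24.0}
result['actual_ctr'] = result['pos_ins_num'] / result['total_ins_num']  # 0.25
result['mae'] = result['abserr'] / result['total_ins_num']              # 0.3
result['rmse'] = math.sqrt(result['sqrerr'] / result['total_ins_num'])  # 0.4
result['predict_ctr'] = result['prob'] / result['total_ins_num']        # 0.24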
......@@ -20,7 +20,7 @@ from paddlerec.core.utils import envs
class Model(object):
"""R
"""Base Model
"""
__metaclass__ = abc.ABCMeta
......@@ -39,32 +39,43 @@ class Model(object):
self._platform = envs.get_platform()
def _init_slots(self):
sparse_slots = envs.get_global_env("sparse_slots", None, "train.reader")
sparse_slots = envs.get_global_env("sparse_slots", None,
"train.reader")
dense_slots = envs.get_global_env("dense_slots", None, "train.reader")
if sparse_slots is not None or dense_slots is not None:
sparse_slots = sparse_slots.strip().split(" ")
dense_slots = dense_slots.strip().split(" ")
dense_slots_shape = [[int(j) for j in i.split(":")[1].strip("[]").split(",")] for i in dense_slots]
dense_slots_shape = [[
int(j) for j in i.split(":")[1].strip("[]").split(",")
] for i in dense_slots]
dense_slots = [i.split(":")[0] for i in dense_slots]
self._dense_data_var = []
for i in range(len(dense_slots)):
l = fluid.layers.data(name=dense_slots[i], shape=dense_slots_shape[i], dtype="float32")
l = fluid.layers.data(
name=dense_slots[i],
shape=dense_slots_shape[i],
dtype="float32")
self._data_var.append(l)
self._dense_data_var.append(l)
self._sparse_data_var = []
for name in sparse_slots:
l = fluid.layers.data(name=name, shape=[1], lod_level=1, dtype="int64")
l = fluid.layers.data(
name=name, shape=[1], lod_level=1, dtype="int64")
self._data_var.append(l)
self._sparse_data_var.append(l)
dataset_class = envs.get_global_env("dataset_class", None, "train.reader")
dataset_class = envs.get_global_env("dataset_class", None,
"train.reader")
if dataset_class == "DataLoader":
self._init_dataloader()
def _init_dataloader(self):
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def get_inputs(self):
return self._data_var
......@@ -96,8 +107,8 @@ class Model(object):
"configured optimizer can only supported SGD/Adam/Adagrad")
if name == "SGD":
reg = envs.get_global_env(
"hyper_parameters.reg", 0.0001, self._namespace)
reg = envs.get_global_env("hyper_parameters.reg", 0.0001,
self._namespace)
optimizer_i = fluid.optimizer.SGD(
lr, regularization=fluid.regularizer.L2DecayRegularizer(reg))
elif name == "ADAM":
......@@ -111,10 +122,10 @@ class Model(object):
return optimizer_i
def optimizer(self):
learning_rate = envs.get_global_env(
"hyper_parameters.learning_rate", None, self._namespace)
optimizer = envs.get_global_env(
"hyper_parameters.optimizer", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = envs.get_global_env("hyper_parameters.optimizer", None,
self._namespace)
print(">>>>>>>>>>>.learning rate: %s" % learning_rate)
return self._build_optimizer(optimizer, learning_rate)
......
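The _init_slots method above parses space-separated slot declarations, where each dense slot carries its shape in `name:[dims]` form. A standalone sketch of that parsing with hypothetical slot strings:

# Illustrative sketch of the slot-string parsing; the declarations are hypothetical.
sparse_slots = "click 1 2 3".strip().split(" ")
dense_slots = "dense_var:[13] wide_var:[8,2]".strip().split(" ")
dense_slots_shape = [[int(j) for j in i.split(":")[1].strip("[]").split(",")]
                     for i in dense_slots]
dense_slots = [i.split(":")[0] for i in dense_slots]
assert dense_slots == ["dense_var", "wide_var"]
assert dense_slots_shape == [[13], [8, 2]]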
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -31,6 +31,7 @@ def create(config):
Model Instance
"""
model = None
if config['mode'] == 'fluid':
model = YamlModel(config)
model.train_net()
......@@ -50,7 +51,12 @@ class YamlModel(Model):
f = open(config['layer_file'], 'r')
self._build_nodes = yaml.safe_load(f.read())
self._build_phase = ['input', 'param', 'summary', 'layer']
self._build_param = {'layer': {}, 'inner_layer': {}, 'layer_extend': {}, 'model': {}}
self._build_param = {
'layer': {},
'inner_layer': {},
'layer_extend': {},
'model': {}
}
self._inference_meta = {'dependency': {}, 'params': {}}
def train_net(self):
......@@ -76,10 +82,12 @@ class YamlModel(Model):
if self._build_nodes[phase] is None:
continue
for node in self._build_nodes[phase]:
exec("""layer=layer.{}(node)""".format(node['class']))
layer_output, extend_output = layer.generate(self._config['mode'], self._build_param)
exec ("""layer=layer.{}(node)""".format(node['class']))
layer_output, extend_output = layer.generate(
self._config['mode'], self._build_param)
self._build_param['layer'][node['name']] = layer_output
self._build_param['layer_extend'][node['name']] = extend_output
self._build_param['layer_extend'][node[
'name']] = extend_output
if extend_output is None:
continue
if 'loss' in extend_output:
......@@ -89,17 +97,24 @@ class YamlModel(Model):
self._cost += extend_output['loss']
if 'data_var' in extend_output:
self._data_var += extend_output['data_var']
if 'metric_label' in extend_output and extend_output['metric_label'] is not None:
self._metrics[extend_output['metric_label']] = extend_output['metric_dict']
if 'metric_label' in extend_output and extend_output[
'metric_label'] is not None:
self._metrics[extend_output[
'metric_label']] = extend_output['metric_dict']
if 'inference_param' in extend_output:
inference_param = extend_output['inference_param']
param_name = inference_param['name']
if param_name not in self._build_param['table']:
self._build_param['table'][param_name] = {'params': []}
table_meta = table.TableMeta.alloc_new_table(inference_param['table_id'])
self._build_param['table'][param_name]['_meta'] = table_meta
self._build_param['table'][param_name]['params'] += inference_param['params']
self._build_param['table'][param_name] = {
'params': []
}
table_meta = table.TableMeta.alloc_new_table(
inference_param['table_id'])
self._build_param['table'][param_name][
'_meta'] = table_meta
self._build_param['table'][param_name][
'params'] += inference_param['params']
pass
@classmethod
......@@ -114,20 +129,25 @@ class YamlModel(Model):
metrics = params['metrics']
for name in metrics:
model_metrics = metrics[name]
stat_var_names += [model_metrics[metric]['var'].name for metric in model_metrics]
stat_var_names += [
model_metrics[metric]['var'].name
for metric in model_metrics
]
strategy['stat_var_names'] = list(set(stat_var_names))
optimizer_generator = 'optimizer = fluid.optimizer.' + optimizer_conf['class'] + \
'(learning_rate=' + str(optimizer_conf['learning_rate']) + ')'
exec(optimizer_generator)
exec (optimizer_generator)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
return optimizer
def dump_model_program(self, path):
"""R
"""
with open(path + '/' + self._name + '_main_program.pbtxt', "w") as fout:
with open(path + '/' + self._name + '_main_program.pbtxt',
"w") as fout:
print >> fout, self._build_param['model']['train_program']
with open(path + '/' + self._name + '_startup_program.pbtxt', "w") as fout:
with open(path + '/' + self._name + '_startup_program.pbtxt',
"w") as fout:
print >> fout, self._build_param['model']['startup_program']
pass
......@@ -137,7 +157,8 @@ class YamlModel(Model):
scope = params['scope']
decay = params['decay']
for param_table in self._build_param['table']:
table_id = self._build_param['table'][param_table]['_meta']._table_id
table_id = self._build_param['table'][param_table][
'_meta']._table_id
fleet.shrink_dense_table(decay, scope=scope, table_id=table_id)
def dump_inference_program(self, inference_layer, path):
......@@ -152,17 +173,25 @@ class YamlModel(Model):
executor = params['executor']
program = self._build_param['model']['train_program']
for table_name, table in self._build_param['table'].items():
fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id, table['params'])
fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id,
table['params'])
for infernce_item in params['inference_list']:
params_name_list = self.inference_params(infernce_item['layer_name'])
params_var_list = [program.global_block().var(i) for i in params_name_list]
params_name_list = self.inference_params(infernce_item[
'layer_name'])
params_var_list = [
program.global_block().var(i) for i in params_name_list
]
params_file_name = infernce_item['save_file_name']
with fluid.scope_guard(scope):
if params['save_combine']:
fluid.io.save_vars(executor, "./", \
program, vars=params_var_list, filename=params_file_name)
else:
fluid.io.save_vars(executor, params_file_name, program, vars=params_var_list)
fluid.io.save_vars(
executor,
params_file_name,
program,
vars=params_var_list)
def inference_params(self, inference_layer):
"""
......@@ -177,11 +206,13 @@ class YamlModel(Model):
return self._inference_meta['params'][layer]
self._inference_meta['params'][layer] = []
self._inference_meta['dependency'][layer] = self.get_dependency(self._build_param['inner_layer'], layer)
self._inference_meta['dependency'][layer] = self.get_dependency(
self._build_param['inner_layer'], layer)
for node in self._build_nodes['layer']:
if node['name'] not in self._inference_meta['dependency'][layer]:
continue
if 'inference_param' in self._build_param['layer_extend'][node['name']]:
if 'inference_param' in self._build_param['layer_extend'][node[
'name']]:
self._inference_meta['params'][layer] += \
self._build_param['layer_extend'][node['name']]['inference_param']['params']
return self._inference_meta['params'][layer]
......@@ -199,5 +230,6 @@ class YamlModel(Model):
dependencys = copy.deepcopy(layer_graph[dest_layer]['input'])
dependency_list = copy.deepcopy(dependencys)
for dependency in dependencys:
dependency_list = dependency_list + self.get_dependency(layer_graph, dependency)
dependency_list = dependency_list + self.get_dependency(
layer_graph, dependency)
return list(set(dependency_list))
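get_dependency above walks the layer graph recursively and de-duplicates the collected inputs. A standalone sketch (the method rewritten as a plain function) on a hypothetical three-layer graph:

# Illustrative sketch of the recursive dependency walk; the graph is hypothetical.
import copy

def get_dependency(layer_graph, dest_layer):
    dependencys = copy.deepcopy(layer_graph[dest_layer]['input'])
    dependency_list = copy.deepcopy(dependencys)
    for dependency in dependencys:
        dependency_list = dependency_list + get_dependency(layer_graph, dependency)
    return list(set(dependency_list))

layer_graph = {'emb': {'input': []},
               'fc1': {'input': ['emb']},
               'fc2': {'input': ['fc1']}}
assert sorted(get_dependency(layer_graph, 'fc2')) == ['emb', 'fc1']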
......@@ -18,7 +18,7 @@ from paddlerec.core.layer import Layer
class EmbeddingFuseLayer(Layer):
"""R
"""embedding + sequence + concat
"""
def __init__(self, config):
......@@ -40,7 +40,8 @@ class EmbeddingFuseLayer(Layer):
show_clk.stop_gradient = True
data_var = []
for slot in self._slots:
l = fluid.layers.data(name=slot, shape=[1], dtype="int64", lod_level=1)
l = fluid.layers.data(
name=slot, shape=[1], dtype="int64", lod_level=1)
data_var.append(l)
emb = fluid.layers.embedding(input=l, size=[10, self._emb_dim], \
is_sparse=True, is_distributed=True,
......@@ -48,7 +49,8 @@ class EmbeddingFuseLayer(Layer):
emb = fluid.layers.sequence_pool(input=emb, pool_type='sum')
emb = fluid.layers.continuous_value_model(emb, show_clk, self._cvm)
self._emb_layers.append(emb)
output = fluid.layers.concat(input=self._emb_layers, axis=1, name=self._name)
output = fluid.layers.concat(
input=self._emb_layers, axis=1, name=self._name)
return output, {'data_var': data_var}
......@@ -111,7 +113,13 @@ class ParamLayer(Layer):
def generate(self, param):
"""R
"""
return self._config, {'inference_param': {'name': 'param', 'params': [], 'table_id': self._table_id}}
return self._config, {
'inference_param': {
'name': 'param',
'params': [],
'table_id': self._table_id
}
}
class SummaryLayer(Layer):
......@@ -129,7 +137,13 @@ class SummaryLayer(Layer):
def generate(self, param):
"""R
"""
return self._config, {'inference_param': {'name': 'summary', 'params': [], 'table_id': self._table_id}}
return self._config, {
'inference_param': {
'name': 'summary',
'params': [],
'table_id': self._table_id
}
}
class NormalizationLayer(Layer):
......@@ -152,9 +166,19 @@ class NormalizationLayer(Layer):
if len(self._input) > 0:
input_list = [param['layer'][i] for i in self._input]
input_layer = fluid.layers.concat(input=input_list, axis=1)
bn = fluid.layers.data_norm(input=input_layer, name=self._name, epsilon=1e-4, param_attr={
"batch_size": 1e4, "batch_sum_default": 0.0, "batch_square": 1e4})
inference_param = [self._name + '.batch_size', self._name + '.batch_sum', self._name + '.batch_square_sum']
bn = fluid.layers.data_norm(
input=input_layer,
name=self._name,
epsilon=1e-4,
param_attr={
"batch_size": 1e4,
"batch_sum_default": 0.0,
"batch_square": 1e4
})
inference_param = [
self._name + '.batch_size', self._name + '.batch_sum',
self._name + '.batch_square_sum'
]
return bn, {'inference_param': {'name': 'summary', \
'params': inference_param, 'table_id': summary_layer.get('table_id', -1)}}
......@@ -181,11 +205,13 @@ class FCLayer(Layer):
input_list = [param['layer'][i] for i in self._input]
input_layer = fluid.layers.concat(input=input_list, axis=1)
input_coln = input_layer.shape[1]
scale = param_layer['init_range'] / (input_coln ** 0.5)
scale = param_layer['init_range'] / (input_coln**0.5)
bias = None
if self._bias:
bias = fluid.ParamAttr(learning_rate=1.0,
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=scale))
bias = fluid.ParamAttr(
learning_rate=1.0,
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale))
fc = fluid.layers.fc(
name=self._name,
input=input_layer,
......@@ -216,18 +242,46 @@ class LogLossLayer(Layer):
self._extend_output = {
'metric_label': self._metric_label,
'metric_dict': {
'auc': {'var': None},
'batch_auc': {'var': None},
'stat_pos': {'var': None, 'data_type': 'int64'},
'stat_neg': {'var': None, 'data_type': 'int64'},
'batch_stat_pos': {'var': None, 'data_type': 'int64'},
'batch_stat_neg': {'var': None, 'data_type': 'int64'},
'pos_ins_num': {'var': None},
'abserr': {'var': None},
'sqrerr': {'var': None},
'prob': {'var': None},
'total_ins_num': {'var': None},
'q': {'var': None}
'auc': {
'var': None
},
'batch_auc': {
'var': None
},
'stat_pos': {
'var': None,
'data_type': 'int64'
},
'stat_neg': {
'var': None,
'data_type': 'int64'
},
'batch_stat_pos': {
'var': None,
'data_type': 'int64'
},
'batch_stat_neg': {
'var': None,
'data_type': 'int64'
},
'pos_ins_num': {
'var': None
},
'abserr': {
'var': None
},
'sqrerr': {
'var': None
},
'prob': {
'var': None
},
'total_ins_num': {
'var': None
},
'q': {
'var': None
}
}
}
......@@ -236,9 +290,12 @@ class LogLossLayer(Layer):
"""
input_layer = param['layer'][self._input[0]]
label_layer = param['layer'][self._label]
output = fluid.layers.clip(input_layer, self._bound[0], self._bound[1], name=self._name)
output = fluid.layers.clip(
input_layer, self._bound[0], self._bound[1], name=self._name)
norm = fluid.layers.sigmoid(output, name=self._name)
output = fluid.layers.log_loss(norm, fluid.layers.cast(x=label_layer, dtype='float32'))
output = fluid.layers.log_loss(
norm, fluid.layers.cast(
x=label_layer, dtype='float32'))
if self._weight:
weight_layer = param['layer'][self._weight]
output = fluid.layers.elementwise_mul(output, weight_layer)
......@@ -248,7 +305,11 @@ class LogLossLayer(Layer):
# For AUC Metric
metric = self._extend_output['metric_dict']
binary_predict = fluid.layers.concat(
input=[fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm), norm], axis=1)
input=[
fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm),
norm
],
axis=1)
metric['auc']['var'], metric['batch_auc']['var'], [metric['batch_stat_pos']['var'], \
metric['batch_stat_neg']['var'], metric['stat_pos']['var'],
metric['stat_neg']['var']] = \
......
......@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
import abc
import os
......@@ -64,7 +64,11 @@ class SlotReader(dg.MultiSlotDataGenerator):
from operator import mul
self.sparse_slots = sparse_slots.strip().split(" ")
self.dense_slots = dense_slots.strip().split(" ")
self.dense_slots_shape = [reduce(mul, [int(j) for j in i.split(":")[1].strip("[]").split(",")]) for i in self.dense_slots]
self.dense_slots_shape = [
reduce(mul,
[int(j) for j in i.split(":")[1].strip("[]").split(",")])
for i in self.dense_slots
]
self.dense_slots = [i.split(":")[0] for i in self.dense_slots]
self.slots = self.dense_slots + self.sparse_slots
self.slot2index = {}
......@@ -93,10 +97,13 @@ class SlotReader(dg.MultiSlotDataGenerator):
slot = i
if not self.visit[slot]:
if i in self.dense_slots:
output[self.slot2index[i]][1].extend([self.padding] * self.dense_slots_shape[self.slot2index[i]])
output[self.slot2index[i]][1].extend(
[self.padding] *
self.dense_slots_shape[self.slot2index[i]])
else:
output[self.slot2index[i]][1].extend([self.padding])
else:
self.visit[slot] = False
yield output
return reader
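SlotReader above flattens each dense slot's declared shape into a single element count, which is later used to pad missing slots. A standalone sketch with a hypothetical declaration string:

# Illustrative sketch of the flattened dense-slot lengths; the declaration is hypothetical.
from functools import reduce
from operator import mul

dense_slots = "wide_var:[8,2] deep_var:[13]".strip().split(" ")
dense_slots_shape = [
    reduce(mul, [int(j) for j in i.split(":")[1].strip("[]").split(",")])
    for i in dense_slots
]
assert dense_slots_shape == [16, 13]  # a missing slot is padded with this many values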
......@@ -30,8 +30,10 @@ class Trainer(object):
def __init__(self, config=None):
self._status_processor = {}
self._place = fluid.CPUPlace()
self._exe = fluid.Executor(self._place)
self._exector_context = {}
self._context = {'status': 'uninit', 'is_exit': False}
self._config_yaml = config
......@@ -95,6 +97,6 @@ def user_define_engine(engine_yaml):
train_dirname = os.path.dirname(train_location)
base_name = os.path.splitext(os.path.basename(train_location))[0]
sys.path.append(train_dirname)
trainer_class = envs.lazy_instance_by_fliename(
base_name, "UserDefineTraining")
trainer_class = envs.lazy_instance_by_fliename(base_name,
"UserDefineTraining")
return trainer_class
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
trainer implement.
......@@ -22,5 +21,3 @@ Trainer
↘ (for online learning training) OnlineLearningTrainer
"""
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -43,11 +42,14 @@ class ClusterTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
if envs.get_platform() == "LINUX" and envs.get_global_env(
"dataset_class", None, "train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train)
else:
self.regist_context_processor(
'train_pass', self.dataloader_train)
self.regist_context_processor('train_pass',
self.dataloader_train)
self.regist_context_processor('infer_pass', self.infer)
self.regist_context_processor('terminal_pass', self.terminal)
......@@ -75,8 +77,8 @@ class ClusterTrainer(TranspileTrainer):
def init(self, context):
self.model.train_net()
optimizer = self.model.optimizer()
optimizer_name = envs.get_global_env(
"hyper_parameters.optimizer", None, "train.model")
optimizer_name = envs.get_global_env("hyper_parameters.optimizer",
None, "train.model")
if optimizer_name not in ["", "sgd", "SGD", "Sgd"]:
os.environ["FLAGS_communicator_is_sgd_optimizer"] = '0'
......@@ -114,9 +116,9 @@ class ClusterTrainer(TranspileTrainer):
program = fluid.compiler.CompiledProgram(
fleet.main_program).with_data_parallel(
loss_name=self.model.get_avg_cost().name,
build_strategy=self.strategy.get_build_strategy(),
exec_strategy=self.strategy.get_execute_strategy())
loss_name=self.model.get_avg_cost().name,
build_strategy=self.strategy.get_build_strategy(),
exec_strategy=self.strategy.get_execute_strategy())
metrics_varnames = []
metrics_format = []
......@@ -135,9 +137,8 @@ class ClusterTrainer(TranspileTrainer):
batch_id = 0
try:
while True:
metrics_rets = self._exe.run(
program=program,
fetch_list=metrics_varnames)
metrics_rets = self._exe.run(program=program,
fetch_list=metrics_varnames)
metrics = [epoch, batch_id]
metrics.extend(metrics_rets)
......@@ -162,14 +163,16 @@ class ClusterTrainer(TranspileTrainer):
for i in range(epochs):
begin_time = time.time()
self._exe.train_from_dataset(program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
self._exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time()
times = end_time-begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times))
times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(
i, times, ins / times))
self.save(i, "train", is_fleet=True)
fleet.stop_worker()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
from paddlerec.core.utils import envs
from paddlerec.core.trainer import Trainer
class CtrTrainer(Trainer):
"""R
"""
def __init__(self, config):
"""R
"""
Trainer.__init__(self, config)
self.global_config = config
self._metrics = {}
self.processor_register()
def processor_register(self):
role = MPISymetricRoleMaker()
fleet.init(role)
if fleet.is_server():
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('server_pass', self.server)
else:
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('train_pass', self.train)
self.regist_context_processor('terminal_pass', self.terminal)
def _get_dataset(self):
namespace = "train.reader"
inputs = self.model.get_inputs()
threads = envs.get_global_env("train.threads", None)
batch_size = envs.get_global_env("batch_size", None, namespace)
reader_class = envs.get_global_env("class", None, namespace)
abs_dir = os.path.dirname(os.path.abspath(__file__))
reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN",
self._config_yaml)
train_data_path = envs.get_global_env("train_data_path", None,
namespace)
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(inputs)
dataset.set_pipe_command(pipe_cmd)
dataset.set_batch_size(batch_size)
dataset.set_thread(threads)
file_list = [
os.path.join(train_data_path, x)
for x in os.listdir(train_data_path)
]
dataset.set_filelist(file_list)
return dataset
def instance(self, context):
models = envs.get_global_env("train.model.models")
model_class = envs.lazy_instance_by_fliename(models, "Model")
self.model = model_class(None)
context['status'] = 'init_pass'
def init(self, context):
"""R
"""
self.model.train_net()
optimizer = self.model.optimizer()
optimizer = fleet.distributed_optimizer(
optimizer, strategy={"use_cvm": False})
optimizer.minimize(self.model.get_avg_cost())
if fleet.is_server():
context['status'] = 'server_pass'
else:
self.fetch_vars = []
self.fetch_alias = []
self.fetch_period = self.model.get_fetch_period()
metrics = self.model.get_metrics()
if metrics:
self.fetch_vars = metrics.values()
self.fetch_alias = metrics.keys()
context['status'] = 'train_pass'
def server(self, context):
fleet.run_server()
fleet.stop_worker()
context['is_exit'] = True
def train(self, context):
self._exe.run(fluid.default_startup_program())
fleet.init_worker()
dataset = self._get_dataset()
shuf = np.array([fleet.worker_index()])
gs = shuf * 0
fleet._role_maker._node_type_comm.Allreduce(shuf, gs)
print("trainer id: {}, trainers: {}, gs: {}".format(fleet.worker_index(
), fleet.worker_num(), gs))
epochs = envs.get_global_env("train.epochs")
for i in range(epochs):
self._exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
context['status'] = 'terminal_pass'
fleet.stop_worker()
def terminal(self, context):
print("terminal ended.")
context['is_exit'] = True
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -44,11 +43,14 @@ class OnlineLearningTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
if envs.get_platform() == "LINUX" and envs.get_global_env(
"dataset_class", None, "train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train)
else:
self.regist_context_processor(
'train_pass', self.dataloader_train)
self.regist_context_processor('train_pass',
self.dataloader_train)
self.regist_context_processor('infer_pass', self.infer)
self.regist_context_processor('terminal_pass', self.terminal)
......@@ -110,27 +112,27 @@ class OnlineLearningTrainer(TranspileTrainer):
if state == "TRAIN":
inputs = self.model.get_inputs()
namespace = "train.reader"
train_data_path = envs.get_global_env(
"train_data_path", None, namespace)
train_data_path = envs.get_global_env("train_data_path", None,
namespace)
else:
inputs = self.model.get_infer_inputs()
namespace = "evaluate.reader"
train_data_path = envs.get_global_env(
"test_data_path", None, namespace)
train_data_path = envs.get_global_env("test_data_path", None,
namespace)
threads = int(envs.get_runtime_environ("train.trainer.threads"))
batch_size = envs.get_global_env("batch_size", None, namespace)
reader_class = envs.get_global_env("class", None, namespace)
abs_dir = os.path.dirname(os.path.abspath(__file__))
reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
pipe_cmd = "python {} {} {} {}".format(
reader, reader_class, state, self._config_yaml)
pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state,
self._config_yaml)
if train_data_path.startswith("paddlerec::"):
package_base = envs.get_runtime_environ("PACKAGE_BASE")
assert package_base is not None
train_data_path = os.path.join(
package_base, train_data_path.split("::")[1])
train_data_path = os.path.join(package_base,
train_data_path.split("::")[1])
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(inputs)
......@@ -166,14 +168,16 @@ class OnlineLearningTrainer(TranspileTrainer):
ins = self._get_dataset_ins()
begin_time = time.time()
self._exe.train_from_dataset(program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
self._exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time()
times = end_time-begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times))
times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(
i, times, ins / times))
self.save(i, "train", is_fleet=True)
fleet.stop_worker()
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -36,8 +35,9 @@ class SingleTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None,
"train.reader") != "DataLoader":
if envs.get_platform() == "LINUX" and envs.get_global_env(
"dataset_class", None, "train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train)
else:
self.regist_context_processor('train_pass', self.dataloader_train)
......@@ -73,9 +73,8 @@ class SingleTrainer(TranspileTrainer):
reader = self._get_dataloader("TRAIN")
epochs = envs.get_global_env("train.epochs")
program = fluid.compiler.CompiledProgram(
fluid.default_main_program()).with_data_parallel(
loss_name=self.model.get_avg_cost().name)
program = fluid.compiler.CompiledProgram(fluid.default_main_program(
)).with_data_parallel(loss_name=self.model.get_avg_cost().name)
metrics_varnames = []
metrics_format = []
......@@ -94,9 +93,8 @@ class SingleTrainer(TranspileTrainer):
batch_id = 0
try:
while True:
metrics_rets = self._exe.run(
program=program,
fetch_list=metrics_varnames)
metrics_rets = self._exe.run(program=program,
fetch_list=metrics_varnames)
metrics = [epoch, batch_id]
metrics.extend(metrics_rets)
......@@ -117,14 +115,16 @@ class SingleTrainer(TranspileTrainer):
epochs = envs.get_global_env("train.epochs")
for i in range(epochs):
begin_time = time.time()
self._exe.train_from_dataset(program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
self._exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time()
times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times))
print("epoch {} using time {}, speed {:.2f} lines/s".format(
i, times, ins / times))
self.save(i, "train", is_fleet=False)
context['status'] = 'infer_pass'
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -36,8 +35,8 @@ special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info"]
class TDMClusterTrainer(ClusterTrainer):
def server(self, context):
namespace = "train.startup"
init_model_path = envs.get_global_env(
"cluster.init_model_path", "", namespace)
init_model_path = envs.get_global_env("cluster.init_model_path", "",
namespace)
assert init_model_path != "", "Cluster train must have init_model for TDM"
fleet.init_server(init_model_path)
logger.info("TDM: load model from {}".format(init_model_path))
......@@ -48,24 +47,27 @@ class TDMClusterTrainer(ClusterTrainer):
self._exe.run(fleet.startup_program)
namespace = "train.startup"
load_tree = envs.get_global_env(
"tree.load_tree", True, namespace)
self.tree_layer_path = envs.get_global_env(
"tree.tree_layer_path", "", namespace)
self.tree_travel_path = envs.get_global_env(
"tree.tree_travel_path", "", namespace)
self.tree_info_path = envs.get_global_env(
"tree.tree_info_path", "", namespace)
save_init_model = envs.get_global_env(
"cluster.save_init_model", False, namespace)
init_model_path = envs.get_global_env(
"cluster.init_model_path", "", namespace)
load_tree = envs.get_global_env("tree.load_tree", True, namespace)
self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "",
namespace)
self.tree_travel_path = envs.get_global_env("tree.tree_travel_path",
"", namespace)
self.tree_info_path = envs.get_global_env("tree.tree_info_path", "",
namespace)
save_init_model = envs.get_global_env("cluster.save_init_model", False,
namespace)
init_model_path = envs.get_global_env("cluster.init_model_path", "",
namespace)
if load_tree:
# convert tree to tensor, set it into Fluid's variable.
for param_name in special_param:
param_t = fluid.global_scope().find_var(param_name).get_tensor()
param_t = fluid.global_scope().find_var(param_name).get_tensor(
)
param_array = self._tdm_prepare(param_name)
param_t.set(param_array.astype('int32'), self._place)
......@@ -93,8 +95,8 @@ class TDMClusterTrainer(ClusterTrainer):
def _tdm_travel_prepare(self):
"""load tdm tree param from npy/list file"""
travel_array = np.load(self.tree_travel_path)
logger.info("TDM Tree leaf node nums: {}".format(
travel_array.shape[0]))
logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[
0]))
return travel_array
def _tdm_layer_prepare(self):
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -27,33 +26,38 @@ from paddlerec.core.utils import envs
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer",
"TDM_Tree_Info", "TDM_Tree_Emb"]
special_param = [
"TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info", "TDM_Tree_Emb"
]
class TDMSingleTrainer(SingleTrainer):
def startup(self, context):
namespace = "train.startup"
load_persistables = envs.get_global_env(
"single.load_persistables", False, namespace)
load_persistables = envs.get_global_env("single.load_persistables",
False, namespace)
persistables_model_path = envs.get_global_env(
"single.persistables_model_path", "", namespace)
load_tree = envs.get_global_env(
"tree.load_tree", False, namespace)
self.tree_layer_path = envs.get_global_env(
"tree.tree_layer_path", "", namespace)
self.tree_travel_path = envs.get_global_env(
"tree.tree_travel_path", "", namespace)
self.tree_info_path = envs.get_global_env(
"tree.tree_info_path", "", namespace)
self.tree_emb_path = envs.get_global_env(
"tree.tree_emb_path", "", namespace)
save_init_model = envs.get_global_env(
"single.save_init_model", False, namespace)
init_model_path = envs.get_global_env(
"single.init_model_path", "", namespace)
load_tree = envs.get_global_env("tree.load_tree", False, namespace)
self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "",
namespace)
self.tree_travel_path = envs.get_global_env("tree.tree_travel_path",
"", namespace)
self.tree_info_path = envs.get_global_env("tree.tree_info_path", "",
namespace)
self.tree_emb_path = envs.get_global_env("tree.tree_emb_path", "",
namespace)
save_init_model = envs.get_global_env("single.save_init_model", False,
namespace)
init_model_path = envs.get_global_env("single.init_model_path", "",
namespace)
self._exe.run(fluid.default_startup_program())
if load_persistables:
......@@ -68,7 +72,8 @@ class TDMSingleTrainer(SingleTrainer):
if load_tree:
# convert tree to tensor, set it into Fluid's variable.
for param_name in special_param:
param_t = fluid.global_scope().find_var(param_name).get_tensor()
param_t = fluid.global_scope().find_var(param_name).get_tensor(
)
param_array = self._tdm_prepare(param_name)
if param_name == 'TDM_Tree_Emb':
param_t.set(param_array.astype('float32'), self._place)
......@@ -102,15 +107,15 @@ class TDMSingleTrainer(SingleTrainer):
def _tdm_travel_prepare(self):
"""load tdm tree param from npy/list file"""
travel_array = np.load(self.tree_travel_path)
logger.info("TDM Tree leaf node nums: {}".format(
travel_array.shape[0]))
logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[
0]))
return travel_array
def _tdm_emb_prepare(self):
"""load tdm tree param from npy/list file"""
emb_array = np.load(self.tree_emb_path)
logger.info("TDM Tree node nums from emb: {}".format(
emb_array.shape[0]))
logger.info("TDM Tree node nums from emb: {}".format(emb_array.shape[
0]))
return emb_array
def _tdm_layer_prepare(self):
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with DistributeTranspiler
"""
......@@ -39,9 +38,12 @@ class TranspileTrainer(Trainer):
self.increment_models = []
def processor_register(self):
print("Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first")
print(
"Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first"
)
def _get_dataloader(self, state="TRAIN"):
if state == "TRAIN":
dataloader = self.model._data_loader
namespace = "train.reader"
......@@ -59,12 +61,14 @@ class TranspileTrainer(Trainer):
if sparse_slots is None and dense_slots is None:
reader_class = envs.get_global_env("class", None, namespace)
reader = dataloader_instance.dataloader(
reader_class, state, self._config_yaml)
reader_class = envs.lazy_instance_by_fliename(reader_class, class_name)
reader = dataloader_instance.dataloader(reader_class, state,
self._config_yaml)
reader_class = envs.lazy_instance_by_fliename(reader_class,
class_name)
reader_ins = reader_class(self._config_yaml)
else:
reader = dataloader_instance.slotdataloader("", state, self._config_yaml)
reader = dataloader_instance.slotdataloader("", state,
self._config_yaml)
reader_ins = SlotReader(self._config_yaml)
if hasattr(reader_ins, 'generate_batch_from_trainfiles'):
......@@ -94,13 +98,13 @@ class TranspileTrainer(Trainer):
if state == "TRAIN":
inputs = self.model.get_inputs()
namespace = "train.reader"
train_data_path = envs.get_global_env(
"train_data_path", None, namespace)
train_data_path = envs.get_global_env("train_data_path", None,
namespace)
else:
inputs = self.model.get_infer_inputs()
namespace = "evaluate.reader"
train_data_path = envs.get_global_env(
"test_data_path", None, namespace)
train_data_path = envs.get_global_env("test_data_path", None,
namespace)
sparse_slots = envs.get_global_env("sparse_slots", None, namespace)
dense_slots = envs.get_global_env("dense_slots", None, namespace)
......@@ -112,8 +116,8 @@ class TranspileTrainer(Trainer):
reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
if sparse_slots is None and dense_slots is None:
pipe_cmd = "python {} {} {} {}".format(
reader, reader_class, state, self._config_yaml)
pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state,
self._config_yaml)
else:
padding = envs.get_global_env("padding", 0, namespace)
pipe_cmd = "python {} {} {} {} {} {} {} {}".format(
......@@ -123,8 +127,8 @@ class TranspileTrainer(Trainer):
if train_data_path.startswith("paddlerec::"):
package_base = envs.get_runtime_environ("PACKAGE_BASE")
assert package_base is not None
train_data_path = os.path.join(
package_base, train_data_path.split("::")[1])
train_data_path = os.path.join(package_base,
train_data_path.split("::")[1])
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(inputs)
......@@ -140,11 +144,11 @@ class TranspileTrainer(Trainer):
debug_mode = envs.get_global_env("reader_debug_mode", False, namespace)
if debug_mode:
print(
"--- Dataset Debug Mode Begin , show pre 10 data of {}---".format(file_list[0]))
print("--- Dataset Debug Mode Begin , show pre 10 data of {}---".
format(file_list[0]))
os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd))
print(
"--- Dataset Debug Mode End , show pre 10 data of {}---".format(file_list[0]))
print("--- Dataset Debug Mode End , show pre 10 data of {}---".
format(file_list[0]))
exit(0)
return dataset
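In _get_dataset above, a `paddlerec::` prefix marks data shipped inside the package and is resolved against the PACKAGE_BASE runtime environ. A sketch of that resolution with hypothetical paths:

# Illustrative sketch of the "paddlerec::" resolution; both paths are hypothetical.
import os

package_base = "/usr/local/lib/python3.7/site-packages/paddlerec"
train_data_path = "paddlerec::models/rank/dnn/data/train"
if train_data_path.startswith("paddlerec::"):
    train_data_path = os.path.join(package_base, train_data_path.split("::")[1])
# -> /usr/local/lib/python3.7/site-packages/paddlerec/models/rank/dnn/data/train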
......@@ -166,27 +170,29 @@ class TranspileTrainer(Trainer):
if not need_save(epoch_id, save_interval, False):
return
feed_varnames = envs.get_global_env(
"save.inference.feed_varnames", None, namespace)
feed_varnames = envs.get_global_env("save.inference.feed_varnames",
None, namespace)
fetch_varnames = envs.get_global_env(
"save.inference.fetch_varnames", None, namespace)
if feed_varnames is None or fetch_varnames is None:
return
fetch_vars = [fluid.default_main_program().global_block().vars[varname]
for varname in fetch_varnames]
dirname = envs.get_global_env(
"save.inference.dirname", None, namespace)
fetch_vars = [
fluid.default_main_program().global_block().vars[varname]
for varname in fetch_varnames
]
dirname = envs.get_global_env("save.inference.dirname", None,
namespace)
assert dirname is not None
dirname = os.path.join(dirname, str(epoch_id))
if is_fleet:
fleet.save_inference_model(
self._exe, dirname, feed_varnames, fetch_vars)
fleet.save_inference_model(self._exe, dirname, feed_varnames,
fetch_vars)
else:
fluid.io.save_inference_model(
dirname, feed_varnames, fetch_vars, self._exe)
fluid.io.save_inference_model(dirname, feed_varnames,
fetch_vars, self._exe)
self.inference_models.append((epoch_id, dirname))
def save_persistables():
......@@ -196,8 +202,8 @@ class TranspileTrainer(Trainer):
if not need_save(epoch_id, save_interval, False):
return
dirname = envs.get_global_env(
"save.increment.dirname", None, namespace)
dirname = envs.get_global_env("save.increment.dirname", None,
namespace)
assert dirname is not None
dirname = os.path.join(dirname, str(epoch_id))
......@@ -275,10 +281,9 @@ class TranspileTrainer(Trainer):
batch_id = 0
try:
while True:
metrics_rets = self._exe.run(
program=program,
fetch_list=metrics_varnames,
return_numpy=is_return_numpy)
metrics_rets = self._exe.run(program=program,
fetch_list=metrics_varnames,
return_numpy=is_return_numpy)
metrics = [epoch, batch_id]
metrics.extend(metrics_rets)
......
......@@ -24,7 +24,7 @@ from paddlerec.core.utils import util as util
class DatasetHolder(object):
"""
Dataset Base
Dataset Holder
"""
__metaclass__ = abc.ABCMeta
......@@ -74,11 +74,17 @@ class TimeSplitDatasetHolder(DatasetHolder):
Dataset.__init__(self, config)
if 'data_donefile' not in config or config['data_donefile'] is None:
config['data_donefile'] = config['data_path'] + "/to.hadoop.done"
self._path_generator = util.PathGenerator({'templates': [
{'name': 'data_path', 'template': config['data_path']},
{'name': 'donefile_path', 'template': config['data_donefile']}
]})
self._split_interval = config['split_interval'] # data split N mins per dir
self._path_generator = util.PathGenerator({
'templates': [{
'name': 'data_path',
'template': config['data_path']
}, {
'name': 'donefile_path',
'template': config['data_donefile']
}]
})
self._split_interval = config[
'split_interval'] # data split N mins per dir
self._data_file_handler = fs.FileHandler(config)
def _format_data_time(self, daytime_str, time_window_mins):
......@@ -91,7 +97,8 @@ class TimeSplitDatasetHolder(DatasetHolder):
return None, 0
if mins_of_day % self._split_interval != 0:
skip_mins = self._split_interval - (mins_of_day % self._split_interval)
skip_mins = self._split_interval - (mins_of_day %
self._split_interval)
data_time = data_time + datetime.timedelta(minutes=skip_mins)
time_window_mins = time_window_mins - skip_mins
return data_time, time_window_mins
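_format_data_time above rounds the requested start time up to the next split-interval boundary and shrinks the window by the skipped minutes. A worked sketch, assuming mins_of_day is the minute-of-day of the parsed start time and using invented values:

# Illustrative sketch of the interval alignment; the values are invented.
import datetime

split_interval = 5                     # minutes of data per directory
data_time = datetime.datetime(2020, 5, 1, 10, 13)
time_window_mins = 30
mins_of_day = data_time.hour * 60 + data_time.minute
if mins_of_day % split_interval != 0:
    skip_mins = split_interval - (mins_of_day % split_interval)    # 2
    data_time = data_time + datetime.timedelta(minutes=skip_mins)  # 10:15
    time_window_mins = time_window_mins - skip_mins                # 28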
......@@ -106,17 +113,24 @@ class TimeSplitDatasetHolder(DatasetHolder):
True/False
"""
is_ready = True
data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins)
data_time, windows_mins = self._format_data_time(daytime_str,
time_window_mins)
while time_window_mins > 0:
file_path = self._path_generator.generate_path('donefile_path', {'time_format': data_time})
file_path = self._path_generator.generate_path(
'donefile_path', {'time_format': data_time})
if not self._data_file_handler.is_exist(file_path):
is_ready = False
break
time_window_mins = time_window_mins - self._split_interval
data_time = data_time + datetime.timedelta(minutes=self._split_interval)
data_time = data_time + datetime.timedelta(
minutes=self._split_interval)
return is_ready
def get_file_list(self, daytime_str, time_window_mins, node_num=1, node_idx=0):
def get_file_list(self,
daytime_str,
time_window_mins,
node_num=1,
node_idx=0):
"""
data in [daytime_str, daytime_str + time_window_mins], random shard to node_num, return shard[node_idx]
Args:
......@@ -128,26 +142,32 @@ class TimeSplitDatasetHolder(DatasetHolder):
list, data_shard[node_idx]
"""
data_file_list = []
data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins)
data_time, windows_mins = self._format_data_time(daytime_str,
time_window_mins)
while time_window_mins > 0:
file_path = self._path_generator.generate_path('data_path', {'time_format': data_time})
file_path = self._path_generator.generate_path(
'data_path', {'time_format': data_time})
sub_file_list = self._data_file_handler.ls(file_path)
for sub_file in sub_file_list:
sub_file_name = self._data_file_handler.get_file_name(sub_file)
if not sub_file_name.startswith(self._config['filename_prefix']):
if not sub_file_name.startswith(self._config[
'filename_prefix']):
continue
if hash(sub_file_name) % node_num == node_idx:
data_file_list.append(sub_file)
time_window_mins = time_window_mins - self._split_interval
data_time = data_time + datetime.timedelta(minutes=self._split_interval)
data_time = data_time + datetime.timedelta(
minutes=self._split_interval)
return data_file_list
def _alloc_dataset(self, file_list):
""" """
dataset = fluid.DatasetFactory().create_dataset(self._config['dataset_type'])
dataset = fluid.DatasetFactory().create_dataset(self._config[
'dataset_type'])
dataset.set_batch_size(self._config['batch_size'])
dataset.set_thread(self._config['load_thread'])
dataset.set_hdfs_config(self._config['fs_name'], self._config['fs_ugi'])
dataset.set_hdfs_config(self._config['fs_name'],
self._config['fs_ugi'])
dataset.set_pipe_command(self._config['data_converter'])
dataset.set_filelist(file_list)
dataset.set_use_var(self._config['data_vars'])
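get_file_list above shards the files under each time directory across nodes by hashing the file name. A minimal sketch of that partitioning with made-up file names:

# Illustrative sketch of the hash-based sharding; the file names are invented.
files = ["part-00000", "part-00001", "part-00002", "part-00003"]
node_num, node_idx = 2, 0
shard = [f for f in files if hash(f) % node_num == node_idx]
# across node_idx = 0 .. node_num - 1 the shards are disjoint and cover all files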
......@@ -163,7 +183,9 @@ class TimeSplitDatasetHolder(DatasetHolder):
while self.check_ready(begin_time, windown_min) == False:
print("dataset not ready, time:" + begin_time)
time.sleep(30)
file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx'])
file_list = self.get_file_list(begin_time, windown_min,
params['node_num'],
params['node_idx'])
self._datasets[begin_time] = self._alloc_dataset(file_list)
self._datasets[begin_time].load_into_memory()
else:
......@@ -176,9 +198,12 @@ class TimeSplitDatasetHolder(DatasetHolder):
windown_min = params['time_window_min']
if begin_time not in self._datasets:
if self.check_ready(begin_time, windown_min):
file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx'])
file_list = self.get_file_list(begin_time, windown_min,
params['node_num'],
params['node_idx'])
self._datasets[begin_time] = self._alloc_dataset(file_list)
self._datasets[begin_time].preload_into_memory(self._config['preload_thread'])
self._datasets[begin_time].preload_into_memory(self._config[
'preload_thread'])
return True
return False
......
......@@ -17,10 +17,11 @@ import sys
from paddlerec.core.utils.envs import lazy_instance_by_fliename
from paddlerec.core.reader import SlotReader
from paddlerec.core.utils import envs
if len(sys.argv) < 4:
raise ValueError("reader only accept 3 argument: 1. reader_class 2.train/evaluate/slotreader 3.yaml_abs_path")
raise ValueError(
"reader only accept 3 argument: 1. reader_class 2.train/evaluate/slotreader 3.yaml_abs_path"
)
reader_package = sys.argv[1]
......
......@@ -95,7 +95,7 @@ def path_adapter(path):
l_p = path.split("paddlerec.")[1].replace(".", "/")
return os.path.join(package, l_p)
else:
return path
return path
def windows_path_converter(path):
......@@ -159,8 +159,8 @@ def pretty_print_envs(envs, header=None):
def lazy_instance_by_package(package, class_name):
models = get_global_env("train.model.models")
model_package = __import__(
package, globals(), locals(), package.split("."))
model_package = __import__(package,
globals(), locals(), package.split("."))
instance = getattr(model_package, class_name)
return instance
......@@ -170,8 +170,8 @@ def lazy_instance_by_fliename(abs, class_name):
sys.path.append(dirname)
package = os.path.splitext(os.path.basename(abs))[0]
model_package = __import__(
package, globals(), locals(), package.split("."))
model_package = __import__(package,
globals(), locals(), package.split("."))
instance = getattr(model_package, class_name)
return instance
......@@ -189,8 +189,7 @@ def get_platform():
def find_free_port():
def __free_port():
with closing(socket.socket(socket.AF_INET,
socket.SOCK_STREAM)) as s:
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
return s.getsockname()[1]
......
......@@ -18,7 +18,7 @@ from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
def is_afs_path(path):
"""R
"""is_afs_path
"""
if path.startswith("afs") or path.startswith("hdfs"):
return True
......@@ -133,8 +133,9 @@ class FileHandler(object):
if mode.find('a') >= 0:
org_content = self._hdfs_client.cat(dest_path)
content = content + org_content
self._local_fs_client.write(content, temp_local_file,
mode) # fleet hdfs_client only support upload, so write tmp file
self._local_fs_client.write(
content, temp_local_file, mode
) # fleet hdfs_client only support upload, so write tmp file
self._hdfs_client.delete(dest_path + ".tmp")
self._hdfs_client.upload(dest_path + ".tmp", temp_local_file)
self._hdfs_client.delete(dest_path + ".bak")
......@@ -158,7 +159,8 @@ class FileHandler(object):
files = []
if is_afs_path(path):
files = self._hdfs_client.ls(path)
files = [path + '/' + self.get_file_name(fi) for fi in files] # absolute path
files = [path + '/' + self.get_file_name(fi)
for fi in files] # absolute path
else:
files = self._local_fs_client.ls(path)
files = [path + '/' + fi for fi in files] # absolute path
......
......@@ -22,6 +22,7 @@ from paddlerec.core.utils import fs as fs
def save_program_proto(path, program=None):
if program is None:
_program = fluid.default_main_program()
else:
......@@ -175,7 +176,8 @@ class PathGenerator(object):
"""
if template_name in self._templates:
if 'time_format' in param:
str = param['time_format'].strftime(self._templates[template_name])
str = param['time_format'].strftime(self._templates[
template_name])
return str.format(**param)
return self._templates[template_name].format(**param)
else:
......@@ -198,31 +200,39 @@ class TimeTrainPass(object):
self._begin_day = make_datetime(day_fields[0].strip())
if len(day_fields) == 1 or len(day_fields[1]) == 0:
# 100 years, i.e. run continuously
self._end_day = self._begin_day + datetime.timedelta(days=36500)
self._end_day = self._begin_day + datetime.timedelta(
days=36500)
else:
# example: 2020212+10
run_day = int(day_fields[1].strip())
self._end_day = self._begin_day + datetime.timedelta(days=run_day)
self._end_day = self._begin_day + datetime.timedelta(
days=run_day)
else:
# example: {20191001..20191031}
days = os.popen("echo -n " + self._config['days']).read().split(" ")
days = os.popen("echo -n " + self._config['days']).read().split(
" ")
self._begin_day = make_datetime(days[0])
self._end_day = make_datetime(days[len(days) - 1])
self._checkpoint_interval = self._config['checkpoint_interval']
self._dump_inference_interval = self._config['dump_inference_interval']
self._interval_per_pass = self._config['train_time_interval'] # train N min data per pass
self._interval_per_pass = self._config[
'train_time_interval'] # train N min data per pass
self._pass_id = 0
self._inference_pass_id = 0
self._pass_donefile_handler = None
if 'pass_donefile_name' in self._config:
self._train_pass_donefile = global_config['output_path'] + '/' + self._config['pass_donefile_name']
self._train_pass_donefile = global_config[
'output_path'] + '/' + self._config['pass_donefile_name']
if fs.is_afs_path(self._train_pass_donefile):
self._pass_donefile_handler = fs.FileHandler(global_config['io']['afs'])
self._pass_donefile_handler = fs.FileHandler(global_config[
'io']['afs'])
else:
self._pass_donefile_handler = fs.FileHandler(global_config['io']['local_fs'])
self._pass_donefile_handler = fs.FileHandler(global_config[
'io']['local_fs'])
last_done = self._pass_donefile_handler.cat(self._train_pass_donefile).strip().split('\n')[-1]
last_done = self._pass_donefile_handler.cat(
self._train_pass_donefile).strip().split('\n')[-1]
done_fileds = last_done.split('\t')
if len(done_fileds) > 4:
self._base_key = done_fileds[1]
......@@ -236,15 +246,18 @@ class TimeTrainPass(object):
"""
return 24 * 60 / self._interval_per_pass
def save_train_progress(self, day, pass_id, base_key, model_path, is_checkpoint):
def save_train_progress(self, day, pass_id, base_key, model_path,
is_checkpoint):
"""R
"""
if is_checkpoint:
self._checkpoint_pass_id = pass_id
self._checkpoint_model_path = model_path
done_content = "%s\t%s\t%s\t%s\t%d\n" % (day, base_key,
self._checkpoint_model_path, self._checkpoint_pass_id, pass_id)
self._pass_donefile_handler.write(done_content, self._train_pass_donefile, 'a')
done_content = "%s\t%s\t%s\t%s\t%d\n" % (
day, base_key, self._checkpoint_model_path,
self._checkpoint_pass_id, pass_id)
self._pass_donefile_handler.write(done_content,
self._train_pass_donefile, 'a')
pass
def init_pass_by_id(self, date_str, pass_id):
......@@ -286,12 +299,14 @@ class TimeTrainPass(object):
if self._pass_id < 1:
self.init_pass_by_time(self._begin_day.strftime("%Y%m%d%H%M"))
else:
next_time = self._current_train_time + datetime.timedelta(minutes=self._interval_per_pass)
next_time = self._current_train_time + datetime.timedelta(
minutes=self._interval_per_pass)
if (next_time - self._end_day).total_seconds() > 0:
has_next = False
else:
self.init_pass_by_time(next_time.strftime("%Y%m%d%H%M"))
if has_next and (self._inference_pass_id < self._pass_id or self._pass_id < old_pass_id):
if has_next and (self._inference_pass_id < self._pass_id or
self._pass_id < old_pass_id):
self._inference_pass_id = self._pass_id - 1
return has_next
......@@ -319,9 +334,11 @@ class TimeTrainPass(object):
Return:
date(current_train_time + delta_day)
"""
return (self._current_train_time + datetime.timedelta(days=delta_day)).strftime("%Y%m%d")
return (self._current_train_time + datetime.timedelta(days=delta_day)
).strftime("%Y%m%d")
def timestamp(self, delta_day=0):
"""R
"""
return (self._current_train_time + datetime.timedelta(days=delta_day)).timestamp()
return (self._current_train_time + datetime.timedelta(days=delta_day)
).timestamp()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# PaddleRec Benchmark
> Placeholder
\ No newline at end of file
> Placeholder
# Contributing Code to PaddleRec
> Placeholder
\ No newline at end of file
> Placeholder
......@@ -279,4 +279,4 @@ class Metric(object):
pass
```
To compute and report a global metric, inherit from the class and implement each of the four member functions above. For a concrete example, see [auc_metric.py](../core/metrics/auc_metrics.py)
\ No newline at end of file
To compute and report a global metric, inherit from the class and implement each of the four member functions above. For a concrete example, see [auc_metric.py](../core/metrics/auc_metrics.py)
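A minimal sketch of such a subclass is shown below. It assumes the base class is importable as `paddlerec.core.metric.Metric` and that the four abstract members are `clear`, `calculate`, `get_result` and `get_result_to_string`; verify the exact names and signatures against the base class and `auc_metrics.py`.

```python
# Hypothetical example -- the import path, method names and the contents of
# `params` are assumptions based on the interface described above.
from paddlerec.core.metric import Metric


class CountMetric(Metric):
    """Toy global metric that counts how many instances were evaluated."""

    def __init__(self, config):
        self._count = 0

    def clear(self, scope, params):
        # reset accumulated state before a new evaluation pass
        self._count = 0

    def calculate(self, scope, params):
        # accumulate the statistic; params is assumed to carry a batch size
        self._count += params.get("batch_size", 0)

    def get_result(self):
        return {"count": self._count}

    def get_result_to_string(self):
        return "count: %d" % self._count
```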
......@@ -7,5 +7,3 @@
### Distributed training on a K8S cluster
> Placeholder
# FAQ
> Placeholder
\ No newline at end of file
> Placeholder
# PaddleRec Single-machine Training
> Placeholder
\ No newline at end of file
> Placeholder
......@@ -12,4 +12,3 @@
| Multi-task | [ESMM]() | ✓ | x | ✓ | x | ✓ | ✓ |
| Matching | [DSSM]() | ✓ | x | ✓ | x | ✓ | ✓ |
| Matching | [Multiview-Simnet]() | ✓ | x | ✓ | x | ✓ | ✓ |
# PaddleRec Model Tuning
> Placeholder
\ No newline at end of file
> Placeholder
# PaddleRec Offline Inference
\ No newline at end of file
# PaddleRec Offline Inference
......@@ -5,4 +5,3 @@
## [Parameter Server Training](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -37,4 +37,3 @@ train:
dirname: "inference"
epoch_interval: 100
save_last: True
......@@ -31,7 +31,8 @@ class Model(ModelBase):
def train_net(self):
""" network definition """
data = fluid.data(name="input", shape=[None, self.max_len], dtype='int64')
data = fluid.data(
name="input", shape=[None, self.max_len], dtype='int64')
label = fluid.data(name="label", shape=[None, 1], dtype='int64')
seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
......@@ -51,7 +52,9 @@ class Model(ModelBase):
# full connect layer
fc_1 = fluid.layers.fc(input=[conv], size=self.hid_dim)
# softmax layer
prediction = fluid.layers.fc(input=[fc_1], size=self.class_dim, act="softmax")
prediction = fluid.layers.fc(input=[fc_1],
size=self.class_dim,
act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from paddlerec.core.reader import Reader
......@@ -38,7 +37,8 @@ class TrainReader(Reader):
data = [int(i) for i in data]
label = [int(i) for i in label]
seq_len = [int(i) for i in seq_len]
print >> sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)])
print >> sys.stderr, str(
[('data', data), ('label', label), ('seq_len', seq_len)])
yield [('data', data), ('label', label), ('seq_len', seq_len)]
return data_iter
......@@ -87,4 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
| :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: |
| ag news dataset | TagSpace | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- |
......@@ -47,4 +47,3 @@ train:
dirname: "inference"
epoch_interval: 100
save_last: True
......@@ -26,8 +26,10 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
self.cost = None
self.metrics = {}
self.vocab_text_size = envs.get_global_env("vocab_text_size", None, self._namespace)
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, self._namespace)
self.vocab_text_size = envs.get_global_env("vocab_text_size", None,
self._namespace)
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None,
self._namespace)
self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace)
self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace)
self.win_size = envs.get_global_env("win_size", None, self._namespace)
......@@ -35,8 +37,9 @@ class Model(ModelBase):
self.neg_size = envs.get_global_env("neg_size", None, self._namespace)
def train_net(self):
""" network definition """
text = fluid.data(name="text", shape=[None, 1], lod_level=1, dtype='int64')
""" network"""
text = fluid.data(
name="text", shape=[None, 1], lod_level=1, dtype='int64')
pos_tag = fluid.data(
name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64')
neg_tag = fluid.data(
......@@ -45,13 +48,19 @@ class Model(ModelBase):
self._data_var = [text, pos_tag, neg_tag]
text_emb = fluid.embedding(
input=text, size=[self.vocab_text_size, self.emb_dim], param_attr="text_emb")
input=text,
size=[self.vocab_text_size, self.emb_dim],
param_attr="text_emb")
text_emb = fluid.layers.squeeze(input=text_emb, axes=[1])
pos_tag_emb = fluid.embedding(
input=pos_tag, size=[self.vocab_tag_size, self.emb_dim], param_attr="tag_emb")
input=pos_tag,
size=[self.vocab_tag_size, self.emb_dim],
param_attr="tag_emb")
pos_tag_emb = fluid.layers.squeeze(input=pos_tag_emb, axes=[1])
neg_tag_emb = fluid.embedding(
input=neg_tag, size=[self.vocab_tag_size, self.emb_dim], param_attr="tag_emb")
input=neg_tag,
size=[self.vocab_tag_size, self.emb_dim],
param_attr="tag_emb")
neg_tag_emb = fluid.layers.squeeze(input=neg_tag_emb, axes=[1])
conv_1d = fluid.nets.sequence_conv_pool(
......@@ -65,7 +74,8 @@ class Model(ModelBase):
size=self.emb_dim,
param_attr="text_hid")
cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb)
mul_text_hid = fluid.layers.sequence_expand_as(
x=text_hid, y=neg_tag_emb)
mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid)
cos_neg_all = fluid.layers.sequence_reshape(
input=mul_cos_neg, new_dim=self.neg_size)
......@@ -74,7 +84,10 @@ class Model(ModelBase):
#calculate hinge loss
loss_part1 = nn.elementwise_sub(
tensor.fill_constant_batch_size_like(
input=cos_pos, shape=[-1, 1], value=self.margin, dtype='float32'),
input=cos_pos,
shape=[-1, 1],
value=self.margin,
dtype='float32'),
cos_pos)
loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
loss_part3 = nn.elementwise_max(
......@@ -85,7 +98,7 @@ class Model(ModelBase):
less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
correct = nn.reduce_sum(less)
self.cost = avg_cost
self.metrics["correct"] = correct
self.metrics["cos_pos"] = cos_pos
......@@ -96,7 +109,8 @@ class Model(ModelBase):
return self.metrics
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.base_lr", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.base_lr", None,
self._namespace)
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
return sgd_optimizer
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import numpy as np
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -23,13 +23,26 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def input(self):
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace)
Neg = envs.get_global_env("hyper_parameters.NEG", None, self._namespace)
self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i
in range(Neg)]
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
self._namespace)
Neg = envs.get_global_env("hyper_parameters.NEG", None,
self._namespace)
self.query = fluid.data(
name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(
name="doc_pos",
shape=[-1, TRIGRAM_D],
dtype='float32',
lod_level=0)
self.doc_negs = [
fluid.data(
name="doc_neg_" + str(i),
shape=[-1, TRIGRAM_D],
dtype="float32",
lod_level=0) for i in range(Neg)
]
self._data_var.append(self.query)
self._data_var.append(self.doc_pos)
for input in self.doc_negs:
......@@ -37,16 +50,24 @@ class Model(ModelBase):
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def net(self, is_infer=False):
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None, self._namespace)
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
self._namespace)
hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None,
self._namespace)
def fc(data, hidden_layers, hidden_acts, names):
fc_inputs = [data]
for i in range(len(hidden_layers)):
xavier = fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i])
xavier = fluid.initializer.Xavier(
uniform=True,
fan_in=fc_inputs[-1].shape[1],
fan_out=hidden_layers[i])
out = fluid.layers.fc(input=fc_inputs[-1],
size=hidden_layers[i],
act=hidden_acts[i],
......@@ -56,8 +77,10 @@ class Model(ModelBase):
fc_inputs.append(out)
return fc_inputs[-1]
query_fc = fc(self.query, hidden_layers, hidden_acts, ['query_l1', 'query_l2', 'query_l3'])
doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts, ['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
query_fc = fc(self.query, hidden_layers, hidden_acts,
['query_l1', 'query_l2', 'query_l3'])
doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts,
['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)
if is_infer:
......@@ -65,13 +88,17 @@ class Model(ModelBase):
R_Q_D_ns = []
for i, doc_neg in enumerate(self.doc_negs):
doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts,
['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)])
doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, [
'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
'doc_neg_l3_' + str(i)
])
R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
concat_Rs = fluid.layers.concat(input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
concat_Rs = fluid.layers.concat(
input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
hit_prob = fluid.layers.slice(
prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
self.avg_cost = fluid.layers.mean(x=loss)
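# Reference note: concat_Rs stacks the query-positive similarity with the
# query-negative similarities, softmax turns them into a distribution over
# candidates, and hit_prob slices out the positive column, so the loss
# -sum(log(hit_prob)) maximizes the softmax probability of the positive
# document against the sampled negatives.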
......@@ -91,18 +118,28 @@ class Model(ModelBase):
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.SGD(learning_rate)
return optimizer
def infer_input(self):
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace)
self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
self._namespace)
self.query = fluid.data(
name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(
name="doc_pos",
shape=[-1, TRIGRAM_D],
dtype='float32',
lod_level=0)
self._infer_data_var = [self.query, self.doc_pos]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
......
......@@ -22,4 +22,3 @@ mkdir -p data/train
mkdir -p data/test
python generate_synthetic_data.py
......@@ -18,8 +18,10 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
def init(self):
self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model")
self.query_slots = envs.get_global_env("hyper_parameters.query_slots",
None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots",
None, "train.model")
self.all_slots = []
for i in range(self.query_slots):
......
......@@ -21,7 +21,11 @@ class Dataset:
class SyntheticDataset(Dataset):
def __init__(self, sparse_feature_dim, query_slot_num, title_slot_num, dataset_size=10000):
def __init__(self,
sparse_feature_dim,
query_slot_num,
title_slot_num,
dataset_size=10000):
# ids are randomly generated
self.ids_per_slot = 10
self.sparse_feature_dim = sparse_feature_dim
......@@ -46,14 +50,20 @@ class SyntheticDataset(Dataset):
for i in range(self.title_slot_num):
pt_slot = generate_ids(self.ids_per_slot,
self.sparse_feature_dim)
pt_slot = [str(fea) + ':' + str(i + self.query_slot_num) for fea in pt_slot]
pt_slot = [
str(fea) + ':' + str(i + self.query_slot_num)
for fea in pt_slot
]
pos_title_slots += pt_slot
if is_train:
for i in range(self.title_slot_num):
nt_slot = generate_ids(self.ids_per_slot,
self.sparse_feature_dim)
nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in
nt_slot]
nt_slot = [
str(fea) + ':' +
str(i + self.query_slot_num + self.title_slot_num)
for fea in nt_slot
]
neg_title_slots += nt_slot
yield query_slots + pos_title_slots + neg_title_slots
else:
......@@ -76,7 +86,8 @@ if __name__ == '__main__':
query_slots = 1
title_slots = 1
dataset_size = 10
dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots, dataset_size)
dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots,
dataset_size)
train_reader = dataset.train()
test_reader = dataset.test()
......
......@@ -103,12 +103,18 @@ class Model(ModelBase):
def init_config(self):
self._fetch_interval = 1
query_encoder = envs.get_global_env("hyper_parameters.query_encoder", None, self._namespace)
title_encoder = envs.get_global_env("hyper_parameters.title_encoder", None, self._namespace)
query_encode_dim = envs.get_global_env("hyper_parameters.query_encode_dim", None, self._namespace)
title_encode_dim = envs.get_global_env("hyper_parameters.title_encode_dim", None, self._namespace)
query_slots = envs.get_global_env("hyper_parameters.query_slots", None, self._namespace)
title_slots = envs.get_global_env("hyper_parameters.title_slots", None, self._namespace)
query_encoder = envs.get_global_env("hyper_parameters.query_encoder",
None, self._namespace)
title_encoder = envs.get_global_env("hyper_parameters.title_encoder",
None, self._namespace)
query_encode_dim = envs.get_global_env(
"hyper_parameters.query_encode_dim", None, self._namespace)
title_encode_dim = envs.get_global_env(
"hyper_parameters.title_encode_dim", None, self._namespace)
query_slots = envs.get_global_env("hyper_parameters.query_slots", None,
self._namespace)
title_slots = envs.get_global_env("hyper_parameters.title_slots", None,
self._namespace)
factory = SimpleEncoderFactory()
self.query_encoders = [
factory.create(query_encoder, query_encode_dim)
......@@ -119,10 +125,13 @@ class Model(ModelBase):
for i in range(title_slots)
]
self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim", None, self._namespace)
self.emb_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim",
None, self._namespace)
self.emb_shape = [self.emb_size, self.emb_dim]
self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace)
self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size",
None, self._namespace)
self.margin = 0.1
def input(self, is_train=True):
......@@ -133,8 +142,10 @@ class Model(ModelBase):
]
self.pt_slots = [
fluid.data(
name="%d" % (i + len(self.query_encoders)), shape=[None, 1], lod_level=1, dtype='int64')
for i in range(len(self.title_encoders))
name="%d" % (i + len(self.query_encoders)),
shape=[None, 1],
lod_level=1,
dtype='int64') for i in range(len(self.title_encoders))
]
if is_train == False:
......@@ -142,9 +153,11 @@ class Model(ModelBase):
self.nt_slots = [
fluid.data(
name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1,
dtype='int64')
for i in range(len(self.title_encoders))
name="%d" %
(i + len(self.query_encoders) + len(self.title_encoders)),
shape=[None, 1],
lod_level=1,
dtype='int64') for i in range(len(self.title_encoders))
]
return self.q_slots + self.pt_slots + self.nt_slots
......@@ -153,11 +166,15 @@ class Model(ModelBase):
res = self.input()
self._data_var = res
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace)
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
False, self._namespace)
if self._platform != "LINUX" or use_dataloader:
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=256, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=256,
use_double_buffer=False,
iterable=False)
def get_acc(self, x, y):
less = tensor.cast(cf.less_than(x, y), dtype='float32')
......@@ -190,10 +207,12 @@ class Model(ModelBase):
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
]
pt_encodes = [
self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
]
nt_encodes = [
self.title_encoders[i].forward(emb) for i, emb in enumerate(nt_embs)
self.title_encoders[i].forward(emb)
for i, emb in enumerate(nt_embs)
]
# concat multi view for query, pos_title, neg_title
......@@ -252,7 +271,8 @@ class Model(ModelBase):
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
return optimizer
......@@ -261,7 +281,10 @@ class Model(ModelBase):
self._infer_data_var = res
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
......@@ -281,7 +304,8 @@ class Model(ModelBase):
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
]
pt_encodes = [
self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
]
# concat multi view for query, pos_title, neg_title
q_concat = fluid.layers.concat(q_encodes)
......
......@@ -18,8 +18,10 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model")
self.query_slots = envs.get_global_env("hyper_parameters.query_slots",
None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots",
None, "train.model")
self.all_slots = []
for i in range(self.query_slots):
......
......@@ -37,4 +37,3 @@
python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -20,9 +20,11 @@ from paddlerec.core.reader import Reader
class EvaluateReader(Reader):
def init(self):
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128',
'129',
'205', '206', '207', '210', '216', '508', '509', '702', '853', '301']
all_field_id = [
'101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
'125', '126', '127', '128', '129', '205', '206', '207', '210',
'216', '508', '509', '702', '853', '301'
]
self.all_field_id_dict = defaultdict(int)
for i, field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False, i]
......
......@@ -21,9 +21,11 @@ from paddlerec.core.reader import Reader
class TrainReader(Reader):
def init(self):
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128',
'129',
'205', '206', '207', '210', '216', '508', '509', '702', '853', '301']
all_field_id = [
'101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
'125', '126', '127', '128', '129', '205', '206', '207', '210',
'216', '508', '509', '702', '853', '301'
]
self.all_field_id_dict = defaultdict(int)
for i, field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False, i]
......
......@@ -28,11 +28,13 @@ class Model(ModelBase):
init_stddev = 1.0
scales = 1.0 / np.sqrt(data.shape[1])
p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag,
initializer=fluid.initializer.NormalInitializer(loc=0.0,
scale=init_stddev * scales))
p_attr = fluid.param_attr.ParamAttr(
name='%s_weight' % tag,
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=init_stddev * scales))
b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
b_attr = fluid.ParamAttr(
name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
out = fluid.layers.fc(input=data,
size=out_dim,
......@@ -44,7 +46,11 @@ class Model(ModelBase):
def input_data(self):
sparse_input_ids = [
fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1) for i in range(0, 23)
fluid.data(
name="field_" + str(i),
shape=[-1, 1],
dtype="int64",
lod_level=1) for i in range(0, 23)
]
label_ctr = fluid.data(name="ctr", shape=[-1, 1], dtype="int64")
label_cvr = fluid.data(name="cvr", shape=[-1, 1], dtype="int64")
......@@ -55,19 +61,23 @@ class Model(ModelBase):
def net(self, inputs, is_infer=False):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
embed_size = envs.get_global_env("hyper_parameters.embed_size", None, self._namespace)
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
self._namespace)
embed_size = envs.get_global_env("hyper_parameters.embed_size", None,
self._namespace)
emb = []
for data in inputs[0:-2]:
feat_emb = fluid.embedding(input=data,
size=[vocab_size, embed_size],
param_attr=fluid.ParamAttr(name='dis_emb',
learning_rate=5,
initializer=fluid.initializer.Xavier(
fan_in=embed_size, fan_out=embed_size)
),
is_sparse=True)
field_emb = fluid.layers.sequence_pool(input=feat_emb, pool_type='sum')
feat_emb = fluid.embedding(
input=data,
size=[vocab_size, embed_size],
param_attr=fluid.ParamAttr(
name='dis_emb',
learning_rate=5,
initializer=fluid.initializer.Xavier(
fan_in=embed_size, fan_out=embed_size)),
is_sparse=True)
field_emb = fluid.layers.sequence_pool(
input=feat_emb, pool_type='sum')
emb.append(field_emb)
concat_emb = fluid.layers.concat(emb, axis=1)
......@@ -85,14 +95,20 @@ class Model(ModelBase):
ctr_clk = inputs[-2]
ctcvr_buy = inputs[-1]
ctr_prop_one = fluid.layers.slice(ctr_out, axes=[1], starts=[1], ends=[2])
cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2])
ctr_prop_one = fluid.layers.slice(
ctr_out, axes=[1], starts=[1], ends=[2])
cvr_prop_one = fluid.layers.slice(
cvr_out, axes=[1], starts=[1], ends=[2])
ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one)
ctcvr_prop = fluid.layers.concat(input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)
ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one,
cvr_prop_one)
ctcvr_prop = fluid.layers.concat(
input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)
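# Reference note: this is the ESMM factorization over the impression space,
# pCTCVR(x) = pCTR(x) * pCVR(x), so the CVR tower is supervised only through
# the observed click (ctr_clk) and click-and-conversion (ctcvr_buy) labels.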
auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk)
auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy)
auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(
input=ctr_out, label=ctr_clk)
auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(
input=ctcvr_prop, label=ctcvr_buy)
if is_infer:
self._infer_results["AUC_ctr"] = auc_ctr
......@@ -100,7 +116,8 @@ class Model(ModelBase):
return
loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk)
loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
loss_ctcvr = fluid.layers.cross_entropy(
input=ctcvr_prop, label=ctcvr_buy)
cost = loss_ctr + loss_ctcvr
avg_cost = fluid.layers.mean(cost)
......@@ -117,5 +134,8 @@ class Model(ModelBase):
def infer_net(self):
self._infer_data_var = self.input_data()
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self.net(self._infer_data_var, is_infer=True)
......@@ -19,6 +19,7 @@ from paddlerec.core.reader import Reader
class EvaluateReader(Reader):
def init(self):
pass
def generate_sample(self, line):
......
......@@ -24,6 +24,7 @@ class TrainReader(Reader):
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
"""
def reader():
......
......@@ -23,44 +23,58 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def MMOE(self, is_infer=False):
feature_size = envs.get_global_env("hyper_parameters.feature_size", None, self._namespace)
expert_num = envs.get_global_env("hyper_parameters.expert_num", None, self._namespace)
gate_num = envs.get_global_env("hyper_parameters.gate_num", None, self._namespace)
expert_size = envs.get_global_env("hyper_parameters.expert_size", None, self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None, self._namespace)
input_data = fluid.data(name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
feature_size = envs.get_global_env("hyper_parameters.feature_size",
None, self._namespace)
expert_num = envs.get_global_env("hyper_parameters.expert_num", None,
self._namespace)
gate_num = envs.get_global_env("hyper_parameters.gate_num", None,
self._namespace)
expert_size = envs.get_global_env("hyper_parameters.expert_size", None,
self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None,
self._namespace)
input_data = fluid.data(
name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(
name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(
name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
if is_infer:
self._infer_data_var = [input_data, label_income, label_marital]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self._data_var.extend([input_data, label_income, label_marital])
# f_{i}(x) = activation(W_{i} * x + b), where activation is ReLU according to the paper
expert_outputs = []
for i in range(0, expert_num):
expert_output = fluid.layers.fc(input=input_data,
size=expert_size,
act='relu',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='expert_' + str(i))
expert_output = fluid.layers.fc(
input=input_data,
size=expert_size,
act='relu',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='expert_' + str(i))
expert_outputs.append(expert_output)
expert_concat = fluid.layers.concat(expert_outputs, axis=1)
expert_concat = fluid.layers.reshape(expert_concat, [-1, expert_num, expert_size])
expert_concat = fluid.layers.reshape(expert_concat,
[-1, expert_num, expert_size])
# g^{k}(x) = activation(W_{gk} * x + b), where activation is softmax according to the paper
output_layers = []
for i in range(0, gate_num):
cur_gate = fluid.layers.fc(input=input_data,
size=expert_num,
act='softmax',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='gate_' + str(i))
cur_gate = fluid.layers.fc(
input=input_data,
size=expert_num,
act='softmax',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='gate_' + str(i))
# f^{k}(x) = sum_{i=1}^{n}(g^{k}(x)_{i} * f_{i}(x))
cur_gate_expert = fluid.layers.elementwise_mul(expert_concat, cur_gate, axis=0)
cur_gate_expert = fluid.layers.elementwise_mul(
expert_concat, cur_gate, axis=0)
cur_gate_expert = fluid.layers.reduce_sum(cur_gate_expert, dim=1)
# Build tower layer
cur_tower = fluid.layers.fc(input=cur_gate_expert,
......@@ -74,25 +88,33 @@ class Model(ModelBase):
output_layers.append(out)
pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income,
label=fluid.layers.cast(x=label_income_1,
dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
label=fluid.layers.cast(x=label_marital_1,
dtype='int64'))
pred_income = fluid.layers.clip(
output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(
output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(
label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(
label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
input=pred_income,
label=fluid.layers.cast(
x=label_income_1, dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
input=pred_marital,
label=fluid.layers.cast(
x=label_marital_1, dtype='int64'))
if is_infer:
self._infer_results["AUC_income"] = auc_income
self._infer_results["AUC_marital"] = auc_marital
return
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)
cost_income = fluid.layers.cross_entropy(
input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(
input=pred_marital, label=label_marital, soft_label=True)
avg_cost_income = fluid.layers.mean(x=cost_income)
avg_cost_marital = fluid.layers.mean(x=cost_marital)
......
......@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
| Census-income Data | Share-Bottom | -- | 0.93120/0.99256 |
| Census-income Data | MMoE | -- | 0.94465/0.99324 |
| Ali-CCP | ESMM | -- | 0.97181/0.49967 |
......@@ -24,27 +24,38 @@ class Model(ModelBase):
def model(self, is_infer=False):
feature_size = envs.get_global_env("hyper_parameters.feature_size", None, self._namespace)
bottom_size = envs.get_global_env("hyper_parameters.bottom_size", None, self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None, self._namespace)
tower_nums = envs.get_global_env("hyper_parameters.tower_nums", None, self._namespace)
input_data = fluid.data(name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
feature_size = envs.get_global_env("hyper_parameters.feature_size",
None, self._namespace)
bottom_size = envs.get_global_env("hyper_parameters.bottom_size", None,
self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None,
self._namespace)
tower_nums = envs.get_global_env("hyper_parameters.tower_nums", None,
self._namespace)
input_data = fluid.data(
name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(
name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(
name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
if is_infer:
self._infer_data_var = [input_data, label_income, label_marital]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self._data_var.extend([input_data, label_income, label_marital])
bottom_output = fluid.layers.fc(input=input_data,
size=bottom_size,
act='relu',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='bottom_output')
bottom_output = fluid.layers.fc(
input=input_data,
size=bottom_size,
act='relu',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='bottom_output')
# Build tower layer from bottom layer
output_layers = []
......@@ -59,26 +70,34 @@ class Model(ModelBase):
name='output_layer_' + str(index))
output_layers.append(output_layer)
pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income,
label=fluid.layers.cast(x=label_income_1,
dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
label=fluid.layers.cast(x=label_marital_1,
dtype='int64'))
pred_income = fluid.layers.clip(
output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(
output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(
label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(
label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
input=pred_income,
label=fluid.layers.cast(
x=label_income_1, dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
input=pred_marital,
label=fluid.layers.cast(
x=label_marital_1, dtype='int64'))
if is_infer:
self._infer_results["AUC_income"] = auc_income
self._infer_results["AUC_marital"] = auc_marital
return
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)
cost_income = fluid.layers.cross_entropy(
input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(
input=pred_marital, label=label_marital, soft_label=True)
cost = fluid.layers.elementwise_add(cost_income, cost_marital, axis=1)
avg_cost = fluid.layers.mean(x=cost)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import io
......
......@@ -26,8 +26,8 @@ from collections import Counter
import os
import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator):
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
......@@ -83,11 +83,11 @@ class TrainReader(dg.MultiSlotDataGenerator):
if idx == 2 else math.log(1 + float(features[idx])))
for idx in self.cat_idx_:
if features[idx] == '' or features[
idx] not in self.cat_feat_idx_dict_list[idx - 14]:
idx] not in self.cat_feat_idx_dict_list[idx - 14]:
label_feat_list[idx].append(0)
else:
label_feat_list[idx].append(self.cat_feat_idx_dict_list[
idx - 14][features[idx]])
idx - 14][features[idx]])
label_feat_list[0].append(int(features[0]))
return label_feat_list
......@@ -109,6 +109,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, absolute_import, division
import os
......
......@@ -25,12 +25,18 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def init_network(self):
self.cross_num = envs.get_global_env("hyper_parameters.cross_num", None, self._namespace)
self.dnn_hidden_units = envs.get_global_env("hyper_parameters.dnn_hidden_units", None, self._namespace)
self.l2_reg_cross = envs.get_global_env("hyper_parameters.l2_reg_cross", None, self._namespace)
self.dnn_use_bn = envs.get_global_env("hyper_parameters.dnn_use_bn", None, self._namespace)
self.clip_by_norm = envs.get_global_env("hyper_parameters.clip_by_norm", None, self._namespace)
cat_feat_num = envs.get_global_env("hyper_parameters.cat_feat_num", None, self._namespace)
self.cross_num = envs.get_global_env("hyper_parameters.cross_num",
None, self._namespace)
self.dnn_hidden_units = envs.get_global_env(
"hyper_parameters.dnn_hidden_units", None, self._namespace)
self.l2_reg_cross = envs.get_global_env(
"hyper_parameters.l2_reg_cross", None, self._namespace)
self.dnn_use_bn = envs.get_global_env("hyper_parameters.dnn_use_bn",
None, self._namespace)
self.clip_by_norm = envs.get_global_env(
"hyper_parameters.clip_by_norm", None, self._namespace)
cat_feat_num = envs.get_global_env("hyper_parameters.cat_feat_num",
None, self._namespace)
self.sparse_inputs = self._sparse_data_var[1:]
self.dense_inputs = self._dense_data_var
......@@ -43,7 +49,8 @@ class Model(ModelBase):
cat_feat_dims_dict[spls[0]] = int(spls[1])
self.cat_feat_dims_dict = cat_feat_dims_dict if cat_feat_dims_dict else OrderedDict(
)
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", None, self._namespace)
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
None, self._namespace)
self.dense_feat_names = [i.name for i in self.dense_inputs]
self.sparse_feat_names = [i.name for i in self.sparse_inputs]
......@@ -55,16 +62,19 @@ class Model(ModelBase):
self.net_input = None
self.loss = None
def _create_embedding_input(self):
# sparse embedding
sparse_emb_dict = OrderedDict()
for var in self.sparse_inputs:
sparse_emb_dict[var.name] = fluid.embedding(input=var,
size=[self.feat_dims_dict[var.name] + 1,
6 * int(pow(self.feat_dims_dict[var.name], 0.25))
],is_sparse=self.is_sparse)
sparse_emb_dict[var.name] = fluid.embedding(
input=var,
size=[
self.feat_dims_dict[var.name] + 1,
6 * int(pow(self.feat_dims_dict[var.name], 0.25))
],
is_sparse=self.is_sparse)
# combine dense and sparse_emb
dense_input_list = self.dense_inputs
sparse_emb_list = list(sparse_emb_dict.values())
......@@ -114,10 +124,11 @@ class Model(ModelBase):
def train_net(self):
self.model._init_slots()
self.init_network()
self.net_input = self._create_embedding_input()
deep_out = self._deep_net(self.net_input, self.dnn_hidden_units, self.dnn_use_bn, False)
deep_out = self._deep_net(self.net_input, self.dnn_hidden_units,
self.dnn_use_bn, False)
cross_out, l2_reg_cross_loss = self._cross_net(self.net_input,
self.cross_num)
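# Reference note: a cross layer, as described in the Deep & Cross Network
# paper, computes x_{l+1} = x_0 * (x_l^T w_l) + b_l + x_l; _cross_net stacks
# cross_num such layers and, judging by its name, also returns an L2
# regularization term over the cross-layer weights.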
......@@ -134,9 +145,11 @@ class Model(ModelBase):
input=prob_2d, label=label_int, slide_steps=0)
self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc_var
# logloss
logloss = fluid.layers.log_loss(self.prob, fluid.layers.cast(self.target_input, dtype='float32'))
logloss = fluid.layers.log_loss(
self.prob, fluid.layers.cast(
self.target_input, dtype='float32'))
self.avg_logloss = fluid.layers.reduce_mean(logloss)
# reg_coeff * l2_reg_cross
......@@ -145,7 +158,8 @@ class Model(ModelBase):
self._cost = self.loss
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import sys
......
......@@ -19,8 +19,9 @@ try:
import cPickle as pickle
except ImportError:
import pickle
class TrainReader(dg.MultiSlotDataGenerator):
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
......@@ -44,7 +45,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
self.categorical_range_ = range(14, 40)
# load preprocessed feature dict
self.feat_dict_name = "aid_data/feat_dict_10.pkl2"
self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
def _process_line(self, line):
features = line.rstrip('\n').split('\t')
......@@ -77,15 +78,18 @@ class TrainReader(dg.MultiSlotDataGenerator):
def data_iter():
feat_idx, feat_value, label = self._process_line(line)
s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]:
for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
s += " " + k + ":" + str(j)
print s.strip()
yield None
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy
from collections import Counter
......
......@@ -27,21 +27,26 @@ class Model(ModelBase):
def deepfm_net(self):
init_value_ = 0.1
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
# ------------------------- network input --------------------------
num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace)
num_field = envs.get_global_env("hyper_parameters.num_field", None,
self._namespace)
raw_feat_idx = self._sparse_data_var[1]
raw_feat_value = self._dense_data_var[0]
self.label = self._sparse_data_var[0]
feat_idx = raw_feat_idx
feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
reg = envs.get_global_env("hyper_parameters.reg", 1e-4, self._namespace)
feat_value = fluid.layers.reshape(
raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
reg = envs.get_global_env("hyper_parameters.reg", 1e-4,
self._namespace)
first_weights_re = fluid.embedding(
input=feat_idx,
is_sparse=True,
......@@ -55,7 +60,8 @@ class Model(ModelBase):
regularizer=fluid.regularizer.L1DecayRegularizer(reg)))
first_weights = fluid.layers.reshape(
first_weights_re, shape=[-1, num_field, 1]) # None * num_field * 1
y_first_order = fluid.layers.reduce_sum((first_weights * feat_value), 1)
y_first_order = fluid.layers.reduce_sum((first_weights * feat_value),
1)
# ------------------------- second order term --------------------------
......@@ -68,7 +74,8 @@ class Model(ModelBase):
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormalInitializer(
loc=0.0, scale=init_value_ / math.sqrt(float(sparse_feature_dim)))))
loc=0.0,
scale=init_value_ / math.sqrt(float(sparse_feature_dim)))))
feat_embeddings = fluid.layers.reshape(
feat_embeddings_re,
shape=[-1, num_field,
......@@ -76,8 +83,8 @@ class Model(ModelBase):
feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size
# sum_square part
summed_features_emb = fluid.layers.reduce_sum(feat_embeddings,
1) # None * embedding_size
summed_features_emb = fluid.layers.reduce_sum(
feat_embeddings, 1) # None * embedding_size
summed_features_emb_square = fluid.layers.square(
summed_features_emb) # None * embedding_size
......@@ -88,13 +95,16 @@ class Model(ModelBase):
squared_features_emb, 1) # None * embedding_size
y_second_order = 0.5 * fluid.layers.reduce_sum(
summed_features_emb_square - squared_sum_features_emb, 1,
summed_features_emb_square - squared_sum_features_emb,
1,
keep_dim=True) # None * 1
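# The second-order term relies on the standard FM identity
#   sum_{i<j} <v_i, v_j> x_i x_j
#     = 0.5 * sum_f [ (sum_i v_{i,f} x_i)^2 - sum_i (v_{i,f} x_i)^2 ],
# so only the summed embeddings and the summed squares above are needed,
# keeping the cost linear in num_field instead of quadratic.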
# ------------------------- DNN --------------------------
layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
act = envs.get_global_env("hyper_parameters.act", None, self._namespace)
layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None,
self._namespace)
act = envs.get_global_env("hyper_parameters.act", None,
self._namespace)
y_dnn = fluid.layers.reshape(feat_embeddings,
[-1, num_field * sparse_feature_dim])
for s in layer_sizes:
......@@ -121,7 +131,8 @@ class Model(ModelBase):
# ------------------------- DeepFM --------------------------
self.predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn)
self.predict = fluid.layers.sigmoid(y_first_order + y_second_order +
y_dnn)
def train_net(self):
self.model._init_slots()
......@@ -129,7 +140,8 @@ class Model(ModelBase):
# ------------------------- Cost(logloss) --------------------------
cost = fluid.layers.log_loss(input=self.predict, label=fluid.layers.cast(self.label, "float32"))
cost = fluid.layers.log_loss(
input=self.predict, label=fluid.layers.cast(self.label, "float32"))
avg_cost = fluid.layers.reduce_sum(cost)
self._cost = avg_cost
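# log_loss computes the element-wise binary cross entropy between the sigmoid
# output and the float-cast label; reduce_sum then aggregates it over the
# batch to form the training objective.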
......@@ -145,7 +157,8 @@ class Model(ModelBase):
self._metrics["BATCH_AUC"] = batch_auc_var
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import random
import pickle
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import pickle
import pandas as pd
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import random
import pickle
......
......@@ -21,14 +21,14 @@ from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def config_read(self, config_path):
with open(config_path, "r") as fin:
user_count = int(fin.readline().strip())
item_count = int(fin.readline().strip())
cat_count = int(fin.readline().strip())
return user_count, item_count, cat_count
def din_attention(self, hist, target_expand, mask):
"""activation weight"""
......@@ -58,56 +58,66 @@ class Model(ModelBase):
out = fluid.layers.matmul(weight, hist)
out = fluid.layers.reshape(x=out, shape=[0, hidden_size])
return out
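# The attention weights computed above are applied to the behavior sequence
# via matmul(weight, hist), pooling the variable-length history into a single
# fixed-size vector per example before it is concatenated with the target item
# features in train_net.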
def train_net(self):
seq_len = -1
self.item_emb_size = envs.get_global_env("hyper_parameters.item_emb_size", 64, self._namespace)
self.cat_emb_size = envs.get_global_env("hyper_parameters.cat_emb_size", 64, self._namespace)
self.act = envs.get_global_env("hyper_parameters.act", "sigmoid", self._namespace)
self.item_emb_size = envs.get_global_env(
"hyper_parameters.item_emb_size", 64, self._namespace)
self.cat_emb_size = envs.get_global_env(
"hyper_parameters.cat_emb_size", 64, self._namespace)
self.act = envs.get_global_env("hyper_parameters.act", "sigmoid",
self._namespace)
#item_emb_size = 64
#cat_emb_size = 64
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", False, self._namespace)
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
False, self._namespace)
#significant for speeding up the training process
self.config_path = envs.get_global_env("hyper_parameters.config_path", "data/config.txt", self._namespace)
self.use_DataLoader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace)
self.config_path = envs.get_global_env(
"hyper_parameters.config_path", "data/config.txt", self._namespace)
self.use_DataLoader = envs.get_global_env(
"hyper_parameters.use_DataLoader", False, self._namespace)
user_count, item_count, cat_count = self.config_read(self.config_path)
item_emb_attr = fluid.ParamAttr(name="item_emb")
cat_emb_attr = fluid.ParamAttr(name="cat_emb")
hist_item_seq = fluid.data(
name="hist_item_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(hist_item_seq)
hist_cat_seq = fluid.data(
name="hist_cat_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(hist_cat_seq)
target_item = fluid.data(name="target_item", shape=[None], dtype="int64")
target_item = fluid.data(
name="target_item", shape=[None], dtype="int64")
self._data_var.append(target_item)
target_cat = fluid.data(name="target_cat", shape=[None], dtype="int64")
self._data_var.append(target_cat)
label = fluid.data(name="label", shape=[None, 1], dtype="float32")
self._data_var.append(label)
mask = fluid.data(name="mask", shape=[None, seq_len, 1], dtype="float32")
mask = fluid.data(
name="mask", shape=[None, seq_len, 1], dtype="float32")
self._data_var.append(mask)
target_item_seq = fluid.data(
name="target_item_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(target_item_seq)
target_cat_seq = fluid.data(
name="target_cat_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(target_cat_seq)
if self.use_DataLoader:
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=10000, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=10000,
use_double_buffer=False,
iterable=False)
hist_item_emb = fluid.embedding(
input=hist_item_seq,
size=[item_count, self.item_emb_size],
......@@ -149,7 +159,8 @@ class Model(ModelBase):
size=[item_count, 1],
param_attr=fluid.initializer.Constant(value=0.0))
hist_seq_concat = fluid.layers.concat([hist_item_emb, hist_cat_emb], axis=2)
hist_seq_concat = fluid.layers.concat(
[hist_item_emb, hist_cat_emb], axis=2)
target_seq_concat = fluid.layers.concat(
[target_item_seq_emb, target_cat_seq_emb], axis=2)
target_concat = fluid.layers.concat(
......@@ -157,21 +168,22 @@ class Model(ModelBase):
out = self.din_attention(hist_seq_concat, target_seq_concat, mask)
out_fc = fluid.layers.fc(name="out_fc",
input=out,
size=self.item_emb_size + self.cat_emb_size,
num_flatten_dims=1)
input=out,
size=self.item_emb_size + self.cat_emb_size,
num_flatten_dims=1)
embedding_concat = fluid.layers.concat([out_fc, target_concat], axis=1)
fc1 = fluid.layers.fc(name="fc1",
input=embedding_concat,
size=80,
act=self.act)
input=embedding_concat,
size=80,
act=self.act)
fc2 = fluid.layers.fc(name="fc2", input=fc1, size=40, act=self.act)
fc3 = fluid.layers.fc(name="fc3", input=fc2, size=1)
logit = fc3 + item_b
loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logit, label=label)
loss = fluid.layers.sigmoid_cross_entropy_with_logits(
x=logit, label=label)
avg_loss = fluid.layers.mean(loss)
self._cost = avg_loss
......@@ -179,14 +191,14 @@ class Model(ModelBase):
predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
label_int = fluid.layers.cast(label, 'int64')
auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d,
label=label_int,
slide_steps=0)
label=label_int,
slide_steps=0)
self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc_var
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
......
......@@ -29,13 +29,15 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.train_data_path = envs.get_global_env("train_data_path", None, "train.reader")
self.train_data_path = envs.get_global_env("train_data_path", None,
"train.reader")
self.res = []
self.max_len = 0
data_file_list = os.listdir(self.train_data_path)
for i in range(0, len(data_file_list)):
train_data_file = os.path.join(self.train_data_path, data_file_list[i])
train_data_file = os.path.join(self.train_data_path,
data_file_list[i])
with open(train_data_file, "r") as fin:
for line in fin:
line = line.strip().split(';')
......@@ -78,11 +80,13 @@ class TrainReader(Reader):
len_array = [len(x[0]) for x in b]
mask = np.array(
[[0] * x + [-1e9] * (max_len - x) for x in len_array]).reshape(
[-1, max_len, 1])
[-1, max_len, 1])
target_item_seq = np.array(
[[x[2]] * max_len for x in b]).astype("int64").reshape([-1, max_len])
[[x[2]] * max_len for x in b]).astype("int64").reshape(
[-1, max_len])
target_cat_seq = np.array(
[[x[3]] * max_len for x in b]).astype("int64").reshape([-1, max_len])
[[x[3]] * max_len for x in b]).astype("int64").reshape(
[-1, max_len])
res = []
for i in range(len(b)):
res.append([
......@@ -127,4 +131,5 @@ class TrainReader(Reader):
def generate_batch_from_trainfiles(self, files):
data_set = self.base_read(files)
random.shuffle(data_set)
return self.batch_reader(data_set, self.batch_size, self.batch_size * 20)
return self.batch_reader(data_set, self.batch_size,
self.batch_size * 20)
......@@ -32,6 +32,7 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
"""
Read the data line by line and process it as a dictionary
"""
def reader():
"""
This function needs to be implemented by the user, based on data format
......@@ -57,11 +58,12 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
feature_name.append("label")
s = "click:" + str(label[0])
for i in dense_feature:
s += " dense_feature:" + str(i)
s += " dense_feature:" + str(i)
for i in range(1, 1 + len(categorical_range_)):
s += " " + str(i) + ":" + str(sparse_feature[i-1][0])
s += " " + str(i) + ":" + str(sparse_feature[i - 1][0])
print s.strip()
yield None
return reader
......
......@@ -31,8 +31,10 @@ class Model(ModelBase):
def net(self):
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
def embedding_layer(input):
emb = fluid.layers.embedding(
......@@ -42,25 +44,27 @@ class Model(ModelBase):
size=[sparse_feature_number, sparse_feature_dim],
param_attr=fluid.ParamAttr(
name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()),
)
emb_sum = fluid.layers.sequence_pool(
input=emb, pool_type='sum')
initializer=fluid.initializer.Uniform()), )
emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
return emb_sum
def fc(input, output_size):
output = fluid.layers.fc(
input=input, size=output_size,
act='relu', param_attr=fluid.ParamAttr(
input=input,
size=output_size,
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1.0 / math.sqrt(input.shape[1]))))
return output
sparse_embed_seq = list(map(embedding_layer, self.sparse_inputs))
concated = fluid.layers.concat(sparse_embed_seq + [self.dense_input], axis=1)
concated = fluid.layers.concat(
sparse_embed_seq + [self.dense_input], axis=1)
fcs = [concated]
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
self._namespace)
for size in hidden_layers:
fcs.append(fc(fcs[-1], size))
......@@ -75,14 +79,15 @@ class Model(ModelBase):
self.predict = predict
def avg_loss(self):
cost = fluid.layers.cross_entropy(input=self.predict, label=self.label_input)
cost = fluid.layers.cross_entropy(
input=self.predict, label=self.label_input)
avg_cost = fluid.layers.reduce_mean(cost)
self._cost = avg_cost
def metrics(self):
auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
label=self.label_input,
num_thresholds=2 ** 12,
num_thresholds=2**12,
slide_steps=20)
self._metrics["AUC"] = auc
self._metrics["BATCH_AUC"] = batch_auc
......@@ -95,7 +100,8 @@ class Model(ModelBase):
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import io
import args
import pandas as pd
from sklearn import preprocessing
def _clean_file(source_path,target_path):
def _clean_file(source_path, target_path):
"""makes changes to match the CSV format."""
with io.open(source_path, 'r') as temp_eval_file:
with io.open(target_path, 'w') as eval_file:
......@@ -17,15 +32,16 @@ def _clean_file(source_path,target_path):
line = line[:-1]
line += '\n'
eval_file.write(line)
def build_model_columns(train_data_path, test_data_path):
# The column names are from
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
column_names = [
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_bracket'
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_bracket'
]
# Load the dataset in Pandas
......@@ -44,61 +60,92 @@ def build_model_columns(train_data_path, test_data_path):
# First group of tasks according to the paper
#label_columns = ['income_50k', 'marital_stat']
categorical_columns = ['education','marital_status','relationship','workclass','occupation']
categorical_columns = [
'education', 'marital_status', 'relationship', 'workclass',
'occupation'
]
for col in categorical_columns:
label_train = preprocessing.LabelEncoder()
train_df[col]= label_train.fit_transform(train_df[col])
train_df[col] = label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder()
test_df[col]= label_test.fit_transform(test_df[col])
bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(), bins,labels=False)
test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(), bins,labels=False)
base_columns = ['education', 'marital_status', 'relationship', 'workclass', 'occupation', 'age_buckets']
train_df['education_occupation'] = train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)
test_df['education_occupation'] = test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
train_df['age_buckets_education_occupation'] = train_df['age_buckets'].astype(str) + '_' + train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)
test_df['age_buckets_education_occupation'] = test_df['age_buckets'].astype(str) + '_' + test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
crossed_columns = ['education_occupation','age_buckets_education_occupation']
test_df[col] = label_test.fit_transform(test_df[col])
bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(),
bins,
labels=False)
test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(),
bins,
labels=False)
base_columns = [
'education', 'marital_status', 'relationship', 'workclass',
'occupation', 'age_buckets'
]
train_df['education_occupation'] = train_df['education'].astype(
str) + '_' + train_df['occupation'].astype(str)
test_df['education_occupation'] = test_df['education'].astype(
str) + '_' + test_df['occupation'].astype(str)
train_df['age_buckets_education_occupation'] = train_df[
'age_buckets'].astype(str) + '_' + train_df['education'].astype(
str) + '_' + train_df['occupation'].astype(str)
test_df['age_buckets_education_occupation'] = test_df[
'age_buckets'].astype(str) + '_' + test_df['education'].astype(
str) + '_' + test_df['occupation'].astype(str)
crossed_columns = [
'education_occupation', 'age_buckets_education_occupation'
]
for col in crossed_columns:
label_train = preprocessing.LabelEncoder()
train_df[col]= label_train.fit_transform(train_df[col])
train_df[col] = label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder()
test_df[col]= label_test.fit_transform(test_df[col])
test_df[col] = label_test.fit_transform(test_df[col])
wide_columns = base_columns + crossed_columns
train_df_temp = pd.get_dummies(train_df[categorical_columns],columns=categorical_columns)
test_df_temp = pd.get_dummies(test_df[categorical_columns], columns=categorical_columns)
train_df_temp = pd.get_dummies(
train_df[categorical_columns], columns=categorical_columns)
test_df_temp = pd.get_dummies(
test_df[categorical_columns], columns=categorical_columns)
train_df = train_df.join(train_df_temp)
test_df = test_df.join(test_df_temp)
deep_columns = list(train_df_temp.columns)+ ['age','education_num','capital_gain','capital_loss','hours_per_week']
train_df['label'] = train_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
test_df['label'] = test_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
with io.open('train_data/columns.txt','w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
deep_columns = list(train_df_temp.columns) + [
'age', 'education_num', 'capital_gain', 'capital_loss',
'hours_per_week'
]
train_df['label'] = train_df['income_bracket'].apply(
lambda x: 1 if x == '>50K' else 0)
test_df['label'] = test_df['income_bracket'].apply(
lambda x: 1 if x == '>50K' else 0)
with io.open('train_data/columns.txt', 'w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(
deep_columns)) + '\n'
f.write(write_str)
f.close()
with io.open('test_data/columns.txt','w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
with io.open('test_data/columns.txt', 'w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(
deep_columns)) + '\n'
f.write(write_str)
f.close()
train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(train_data_path,index=False)
test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(test_data_path,index=False)
train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
train_data_path, index=False)
test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
test_data_path, index=False)
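# In summary: wide_columns are the label-encoded categorical ids plus the two
# crossed features, deep_columns are the one-hot dummies of the categoricals
# plus the continuous fields, and the label is 1 when income_bracket is '>50K'.
# Both splits are written out with fillna(0) so the readers see dense CSVs.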
def clean_file(train_path, test_path, train_data_path, test_data_path):
_clean_file(train_path, train_data_path)
_clean_file(test_path, test_data_path)
if __name__ == '__main__':
args = args.parse_args()
clean_file(args.train_path, args.test_path, args.train_data_path, args.test_data_path)
clean_file(args.train_path, args.test_path, args.train_data_path,
args.test_data_path)
build_model_columns(args.train_data_path, args.test_data_path)
......@@ -20,6 +20,7 @@ except ImportError:
import pickle
import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
......@@ -50,7 +51,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
wide_feat, deep_deat, label = self._process_line(line)
s = ""
for i in [('wide_input', wide_feat), ('deep_input', deep_deat), ('label', label)]:
for i in [('wide_input', wide_feat), ('deep_input', deep_deat),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
......@@ -60,6 +62,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
......@@ -25,27 +25,27 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def wide_part(self, data):
out = fluid.layers.fc(input=data,
size=1,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0,
scale=1.0 / math.sqrt(
data.shape[
1])),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
act=None,
name='wide')
out = fluid.layers.fc(
input=data,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0 / math.sqrt(data.shape[1])),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
act=None,
name='wide')
return out
def fc(self, data, hidden_units, active, tag):
output = fluid.layers.fc(input=data,
size=hidden_units,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0,
scale=1.0 / math.sqrt(
data.shape[
1]))),
act=active,
name=tag)
output = fluid.layers.fc(
input=data,
size=hidden_units,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))),
act=active,
name=tag)
return output
......@@ -62,43 +62,63 @@ class Model(ModelBase):
deep_input = self._dense_data_var[1]
label = self._sparse_data_var[0]
hidden1_units = envs.get_global_env("hyper_parameters.hidden1_units", 75, self._namespace)
hidden2_units = envs.get_global_env("hyper_parameters.hidden2_units", 50, self._namespace)
hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units", 25, self._namespace)
hidden1_units = envs.get_global_env("hyper_parameters.hidden1_units",
75, self._namespace)
hidden2_units = envs.get_global_env("hyper_parameters.hidden2_units",
50, self._namespace)
hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units",
25, self._namespace)
wide_output = self.wide_part(wide_input)
deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units, hidden3_units)
wide_model = fluid.layers.fc(input=wide_output,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
act=None,
name='w_wide')
deep_model = fluid.layers.fc(input=deep_output,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
act=None,
name='w_deep')
deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units,
hidden3_units)
wide_model = fluid.layers.fc(
input=wide_output,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0)),
act=None,
name='w_wide')
deep_model = fluid.layers.fc(
input=deep_output,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0)),
act=None,
name='w_deep')
prediction = fluid.layers.elementwise_add(wide_model, deep_model)
pred = fluid.layers.sigmoid(fluid.layers.clip(prediction, min=-15.0, max=15.0), name="prediction")
pred = fluid.layers.sigmoid(
fluid.layers.clip(
prediction, min=-15.0, max=15.0),
name="prediction")
num_seqs = fluid.layers.create_tensor(dtype='int64')
acc = fluid.layers.accuracy(input=pred, label=fluid.layers.cast(x=label, dtype='int64'), total=num_seqs)
auc_var, batch_auc, auc_states = fluid.layers.auc(input=pred, label=fluid.layers.cast(x=label, dtype='int64'))
acc = fluid.layers.accuracy(
input=pred,
label=fluid.layers.cast(
x=label, dtype='int64'),
total=num_seqs)
auc_var, batch_auc, auc_states = fluid.layers.auc(
input=pred, label=fluid.layers.cast(
x=label, dtype='int64'))
self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc
self._metrics["ACC"] = acc
cost = fluid.layers.sigmoid_cross_entropy_with_logits(x=prediction, label=fluid.layers.cast(label, dtype='float32'))
cost = fluid.layers.sigmoid_cross_entropy_with_logits(
x=prediction, label=fluid.layers.cast(
label, dtype='float32'))
avg_cost = fluid.layers.mean(cost)
self._cost = avg_cost
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import sys
......
......@@ -21,6 +21,7 @@ except ImportError:
import pickle
import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
......@@ -48,7 +49,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
feat_idx, feat_value, label = self._process_line(line)
s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]:
for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
......@@ -58,6 +60,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
......@@ -28,18 +28,22 @@ class Model(ModelBase):
loc=0.0, scale=init_value_)
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
# ------------------------- network input --------------------------
num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace)
num_field = envs.get_global_env("hyper_parameters.num_field", None,
self._namespace)
raw_feat_idx = self._sparse_data_var[1]
raw_feat_value = self._dense_data_var[0]
self.label = self._sparse_data_var[0]
feat_idx = raw_feat_idx
feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
feat_value = fluid.layers.reshape(
raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
feat_embeddings = fluid.embedding(
input=feat_idx,
......@@ -48,9 +52,9 @@ class Model(ModelBase):
size=[sparse_feature_number + 1, sparse_feature_dim],
padding_idx=0,
param_attr=fluid.ParamAttr(initializer=initer))
feat_embeddings = fluid.layers.reshape(
feat_embeddings,
[-1, num_field, sparse_feature_dim]) # None * num_field * embedding_size
feat_embeddings = fluid.layers.reshape(feat_embeddings, [
-1, num_field, sparse_feature_dim
]) # None * num_field * embedding_size
feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size
# -------------------- linear --------------------
......@@ -73,7 +77,8 @@ class Model(ModelBase):
# -------------------- CIN --------------------
layer_sizes_cin = envs.get_global_env("hyper_parameters.layer_sizes_cin", None, self._namespace)
layer_sizes_cin = envs.get_global_env(
"hyper_parameters.layer_sizes_cin", None, self._namespace)
Xs = [feat_embeddings]
last_s = num_field
for s in layer_sizes_cin:
......@@ -84,7 +89,8 @@ class Model(ModelBase):
1]) # None, embedding_size, num_field, 1
X_k = fluid.layers.reshape(
fluid.layers.transpose(Xs[-1], [0, 2, 1]),
[-1, sparse_feature_dim, 1, last_s]) # None, embedding_size, 1, last_s
[-1, sparse_feature_dim, 1,
last_s]) # None, embedding_size, 1, last_s
Z_k_1 = fluid.layers.matmul(
X_0, X_k) # None, embedding_size, num_field, last_s
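# For each embedding dimension, Z_k_1 holds the pairwise products between the
# original field embeddings (X_0) and the previous CIN layer's feature maps
# (X_k); each CIN layer derives its next layer_sizes_cin feature maps from
# this interaction tensor.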
......@@ -124,16 +130,19 @@ class Model(ModelBase):
# -------------------- DNN --------------------
layer_sizes_dnn = envs.get_global_env("hyper_parameters.layer_sizes_dnn", None, self._namespace)
act = envs.get_global_env("hyper_parameters.act", None, self._namespace)
layer_sizes_dnn = envs.get_global_env(
"hyper_parameters.layer_sizes_dnn", None, self._namespace)
act = envs.get_global_env("hyper_parameters.act", None,
self._namespace)
y_dnn = fluid.layers.reshape(feat_embeddings,
[-1, num_field * sparse_feature_dim])
for s in layer_sizes_dnn:
y_dnn = fluid.layers.fc(input=y_dnn,
size=s,
act=act,
param_attr=fluid.ParamAttr(initializer=initer),
bias_attr=None)
y_dnn = fluid.layers.fc(
input=y_dnn,
size=s,
act=act,
param_attr=fluid.ParamAttr(initializer=initer),
bias_attr=None)
y_dnn = fluid.layers.fc(input=y_dnn,
size=1,
act=None,
......@@ -148,7 +157,10 @@ class Model(ModelBase):
self.model._init_slots()
self.xdeepfm_net()
cost = fluid.layers.log_loss(input=self.predict, label=fluid.layers.cast(self.label, "float32"), epsilon=0.0000001)
cost = fluid.layers.log_loss(
input=self.predict,
label=fluid.layers.cast(self.label, "float32"),
epsilon=0.0000001)
batch_cost = fluid.layers.reduce_mean(cost)
self._cost = batch_cost
......@@ -162,7 +174,8 @@ class Model(ModelBase):
self._metrics["BATCH_AUC"] = batch_auc_var
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -31,5 +31,3 @@ mv diginetica/train.txt train_data
mkdir test_data
mv diginetica/test.txt test_data
......@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
def init(self):
self.batch_size = envs.get_global_env("batch_size", None, "evaluate.reader")
self.batch_size = envs.get_global_env("batch_size", None,
"evaluate.reader")
self.input = []
self.length = None
......@@ -34,7 +35,8 @@ class EvaluateReader(Reader):
with open(f, "r") as fin:
for line in fin:
line = line.strip().split('\t')
res.append(tuple([map(int, line[0].split(',')), int(line[1])]))
res.append(
tuple([map(int, line[0].split(',')), int(line[1])]))
return res
def make_data(self, cur_batch, batch_size):
......@@ -75,10 +77,8 @@ class EvaluateReader(Reader):
u_deg_out[np.where(u_deg_out == 0)] = 1
adj_out.append(np.divide(adj.transpose(), u_deg_out).transpose())
seq_index.append(
[[id, np.where(node == i)[0][0]] for i in e[0]])
last_index.append(
[id, np.where(node == e[0][last_id[id]])[0][0]])
seq_index.append([[id, np.where(node == i)[0][0]] for i in e[0]])
last_index.append([id, np.where(node == e[0][last_id[id]])[0][0]])
label.append(e[1] - 1)
mask.append([[1] * (last_id[id] + 1) + [0] *
(max_seq_len - last_id[id] - 1)])
......@@ -101,10 +101,13 @@ class EvaluateReader(Reader):
def _reader():
random.shuffle(self.input)
group_remain = self.length % batch_group_size
for bg_id in range(0, self.length - group_remain, batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id + batch_group_size])
for bg_id in range(0, self.length - group_remain,
batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id +
batch_group_size])
if train:
cur_bg = sorted(cur_bg, key=lambda x: len(x[0]), reverse=True)
cur_bg = sorted(
cur_bg, key=lambda x: len(x[0]), reverse=True)
for i in range(0, batch_group_size, batch_size):
cur_batch = cur_bg[i:i + batch_size]
yield self.make_data(cur_batch, batch_size)
......
......@@ -30,15 +30,21 @@ class Model(ModelBase):
def init_config(self):
self._fetch_interval = 1
self.items_num, self.ins_num = self.config_read(
envs.get_global_env("hyper_parameters.config_path", None, self._namespace))
self.train_batch_size = envs.get_global_env("batch_size", None, "train.reader")
self.evaluate_batch_size = envs.get_global_env("batch_size", None, "evaluate.reader")
self.hidden_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
self.step = envs.get_global_env("hyper_parameters.gnn_propogation_steps", None, self._namespace)
envs.get_global_env("hyper_parameters.config_path", None,
self._namespace))
self.train_batch_size = envs.get_global_env("batch_size", None,
"train.reader")
self.evaluate_batch_size = envs.get_global_env("batch_size", None,
"evaluate.reader")
self.hidden_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
self.step = envs.get_global_env(
"hyper_parameters.gnn_propogation_steps", None, self._namespace)
def config_read(self, config_path=None):
if config_path is None:
raise ValueError("please set train.model.hyper_parameters.config_path at first")
raise ValueError(
"please set train.model.hyper_parameters.config_path at first")
with open(config_path, "r") as fin:
item_nums = int(fin.readline().strip())
ins_nums = int(fin.readline().strip())
......@@ -46,100 +52,108 @@ class Model(ModelBase):
def input(self, bs):
self.items = fluid.data(
name="items",
shape=[bs, -1],
name="items", shape=[bs, -1],
dtype="int64") # [batch_size, uniq_max]
self.seq_index = fluid.data(
name="seq_index",
shape=[bs, -1, 2],
name="seq_index", shape=[bs, -1, 2],
dtype="int32") # [batch_size, seq_max, 2]
self.last_index = fluid.data(
name="last_index",
shape=[bs, 2],
dtype="int32") # [batch_size, 2]
name="last_index", shape=[bs, 2], dtype="int32") # [batch_size, 2]
self.adj_in = fluid.data(
name="adj_in",
shape=[bs, -1, -1],
name="adj_in", shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max]
self.adj_out = fluid.data(
name="adj_out",
shape=[bs, -1, -1],
name="adj_out", shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max]
self.mask = fluid.data(
name="mask",
shape=[bs, -1, 1],
name="mask", shape=[bs, -1, 1],
dtype="float32") # [batch_size, seq_max, 1]
self.label = fluid.data(
name="label",
shape=[bs, 1],
dtype="int64") # [batch_size, 1]
name="label", shape=[bs, 1], dtype="int64") # [batch_size, 1]
res = [self.items, self.seq_index, self.last_index, self.adj_in, self.adj_out, self.mask, self.label]
res = [
self.items, self.seq_index, self.last_index, self.adj_in,
self.adj_out, self.mask, self.label
]
return res
def train_input(self):
res = self.input(self.train_batch_size)
self._data_var = res
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace)
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
False, self._namespace)
if self._platform != "LINUX" or use_dataloader:
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=256, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=256,
use_double_buffer=False,
iterable=False)
def net(self, items_num, hidden_size, step, bs):
stdv = 1.0 / math.sqrt(hidden_size)
def embedding_layer(input, table_name, emb_dim, initializer_instance=None):
def embedding_layer(input,
table_name,
emb_dim,
initializer_instance=None):
emb = fluid.embedding(
input=input,
size=[items_num, emb_dim],
param_attr=fluid.ParamAttr(
name=table_name,
initializer=initializer_instance),
)
name=table_name, initializer=initializer_instance), )
return emb
sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv)
items_emb = embedding_layer(self.items, "emb", hidden_size, sparse_initializer)
items_emb = embedding_layer(self.items, "emb", hidden_size,
sparse_initializer)
pre_state = items_emb
for i in range(step):
pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size])
pre_state = layers.reshape(
x=pre_state, shape=[bs, -1, hidden_size])
state_in = layers.fc(
input=pre_state,
name="state_in",
size=hidden_size,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_out = layers.fc(
input=pre_state,
name="state_out",
size=hidden_size,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_adj_in = layers.matmul(self.adj_in, state_in) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(self.adj_out, state_out) # [batch_size, uniq_max, h]
state_adj_in = layers.matmul(self.adj_in,
state_in) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(
self.adj_out, state_out) # [batch_size, uniq_max, h]
gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
gru_fc = layers.fc(
input=gru_input,
name="gru_fc",
size=3 * hidden_size,
bias_attr=False)
gru_input = layers.reshape(
x=gru_input, shape=[-1, hidden_size * 2])
gru_fc = layers.fc(input=gru_input,
name="gru_fc",
size=3 * hidden_size,
bias_attr=False)
pre_state, _, _ = fluid.layers.gru_unit(
input=gru_fc,
hidden=layers.reshape(x=pre_state, shape=[-1, hidden_size]),
hidden=layers.reshape(
x=pre_state, shape=[-1, hidden_size]),
size=3 * hidden_size)
final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
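# One propagation step of the session graph: messages are aggregated along
# incoming and outgoing edges (the adj_in / adj_out matmuls), concatenated,
# projected by gru_fc, and used to update the node states through a GRU cell.
# The loop repeats this `step` times before the attention readout below.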
......@@ -153,24 +167,22 @@ class Model(ModelBase):
bias_attr=False,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, h]
last_fc = layers.fc(
input=last,
name="last_fc",
size=hidden_size,
bias_attr=False,
act=None,
num_flatten_dims=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, h]
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, h]
last_fc = layers.fc(input=last,
name="last_fc",
size=hidden_size,
bias_attr=False,
act=None,
num_flatten_dims=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, h]
seq_fc_t = layers.transpose(
seq_fc, perm=[1, 0, 2]) # [seq_max, batch_size, h]
add = layers.elementwise_add(
seq_fc_t, last_fc) # [seq_max, batch_size, h]
add = layers.elementwise_add(seq_fc_t,
last_fc) # [seq_max, batch_size, h]
b = layers.create_parameter(
shape=[hidden_size],
dtype='float32',
......@@ -188,12 +200,13 @@ class Model(ModelBase):
act=None,
num_flatten_dims=2,
bias_attr=False,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, 1]
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, 1]
weight *= self.mask
weight_mask = layers.elementwise_mul(seq, weight, axis=0) # [batch_size, seq_max, h]
global_attention = layers.reduce_sum(weight_mask, dim=1) # [batch_size, h]
weight_mask = layers.elementwise_mul(
seq, weight, axis=0) # [batch_size, seq_max, h]
global_attention = layers.reduce_sum(
weight_mask, dim=1) # [batch_size, h]
final_attention = layers.concat(
[global_attention, last], axis=1) # [batch_size, 2*h]
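# The session is represented by concatenating the attention-weighted sum over
# all node states (the global preference) with the state of the last clicked
# item (the current interest), giving a [batch_size, 2*h] vector that is then
# compared against the embeddings of the full item vocabulary (all_vocab) to
# score candidates.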
......@@ -213,7 +226,8 @@ class Model(ModelBase):
# persistable=True,
# name="all_vocab")
all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32')
all_vocab = fluid.layers.cast(x=fluid.layers.assign(all_vocab), dtype='int64')
all_vocab = fluid.layers.cast(
x=fluid.layers.assign(all_vocab), dtype='int64')
all_emb = fluid.embedding(
input=all_vocab,
......@@ -240,15 +254,19 @@ class Model(ModelBase):
def train_net(self):
self.train_input()
self.net(self.items_num, self.hidden_size, self.step, self.train_batch_size)
self.net(self.items_num, self.hidden_size, self.step,
self.train_batch_size)
self.avg_loss()
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
step_per_epoch = self.ins_num // self.train_batch_size
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None, self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None, self._namespace)
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
self._namespace)
l2 = envs.get_global_env("hyper_parameters.l2", None, self._namespace)
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
......@@ -266,10 +284,14 @@ class Model(ModelBase):
self._infer_data_var = res
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
self.net(self.items_num, self.hidden_size, self.step, self.evaluate_batch_size)
self.net(self.items_num, self.hidden_size, self.step,
self.evaluate_batch_size)
self._infer_results['acc'] = self.acc
self._infer_results['loss'] = self.loss
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import time
import pickle
......@@ -10,6 +24,7 @@ parser.add_argument(
help='dataset dir: diginetica/yoochoose1_4/yoochoose1_64/sample')
opt = parser.parse_args()
def process_data(file_type):
path = os.path.join(opt.data_dir, file_type)
output_path = os.path.splitext(path)[0] + ".txt"
......@@ -23,6 +38,7 @@ def process_data(file_type):
fout.write(str(data[i][1]))
fout.write("\n")
process_data("train")
process_data("test")
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import requests
import sys
import time
......
......@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.batch_size = envs.get_global_env("batch_size", None, "train.reader")
self.batch_size = envs.get_global_env("batch_size", None,
"train.reader")
self.input = []
self.length = None
......@@ -34,7 +35,8 @@ class TrainReader(Reader):
with open(f, "r") as fin:
for line in fin:
line = line.strip().split('\t')
res.append(tuple([map(int, line[0].split(',')), int(line[1])]))
res.append(
tuple([map(int, line[0].split(',')), int(line[1])]))
return res
def make_data(self, cur_batch, batch_size):
......@@ -75,10 +77,8 @@ class TrainReader(Reader):
u_deg_out[np.where(u_deg_out == 0)] = 1
adj_out.append(np.divide(adj.transpose(), u_deg_out).transpose())
seq_index.append(
[[id, np.where(node == i)[0][0]] for i in e[0]])
last_index.append(
[id, np.where(node == e[0][last_id[id]])[0][0]])
seq_index.append([[id, np.where(node == i)[0][0]] for i in e[0]])
last_index.append([id, np.where(node == e[0][last_id[id]])[0][0]])
label.append(e[1] - 1)
mask.append([[1] * (last_id[id] + 1) + [0] *
(max_seq_len - last_id[id] - 1)])
......@@ -101,10 +101,13 @@ class TrainReader(Reader):
def _reader():
random.shuffle(self.input)
group_remain = self.length % batch_group_size
for bg_id in range(0, self.length - group_remain, batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id + batch_group_size])
for bg_id in range(0, self.length - group_remain,
batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id +
batch_group_size])
if train:
cur_bg = sorted(cur_bg, key=lambda x: len(x[0]), reverse=True)
cur_bg = sorted(
cur_bg, key=lambda x: len(x[0]), reverse=True)
for i in range(0, batch_group_size, batch_size):
cur_batch = cur_bg[i:i + batch_size]
yield self.make_data(cur_batch, batch_size)
......
......@@ -24,14 +24,22 @@ class Model(ModelBase):
def all_vocab_network(self, is_infer=False):
""" network definition """
recall_k = envs.get_global_env("hyper_parameters.recall_k", None, self._namespace)
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
hid_size = envs.get_global_env("hyper_parameters.hid_size", None, self._namespace)
init_low_bound = envs.get_global_env("hyper_parameters.init_low_bound", None, self._namespace)
init_high_bound = envs.get_global_env("hyper_parameters.init_high_bound", None, self._namespace)
emb_lr_x = envs.get_global_env("hyper_parameters.emb_lr_x", None, self._namespace)
gru_lr_x = envs.get_global_env("hyper_parameters.gru_lr_x", None, self._namespace)
fc_lr_x = envs.get_global_env("hyper_parameters.fc_lr_x", None, self._namespace)
recall_k = envs.get_global_env("hyper_parameters.recall_k", None,
self._namespace)
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
self._namespace)
hid_size = envs.get_global_env("hyper_parameters.hid_size", None,
self._namespace)
init_low_bound = envs.get_global_env("hyper_parameters.init_low_bound",
None, self._namespace)
init_high_bound = envs.get_global_env(
"hyper_parameters.init_high_bound", None, self._namespace)
emb_lr_x = envs.get_global_env("hyper_parameters.emb_lr_x", None,
self._namespace)
gru_lr_x = envs.get_global_env("hyper_parameters.gru_lr_x", None,
self._namespace)
fc_lr_x = envs.get_global_env("hyper_parameters.fc_lr_x", None,
self._namespace)
# Input data
src_wordseq = fluid.data(
name="src_wordseq", shape=[None, 1], dtype="int64", lod_level=1)
......@@ -41,7 +49,10 @@ class Model(ModelBase):
if is_infer:
self._infer_data_var = [src_wordseq, dst_wordseq]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
emb = fluid.embedding(
input=src_wordseq,
......@@ -56,7 +67,8 @@ class Model(ModelBase):
size=hid_size * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
low=init_low_bound,
high=init_high_bound),
learning_rate=gru_lr_x))
gru_h0 = fluid.layers.dynamic_gru(
input=fc0,
......