提交 53c664a9 编写于 作者: C chengmo

fix clnflict

repos:
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
sha: v1.0.1
hooks:
- id: remove-crlf
files: (?!.*third_party)^.*$ | (?!.*book)^.*$
- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
hooks:
- id: yapf
files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
- repo: https://github.com/pre-commit/pre-commit-hooks
sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
hooks:
- id: check-added-large-files
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
files: (?!.*third_party)^.*$ | (?!.*book)^.*$
- id: end-of-file-fixer
- repo: local
hooks:
- id: copyright_checker
name: copyright_checker
entry: python ./tools/codestyle/copyright.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
language: generic
sudo: required
dist: trusty
services:
- docker
os:
- linux
env:
- JOB=check_style
before_install:
# For pylint dockstring checker
- sudo pip install pylint pytest astroid isort pre-commit
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
- "travis_wait 30 sleep 1800 &"
- |
# 43min timeout
tools/build_script.sh ${JOB}
if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
notifications:
email:
on_success: change
on_failure: always
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -27,6 +27,7 @@ from paddlerec.core.utils import envs
class ClusterEngine(Engine):
def __init_impl__(self):
abs_dir = os.path.dirname(os.path.abspath(__file__))
backend = envs.get_runtime_environ("engine_backend")
if backend == "PaddleCloud":
self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
......@@ -57,4 +58,5 @@ class ClusterEngine(Engine):
self.start_worker_procs()
else:
raise ValueError("role {} error, must in MASTER/WORKER".format(role))
raise ValueError("role {} error, must in MASTER/WORKER".format(
role))
......@@ -46,10 +46,13 @@ class LocalClusterEngine(Engine):
ports.append(new_port)
break
user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
user_endpoints_ips = [x.split(":")[0]
for x in user_endpoints.split(",")]
user_endpoints_port = [x.split(":")[1]
for x in user_endpoints.split(",")]
user_endpoints_ips = [
x.split(":")[0] for x in user_endpoints.split(",")
]
user_endpoints_port = [
x.split(":")[1] for x in user_endpoints.split(",")
]
factory = "paddlerec.core.factory"
cmd = [sys.executable, "-u", "-m", factory, self.trainer]
......@@ -97,8 +100,10 @@ class LocalClusterEngine(Engine):
if len(log_fns) > 0:
log_fns[i].close()
procs[i].terminate()
print("all workers already completed, you can view logs under the `{}` directory".format(logs_dir),
file=sys.stderr)
print(
"all workers already completed, you can view logs under the `{}` directory".
format(logs_dir),
file=sys.stderr)
def run(self):
self.start_procs()
......@@ -26,7 +26,6 @@ from paddlerec.core.engine.engine import Engine
class LocalMPIEngine(Engine):
def start_procs(self):
logs_dir = self.envs["log_dir"]
default_env = os.environ.copy()
current_env = copy.copy(default_env)
current_env.pop("http_proxy", None)
......@@ -42,7 +41,8 @@ class LocalMPIEngine(Engine):
os.system("mkdir -p {}".format(logs_dir))
fn = open("%s/job.log" % logs_dir, "w")
log_fns.append(fn)
proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd())
proc = subprocess.Popen(
cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd())
else:
proc = subprocess.Popen(cmd, env=current_env, cwd=os.getcwd())
procs.append(proc)
......@@ -51,7 +51,9 @@ class LocalMPIEngine(Engine):
if len(log_fns) > 0:
log_fns[i].close()
procs[i].wait()
print("all workers and parameter servers already completed", file=sys.stderr)
print(
"all workers and parameter servers already completed",
file=sys.stderr)
def run(self):
self.start_procs()
......@@ -19,24 +19,23 @@ import yaml
from paddlerec.core.utils import envs
trainer_abs = os.path.join(os.path.dirname(
os.path.abspath(__file__)), "trainers")
trainer_abs = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "trainers")
trainers = {}
def trainer_registry():
trainers["SingleTrainer"] = os.path.join(
trainer_abs, "single_trainer.py")
trainers["ClusterTrainer"] = os.path.join(
trainer_abs, "cluster_trainer.py")
trainers["CtrCodingTrainer"] = os.path.join(
trainer_abs, "ctr_coding_trainer.py")
trainers["CtrModulTrainer"] = os.path.join(
trainer_abs, "ctr_modul_trainer.py")
trainers["TDMSingleTrainer"] = os.path.join(
trainer_abs, "tdm_single_trainer.py")
trainers["TDMClusterTrainer"] = os.path.join(
trainer_abs, "tdm_cluster_trainer.py")
trainers["SingleTrainer"] = os.path.join(trainer_abs, "single_trainer.py")
trainers["ClusterTrainer"] = os.path.join(trainer_abs,
"cluster_trainer.py")
trainers["CtrCodingTrainer"] = os.path.join(trainer_abs,
"ctr_coding_trainer.py")
trainers["CtrModulTrainer"] = os.path.join(trainer_abs,
"ctr_modul_trainer.py")
trainers["TDMSingleTrainer"] = os.path.join(trainer_abs,
"tdm_single_trainer.py")
trainers["TDMClusterTrainer"] = os.path.join(trainer_abs,
"tdm_cluster_trainer.py")
trainer_registry()
......@@ -55,8 +54,8 @@ class TrainerFactory(object):
if trainer_abs is None:
if not os.path.isfile(train_mode):
raise IOError(
"trainer {} can not be recognized".format(train_mode))
raise IOError("trainer {} can not be recognized".format(
train_mode))
trainer_abs = train_mode
train_mode = "UserDefineTrainer"
......
......@@ -22,7 +22,7 @@ from paddlerec.core.metric import Metric
class AUCMetric(Metric):
"""
Metric For Paddle Model
Metric For Fluid Model
"""
def __init__(self, config, fleet):
......@@ -83,7 +83,8 @@ class AUCMetric(Metric):
if scope.find_var(metric_item['var'].name) is None:
result[metric_name] = None
continue
result[metric_name] = self.get_metric(scope, metric_item['var'].name)
result[metric_name] = self.get_metric(scope,
metric_item['var'].name)
return result
def calculate_auc(self, global_pos, global_neg):
......@@ -178,14 +179,18 @@ class AUCMetric(Metric):
self._result['mean_q'] = 0
return self._result
if 'stat_pos' in result and 'stat_neg' in result:
result['auc'] = self.calculate_auc(result['stat_pos'], result['stat_neg'])
result['bucket_error'] = self.calculate_auc(result['stat_pos'], result['stat_neg'])
result['auc'] = self.calculate_auc(result['stat_pos'],
result['stat_neg'])
result['bucket_error'] = self.calculate_auc(result['stat_pos'],
result['stat_neg'])
if 'pos_ins_num' in result:
result['actual_ctr'] = result['pos_ins_num'] / result['total_ins_num']
result['actual_ctr'] = result['pos_ins_num'] / result[
'total_ins_num']
if 'abserr' in result:
result['mae'] = result['abserr'] / result['total_ins_num']
if 'sqrerr' in result:
result['rmse'] = math.sqrt(result['sqrerr'] / result['total_ins_num'])
result['rmse'] = math.sqrt(result['sqrerr'] /
result['total_ins_num'])
if 'prob' in result:
result['predict_ctr'] = result['prob'] / result['total_ins_num']
if abs(result['predict_ctr']) > 1e-6:
......
......@@ -20,7 +20,7 @@ from paddlerec.core.utils import envs
class Model(object):
"""R
"""Base Model
"""
__metaclass__ = abc.ABCMeta
......@@ -38,6 +38,45 @@ class Model(object):
self._namespace = "train.model"
self._platform = envs.get_platform()
def _init_slots(self):
sparse_slots = envs.get_global_env("sparse_slots", None,
"train.reader")
dense_slots = envs.get_global_env("dense_slots", None, "train.reader")
if sparse_slots is not None or dense_slots is not None:
sparse_slots = sparse_slots.strip().split(" ")
dense_slots = dense_slots.strip().split(" ")
dense_slots_shape = [[
int(j) for j in i.split(":")[1].strip("[]").split(",")
] for i in dense_slots]
dense_slots = [i.split(":")[0] for i in dense_slots]
self._dense_data_var = []
for i in range(len(dense_slots)):
l = fluid.layers.data(
name=dense_slots[i],
shape=dense_slots_shape[i],
dtype="float32")
self._data_var.append(l)
self._dense_data_var.append(l)
self._sparse_data_var = []
for name in sparse_slots:
l = fluid.layers.data(
name=name, shape=[1], lod_level=1, dtype="int64")
self._data_var.append(l)
self._sparse_data_var.append(l)
dataset_class = envs.get_global_env("dataset_class", None,
"train.reader")
if dataset_class == "DataLoader":
self._init_dataloader()
def _init_dataloader(self):
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def get_inputs(self):
return self._data_var
......@@ -68,8 +107,8 @@ class Model(object):
"configured optimizer can only supported SGD/Adam/Adagrad")
if name == "SGD":
reg = envs.get_global_env(
"hyper_parameters.reg", 0.0001, self._namespace)
reg = envs.get_global_env("hyper_parameters.reg", 0.0001,
self._namespace)
optimizer_i = fluid.optimizer.SGD(
lr, regularization=fluid.regularizer.L2DecayRegularizer(reg))
elif name == "ADAM":
......@@ -83,10 +122,10 @@ class Model(object):
return optimizer_i
def optimizer(self):
learning_rate = envs.get_global_env(
"hyper_parameters.learning_rate", None, self._namespace)
optimizer = envs.get_global_env(
"hyper_parameters.optimizer", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = envs.get_global_env("hyper_parameters.optimizer", None,
self._namespace)
print(">>>>>>>>>>>.learnig rate: %s" % learning_rate)
return self._build_optimizer(optimizer, learning_rate)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -31,6 +31,7 @@ def create(config):
Model Instance
"""
model = None
if config['mode'] == 'fluid':
model = YamlModel(config)
model.train_net()
......@@ -50,7 +51,12 @@ class YamlModel(Model):
f = open(config['layer_file'], 'r')
self._build_nodes = yaml.safe_load(f.read())
self._build_phase = ['input', 'param', 'summary', 'layer']
self._build_param = {'layer': {}, 'inner_layer': {}, 'layer_extend': {}, 'model': {}}
self._build_param = {
'layer': {},
'inner_layer': {},
'layer_extend': {},
'model': {}
}
self._inference_meta = {'dependency': {}, 'params': {}}
def train_net(self):
......@@ -76,10 +82,12 @@ class YamlModel(Model):
if self._build_nodes[phase] is None:
continue
for node in self._build_nodes[phase]:
exec("""layer=layer.{}(node)""".format(node['class']))
layer_output, extend_output = layer.generate(self._config['mode'], self._build_param)
exec ("""layer=layer.{}(node)""".format(node['class']))
layer_output, extend_output = layer.generate(
self._config['mode'], self._build_param)
self._build_param['layer'][node['name']] = layer_output
self._build_param['layer_extend'][node['name']] = extend_output
self._build_param['layer_extend'][node[
'name']] = extend_output
if extend_output is None:
continue
if 'loss' in extend_output:
......@@ -89,17 +97,24 @@ class YamlModel(Model):
self._cost += extend_output['loss']
if 'data_var' in extend_output:
self._data_var += extend_output['data_var']
if 'metric_label' in extend_output and extend_output['metric_label'] is not None:
self._metrics[extend_output['metric_label']] = extend_output['metric_dict']
if 'metric_label' in extend_output and extend_output[
'metric_label'] is not None:
self._metrics[extend_output[
'metric_label']] = extend_output['metric_dict']
if 'inference_param' in extend_output:
inference_param = extend_output['inference_param']
param_name = inference_param['name']
if param_name not in self._build_param['table']:
self._build_param['table'][param_name] = {'params': []}
table_meta = table.TableMeta.alloc_new_table(inference_param['table_id'])
self._build_param['table'][param_name]['_meta'] = table_meta
self._build_param['table'][param_name]['params'] += inference_param['params']
self._build_param['table'][param_name] = {
'params': []
}
table_meta = table.TableMeta.alloc_new_table(
inference_param['table_id'])
self._build_param['table'][param_name][
'_meta'] = table_meta
self._build_param['table'][param_name][
'params'] += inference_param['params']
pass
@classmethod
......@@ -114,20 +129,25 @@ class YamlModel(Model):
metrics = params['metrics']
for name in metrics:
model_metrics = metrics[name]
stat_var_names += [model_metrics[metric]['var'].name for metric in model_metrics]
stat_var_names += [
model_metrics[metric]['var'].name
for metric in model_metrics
]
strategy['stat_var_names'] = list(set(stat_var_names))
optimizer_generator = 'optimizer = fluid.optimizer.' + optimizer_conf['class'] + \
'(learning_rate=' + str(optimizer_conf['learning_rate']) + ')'
exec(optimizer_generator)
exec (optimizer_generator)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
return optimizer
def dump_model_program(self, path):
"""R
"""
with open(path + '/' + self._name + '_main_program.pbtxt', "w") as fout:
with open(path + '/' + self._name + '_main_program.pbtxt',
"w") as fout:
print >> fout, self._build_param['model']['train_program']
with open(path + '/' + self._name + '_startup_program.pbtxt', "w") as fout:
with open(path + '/' + self._name + '_startup_program.pbtxt',
"w") as fout:
print >> fout, self._build_param['model']['startup_program']
pass
......@@ -137,7 +157,8 @@ class YamlModel(Model):
scope = params['scope']
decay = params['decay']
for param_table in self._build_param['table']:
table_id = self._build_param['table'][param_table]['_meta']._table_id
table_id = self._build_param['table'][param_table][
'_meta']._table_id
fleet.shrink_dense_table(decay, scope=scope, table_id=table_id)
def dump_inference_program(self, inference_layer, path):
......@@ -152,17 +173,25 @@ class YamlModel(Model):
executor = params['executor']
program = self._build_param['model']['train_program']
for table_name, table in self._build_param['table'].items():
fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id, table['params'])
fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id,
table['params'])
for infernce_item in params['inference_list']:
params_name_list = self.inference_params(infernce_item['layer_name'])
params_var_list = [program.global_block().var(i) for i in params_name_list]
params_name_list = self.inference_params(infernce_item[
'layer_name'])
params_var_list = [
program.global_block().var(i) for i in params_name_list
]
params_file_name = infernce_item['save_file_name']
with fluid.scope_guard(scope):
if params['save_combine']:
fluid.io.save_vars(executor, "./", \
program, vars=params_var_list, filename=params_file_name)
else:
fluid.io.save_vars(executor, params_file_name, program, vars=params_var_list)
fluid.io.save_vars(
executor,
params_file_name,
program,
vars=params_var_list)
def inference_params(self, inference_layer):
"""
......@@ -177,11 +206,13 @@ class YamlModel(Model):
return self._inference_meta['params'][layer]
self._inference_meta['params'][layer] = []
self._inference_meta['dependency'][layer] = self.get_dependency(self._build_param['inner_layer'], layer)
self._inference_meta['dependency'][layer] = self.get_dependency(
self._build_param['inner_layer'], layer)
for node in self._build_nodes['layer']:
if node['name'] not in self._inference_meta['dependency'][layer]:
continue
if 'inference_param' in self._build_param['layer_extend'][node['name']]:
if 'inference_param' in self._build_param['layer_extend'][node[
'name']]:
self._inference_meta['params'][layer] += \
self._build_param['layer_extend'][node['name']]['inference_param']['params']
return self._inference_meta['params'][layer]
......@@ -199,5 +230,6 @@ class YamlModel(Model):
dependencys = copy.deepcopy(layer_graph[dest_layer]['input'])
dependency_list = copy.deepcopy(dependencys)
for dependency in dependencys:
dependency_list = dependency_list + self.get_dependency(layer_graph, dependency)
dependency_list = dependency_list + self.get_dependency(
layer_graph, dependency)
return list(set(dependency_list))
......@@ -18,7 +18,7 @@ from paddlerec.core.layer import Layer
class EmbeddingFuseLayer(Layer):
"""R
"""embedding + sequence + concat
"""
def __init__(self, config):
......@@ -40,7 +40,8 @@ class EmbeddingFuseLayer(Layer):
show_clk.stop_gradient = True
data_var = []
for slot in self._slots:
l = fluid.layers.data(name=slot, shape=[1], dtype="int64", lod_level=1)
l = fluid.layers.data(
name=slot, shape=[1], dtype="int64", lod_level=1)
data_var.append(l)
emb = fluid.layers.embedding(input=l, size=[10, self._emb_dim], \
is_sparse=True, is_distributed=True,
......@@ -48,7 +49,8 @@ class EmbeddingFuseLayer(Layer):
emb = fluid.layers.sequence_pool(input=emb, pool_type='sum')
emb = fluid.layers.continuous_value_model(emb, show_clk, self._cvm)
self._emb_layers.append(emb)
output = fluid.layers.concat(input=self._emb_layers, axis=1, name=self._name)
output = fluid.layers.concat(
input=self._emb_layers, axis=1, name=self._name)
return output, {'data_var': data_var}
......@@ -111,7 +113,13 @@ class ParamLayer(Layer):
def generate(self, param):
"""R
"""
return self._config, {'inference_param': {'name': 'param', 'params': [], 'table_id': self._table_id}}
return self._config, {
'inference_param': {
'name': 'param',
'params': [],
'table_id': self._table_id
}
}
class SummaryLayer(Layer):
......@@ -129,10 +137,16 @@ class SummaryLayer(Layer):
def generate(self, param):
"""R
"""
return self._config, {'inference_param': {'name': 'summary', 'params': [], 'table_id': self._table_id}}
return self._config, {
'inference_param': {
'name': 'summary',
'params': [],
'table_id': self._table_id
}
}
class NormalizetionLayer(Layer):
class NormalizationLayer(Layer):
"""R
"""
......@@ -152,9 +166,19 @@ class NormalizetionLayer(Layer):
if len(self._input) > 0:
input_list = [param['layer'][i] for i in self._input]
input_layer = fluid.layers.concat(input=input_list, axis=1)
bn = fluid.layers.data_norm(input=input_layer, name=self._name, epsilon=1e-4, param_attr={
"batch_size": 1e4, "batch_sum_default": 0.0, "batch_square": 1e4})
inference_param = [self._name + '.batch_size', self._name + '.batch_sum', self._name + '.batch_square_sum']
bn = fluid.layers.data_norm(
input=input_layer,
name=self._name,
epsilon=1e-4,
param_attr={
"batch_size": 1e4,
"batch_sum_default": 0.0,
"batch_square": 1e4
})
inference_param = [
self._name + '.batch_size', self._name + '.batch_sum',
self._name + '.batch_square_sum'
]
return bn, {'inference_param': {'name': 'summary', \
'params': inference_param, 'table_id': summary_layer.get('table_id', -1)}}
......@@ -181,11 +205,13 @@ class FCLayer(Layer):
input_list = [param['layer'][i] for i in self._input]
input_layer = fluid.layers.concat(input=input_list, axis=1)
input_coln = input_layer.shape[1]
scale = param_layer['init_range'] / (input_coln ** 0.5)
scale = param_layer['init_range'] / (input_coln**0.5)
bias = None
if self._bias:
bias = fluid.ParamAttr(learning_rate=1.0,
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=scale))
bias = fluid.ParamAttr(
learning_rate=1.0,
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale))
fc = fluid.layers.fc(
name=self._name,
input=input_layer,
......@@ -216,18 +242,46 @@ class LogLossLayer(Layer):
self._extend_output = {
'metric_label': self._metric_label,
'metric_dict': {
'auc': {'var': None},
'batch_auc': {'var': None},
'stat_pos': {'var': None, 'data_type': 'int64'},
'stat_neg': {'var': None, 'data_type': 'int64'},
'batch_stat_pos': {'var': None, 'data_type': 'int64'},
'batch_stat_neg': {'var': None, 'data_type': 'int64'},
'pos_ins_num': {'var': None},
'abserr': {'var': None},
'sqrerr': {'var': None},
'prob': {'var': None},
'total_ins_num': {'var': None},
'q': {'var': None}
'auc': {
'var': None
},
'batch_auc': {
'var': None
},
'stat_pos': {
'var': None,
'data_type': 'int64'
},
'stat_neg': {
'var': None,
'data_type': 'int64'
},
'batch_stat_pos': {
'var': None,
'data_type': 'int64'
},
'batch_stat_neg': {
'var': None,
'data_type': 'int64'
},
'pos_ins_num': {
'var': None
},
'abserr': {
'var': None
},
'sqrerr': {
'var': None
},
'prob': {
'var': None
},
'total_ins_num': {
'var': None
},
'q': {
'var': None
}
}
}
......@@ -236,9 +290,12 @@ class LogLossLayer(Layer):
"""
input_layer = param['layer'][self._input[0]]
label_layer = param['layer'][self._label]
output = fluid.layers.clip(input_layer, self._bound[0], self._bound[1], name=self._name)
output = fluid.layers.clip(
input_layer, self._bound[0], self._bound[1], name=self._name)
norm = fluid.layers.sigmoid(output, name=self._name)
output = fluid.layers.log_loss(norm, fluid.layers.cast(x=label_layer, dtype='float32'))
output = fluid.layers.log_loss(
norm, fluid.layers.cast(
x=label_layer, dtype='float32'))
if self._weight:
weight_layer = param['layer'][self._weight]
output = fluid.layers.elementwise_mul(output, weight_layer)
......@@ -248,7 +305,11 @@ class LogLossLayer(Layer):
# For AUC Metric
metric = self._extend_output['metric_dict']
binary_predict = fluid.layers.concat(
input=[fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm), norm], axis=1)
input=[
fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm),
norm
],
axis=1)
metric['auc']['var'], metric['batch_auc']['var'], [metric['batch_stat_pos']['var'], \
metric['batch_stat_neg']['var'], metric['stat_pos']['var'],
metric['stat_neg']['var']] = \
......
......@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import abc
......@@ -44,3 +45,65 @@ class Reader(dg.MultiSlotDataGenerator):
@abc.abstractmethod
def generate_sample(self, line):
pass
class SlotReader(dg.MultiSlotDataGenerator):
__metaclass__ = abc.ABCMeta
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
if os.path.isfile(config):
with open(config, 'r') as rb:
_config = yaml.load(rb.read(), Loader=yaml.FullLoader)
else:
raise ValueError("reader config only support yaml")
envs.set_global_envs(_config)
envs.update_workspace()
def init(self, sparse_slots, dense_slots, padding=0):
from operator import mul
self.sparse_slots = sparse_slots.strip().split(" ")
self.dense_slots = dense_slots.strip().split(" ")
self.dense_slots_shape = [
reduce(mul,
[int(j) for j in i.split(":")[1].strip("[]").split(",")])
for i in self.dense_slots
]
self.dense_slots = [i.split(":")[0] for i in self.dense_slots]
self.slots = self.dense_slots + self.sparse_slots
self.slot2index = {}
self.visit = {}
for i in range(len(self.slots)):
self.slot2index[self.slots[i]] = i
self.visit[self.slots[i]] = False
self.padding = padding
def generate_sample(self, l):
def reader():
line = l.strip().split(" ")
output = [(i, []) for i in self.slots]
for i in line:
slot_feasign = i.split(":")
slot = slot_feasign[0]
if slot not in self.slots:
continue
if slot in self.sparse_slots:
feasign = int(slot_feasign[1])
else:
feasign = float(slot_feasign[1])
output[self.slot2index[slot]][1].append(feasign)
self.visit[slot] = True
for i in self.visit:
slot = i
if not self.visit[slot]:
if i in self.dense_slots:
output[self.slot2index[i]][1].extend(
[self.padding] *
self.dense_slots_shape[self.slot2index[i]])
else:
output[self.slot2index[i]][1].extend([self.padding])
else:
self.visit[slot] = False
yield output
return reader
......@@ -30,8 +30,10 @@ class Trainer(object):
def __init__(self, config=None):
self._status_processor = {}
self._place = fluid.CPUPlace()
self._exe = fluid.Executor(self._place)
self._exector_context = {}
self._context = {'status': 'uninit', 'is_exit': False}
self._config_yaml = config
......@@ -95,6 +97,6 @@ def user_define_engine(engine_yaml):
train_dirname = os.path.dirname(train_location)
base_name = os.path.splitext(os.path.basename(train_location))[0]
sys.path.append(train_dirname)
trainer_class = envs.lazy_instance_by_fliename(
base_name, "UserDefineTraining")
trainer_class = envs.lazy_instance_by_fliename(base_name,
"UserDefineTraining")
return trainer_class
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
trainer implement.
......@@ -22,5 +21,3 @@ Trainer
↘ (for online learning training) OnlineLearningTrainer
"""
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -43,11 +42,14 @@ class ClusterTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
if envs.get_platform() == "LINUX" and envs.get_global_env(
"dataset_class", None, "train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train)
else:
self.regist_context_processor(
'train_pass', self.dataloader_train)
self.regist_context_processor('train_pass',
self.dataloader_train)
self.regist_context_processor('infer_pass', self.infer)
self.regist_context_processor('terminal_pass', self.terminal)
......@@ -75,8 +77,8 @@ class ClusterTrainer(TranspileTrainer):
def init(self, context):
self.model.train_net()
optimizer = self.model.optimizer()
optimizer_name = envs.get_global_env(
"hyper_parameters.optimizer", None, "train.model")
optimizer_name = envs.get_global_env("hyper_parameters.optimizer",
None, "train.model")
if optimizer_name not in ["", "sgd", "SGD", "Sgd"]:
os.environ["FLAGS_communicator_is_sgd_optimizer"] = '0'
......@@ -114,9 +116,9 @@ class ClusterTrainer(TranspileTrainer):
program = fluid.compiler.CompiledProgram(
fleet.main_program).with_data_parallel(
loss_name=self.model.get_avg_cost().name,
build_strategy=self.strategy.get_build_strategy(),
exec_strategy=self.strategy.get_execute_strategy())
loss_name=self.model.get_avg_cost().name,
build_strategy=self.strategy.get_build_strategy(),
exec_strategy=self.strategy.get_execute_strategy())
metrics_varnames = []
metrics_format = []
......@@ -135,9 +137,8 @@ class ClusterTrainer(TranspileTrainer):
batch_id = 0
try:
while True:
metrics_rets = self._exe.run(
program=program,
fetch_list=metrics_varnames)
metrics_rets = self._exe.run(program=program,
fetch_list=metrics_varnames)
metrics = [epoch, batch_id]
metrics.extend(metrics_rets)
......@@ -162,14 +163,16 @@ class ClusterTrainer(TranspileTrainer):
for i in range(epochs):
begin_time = time.time()
self._exe.train_from_dataset(program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
self._exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time()
times = end_time-begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times))
times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(
i, times, ins / times))
self.save(i, "train", is_fleet=True)
fleet.stop_worker()
......
......@@ -59,8 +59,10 @@ class CtrTrainer(Trainer):
reader_class = envs.get_global_env("class", None, namespace)
abs_dir = os.path.dirname(os.path.abspath(__file__))
reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN", self._config_yaml)
train_data_path = envs.get_global_env("train_data_path", None, namespace)
pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN",
self._config_yaml)
train_data_path = envs.get_global_env("train_data_path", None,
namespace)
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(inputs)
......@@ -87,7 +89,8 @@ class CtrTrainer(Trainer):
self.model.train_net()
optimizer = self.model.optimizer()
optimizer = fleet.distributed_optimizer(optimizer, strategy={"use_cvm": False})
optimizer = fleet.distributed_optimizer(
optimizer, strategy={"use_cvm": False})
optimizer.minimize(self.model.get_avg_cost())
if fleet.is_server():
......@@ -118,16 +121,18 @@ class CtrTrainer(Trainer):
gs = shuf * 0
fleet._role_maker._node_type_comm.Allreduce(shuf, gs)
print("trainer id: {}, trainers: {}, gs: {}".format(fleet.worker_index(), fleet.worker_num(), gs))
print("trainer id: {}, trainers: {}, gs: {}".format(fleet.worker_index(
), fleet.worker_num(), gs))
epochs = envs.get_global_env("train.epochs")
for i in range(epochs):
self._exe.train_from_dataset(program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
self._exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
context['status'] = 'terminal_pass'
fleet.stop_worker()
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import json
import sys
......@@ -23,7 +22,6 @@ import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
from paddlerec.core.utils import fs as fs
from paddlerec.core.utils import util as util
from paddlerec.core.metrics.auc_metrics import AUCMetric
......@@ -80,20 +78,31 @@ class CtrTrainer(Trainer):
"""R
"""
Trainer.__init__(self, config)
config['output_path'] = util.get_absolute_path(
config['output_path'], config['io']['afs'])
config['output_path'] = util.get_absolute_path(config['output_path'],
config['io']['afs'])
self.global_config = config
self._metrics = {}
self._path_generator = util.PathGenerator({
'templates': [
{'name': 'xbox_base_done', 'template': config['output_path'] + '/xbox_base_done.txt'},
{'name': 'xbox_delta_done', 'template': config['output_path'] + '/xbox_patch_done.txt'},
{'name': 'xbox_base', 'template': config['output_path'] + '/xbox/{day}/base/'},
{'name': 'xbox_delta', 'template': config['output_path'] + '/xbox/{day}/delta-{pass_id}/'},
{'name': 'batch_model', 'template': config['output_path'] + '/batch_model/{day}/{pass_id}/'}
]
'templates': [{
'name': 'xbox_base_done',
'template': config['output_path'] + '/xbox_base_done.txt'
}, {
'name': 'xbox_delta_done',
'template': config['output_path'] + '/xbox_patch_done.txt'
}, {
'name': 'xbox_base',
'template': config['output_path'] + '/xbox/{day}/base/'
}, {
'name': 'xbox_delta',
'template':
config['output_path'] + '/xbox/{day}/delta-{pass_id}/'
}, {
'name': 'batch_model',
'template':
config['output_path'] + '/batch_model/{day}/{pass_id}/'
}]
})
if 'path_generator' in config:
self._path_generator.add_path_template(config['path_generator'])
......@@ -111,9 +120,11 @@ class CtrTrainer(Trainer):
if self.global_config.get('process_mode', 'mpi') == 'brilliant_cpu':
afs_config = self.global_config['io']['afs']
role_maker = GeneralRoleMaker(
hdfs_name=afs_config['fs_name'], hdfs_ugi=afs_config['fs_ugi'],
hdfs_name=afs_config['fs_name'],
hdfs_ugi=afs_config['fs_ugi'],
path=self.global_config['output_path'] + "/gloo",
init_timeout_seconds=1200, run_timeout_seconds=1200)
init_timeout_seconds=1200,
run_timeout_seconds=1200)
fleet.init(role_maker)
data_var_list = []
data_var_name_dict = {}
......@@ -125,7 +136,8 @@ class CtrTrainer(Trainer):
scope = fluid.Scope()
self._exector_context[executor['name']] = {}
self._exector_context[executor['name']]['scope'] = scope
self._exector_context[executor['name']]['model'] = model_basic.create(executor)
self._exector_context[executor['name']][
'model'] = model_basic.create(executor)
model = self._exector_context[executor['name']]['model']
self._metrics.update(model.get_metrics())
runnnable_scope.append(scope)
......@@ -146,9 +158,12 @@ class CtrTrainer(Trainer):
model = self._exector_context[executor['name']]['model']
program = model._build_param['model']['train_program']
if not executor['is_update_sparse']:
program._fleet_opt["program_configs"][str(id(model.get_avg_cost().block.program))]["push_sparse"] = []
program._fleet_opt["program_configs"][str(
id(model.get_avg_cost().block.program))][
"push_sparse"] = []
if 'train_thread_num' not in executor:
executor['train_thread_num'] = self.global_config['train_thread_num']
executor['train_thread_num'] = self.global_config[
'train_thread_num']
with fluid.scope_guard(scope):
self._exe.run(model._build_param['model']['startup_program'])
model.dump_model_program('./')
......@@ -162,7 +177,8 @@ class CtrTrainer(Trainer):
dataset_item['data_vars'] = data_var_list
dataset_item.update(self.global_config['io']['afs'])
dataset_item["batch_size"] = self.global_config['batch_size']
self._dataset[dataset_item['name']] = dataset.FluidTimeSplitDataset(dataset_item)
self._dataset[dataset_item[
'name']] = dataset.FluidTimeSplitDataset(dataset_item)
# if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= last_day and config.reqi_dnn_plugin_pass >= last_pass:
# util.reqi_changeslot(config.hdfs_dnn_plugin_path, join_save_params, common_save_params, update_save_params, scope2, scope3)
fleet.init_worker()
......@@ -190,23 +206,30 @@ class CtrTrainer(Trainer):
metric_param = {'label': metric, 'metric_dict': metrics[metric]}
metric_calculator.calculate(scope, metric_param)
metric_result = metric_calculator.get_result_to_string()
self.print_log(metric_result, {'master': True, 'stdout': stdout_str})
self.print_log(metric_result,
{'master': True,
'stdout': stdout_str})
monitor_data += metric_result
metric_calculator.clear(scope, metric_param)
def save_model(self, day, pass_index, base_key):
"""R
"""
cost_printer = util.CostPrinter(util.print_cost,
{'master': True, 'log_format': 'save model cost %s sec'})
model_path = self._path_generator.generate_path('batch_model', {'day': day, 'pass_id': pass_index})
cost_printer = util.CostPrinter(util.print_cost, {
'master': True,
'log_format': 'save model cost %s sec'
})
model_path = self._path_generator.generate_path(
'batch_model', {'day': day,
'pass_id': pass_index})
save_mode = 0 # just save all
if pass_index < 1: # batch_model
save_mode = 3 # unseen_day++, save all
util.rank0_print("going to save_model %s" % model_path)
fleet.save_persistables(None, model_path, mode=save_mode)
if fleet._role_maker.is_first_worker():
self._train_pass.save_train_progress(day, pass_index, base_key, model_path, is_checkpoint=True)
self._train_pass.save_train_progress(
day, pass_index, base_key, model_path, is_checkpoint=True)
cost_printer.done()
return model_path
......@@ -225,46 +248,58 @@ class CtrTrainer(Trainer):
if pass_index < 1:
save_mode = 2
xbox_patch_id = xbox_base_key
model_path = self._path_generator.generate_path('xbox_base', {'day': day})
xbox_model_donefile = self._path_generator.generate_path('xbox_base_done', {'day': day})
model_path = self._path_generator.generate_path('xbox_base',
{'day': day})
xbox_model_donefile = self._path_generator.generate_path(
'xbox_base_done', {'day': day})
else:
save_mode = 1
model_path = self._path_generator.generate_path('xbox_delta', {'day': day, 'pass_id': pass_index})
xbox_model_donefile = self._path_generator.generate_path('xbox_delta_done', {'day': day})
total_save_num = fleet.save_persistables(None, model_path, mode=save_mode)
model_path = self._path_generator.generate_path(
'xbox_delta', {'day': day,
'pass_id': pass_index})
xbox_model_donefile = self._path_generator.generate_path(
'xbox_delta_done', {'day': day})
total_save_num = fleet.save_persistables(
None, model_path, mode=save_mode)
cost_printer.done()
cost_printer = util.CostPrinter(util.print_cost, {'master': True,
'log_format': 'save cache model cost %s sec',
'stdout': stdout_str})
cost_printer = util.CostPrinter(util.print_cost, {
'master': True,
'log_format': 'save cache model cost %s sec',
'stdout': stdout_str
})
model_file_handler = fs.FileHandler(self.global_config['io']['afs'])
if self.global_config['save_cache_model']:
cache_save_num = fleet.save_cache_model(None, model_path, mode=save_mode)
cache_save_num = fleet.save_cache_model(
None, model_path, mode=save_mode)
model_file_handler.write(
"file_prefix:part\npart_num:16\nkey_num:%d\n" % cache_save_num,
model_path + '/000_cache/sparse_cache.meta', 'w')
cost_printer.done()
util.rank0_print("save xbox cache model done, key_num=%s" % cache_save_num)
util.rank0_print("save xbox cache model done, key_num=%s" %
cache_save_num)
save_env_param = {
'executor': self._exe,
'save_combine': True
}
cost_printer = util.CostPrinter(util.print_cost, {'master': True,
'log_format': 'save dense model cost %s sec',
'stdout': stdout_str})
save_env_param = {'executor': self._exe, 'save_combine': True}
cost_printer = util.CostPrinter(util.print_cost, {
'master': True,
'log_format': 'save dense model cost %s sec',
'stdout': stdout_str
})
if fleet._role_maker.is_first_worker():
for executor in self.global_config['executor']:
if 'layer_for_inference' not in executor:
continue
executor_name = executor['name']
model = self._exector_context[executor_name]['model']
save_env_param['inference_list'] = executor['layer_for_inference']
save_env_param['scope'] = self._exector_context[executor_name]['scope']
save_env_param['inference_list'] = executor[
'layer_for_inference']
save_env_param['scope'] = self._exector_context[executor_name][
'scope']
model.dump_inference_param(save_env_param)
for dnn_layer in executor['layer_for_inference']:
model_file_handler.cp(dnn_layer['save_file_name'],
model_path + '/dnn_plugin/' + dnn_layer['save_file_name'])
model_path + '/dnn_plugin/' +
dnn_layer['save_file_name'])
fleet._role_maker._barrier_worker()
cost_printer.done()
......@@ -282,9 +317,15 @@ class CtrTrainer(Trainer):
"job_name": util.get_env_value("JOB_NAME")
}
if fleet._role_maker.is_first_worker():
model_file_handler.write(json.dumps(xbox_done_info) + "\n", xbox_model_donefile, 'a')
model_file_handler.write(
json.dumps(xbox_done_info) + "\n", xbox_model_donefile, 'a')
if pass_index > 0:
self._train_pass.save_train_progress(day, pass_index, xbox_base_key, model_path, is_checkpoint=False)
self._train_pass.save_train_progress(
day,
pass_index,
xbox_base_key,
model_path,
is_checkpoint=False)
fleet._role_maker._barrier_worker()
return stdout_str
......@@ -301,21 +342,28 @@ class CtrTrainer(Trainer):
util.rank0_print("Begin " + executor_name + " pass")
begin = time.time()
program = model._build_param['model']['train_program']
self._exe.train_from_dataset(program, dataset, scope,
thread=executor_config['train_thread_num'], debug=self.global_config['debug'])
self._exe.train_from_dataset(
program,
dataset,
scope,
thread=executor_config['train_thread_num'],
debug=self.global_config['debug'])
end = time.time()
local_cost = (end - begin) / 60.0
avg_cost = worker_numric_avg(local_cost)
min_cost = worker_numric_min(local_cost)
max_cost = worker_numric_max(local_cost)
util.rank0_print("avg train time %s mins, min %s mins, max %s mins" % (avg_cost, min_cost, max_cost))
util.rank0_print("avg train time %s mins, min %s mins, max %s mins"
% (avg_cost, min_cost, max_cost))
self._exector_context[executor_name]['cost'] = max_cost
monitor_data = ""
self.print_global_metrics(scope, model, monitor_data, stdout_str)
util.rank0_print("End " + executor_name + " pass")
if self._train_pass.need_dump_inference(pass_id) and executor_config['dump_inference_model']:
stdout_str += self.save_xbox_model(day, pass_id, xbox_base_key, monitor_data)
if self._train_pass.need_dump_inference(
pass_id) and executor_config['dump_inference_model']:
stdout_str += self.save_xbox_model(day, pass_id, xbox_base_key,
monitor_data)
fleet._role_maker._barrier_worker()
def startup(self, context):
......@@ -328,10 +376,14 @@ class CtrTrainer(Trainer):
stdout_str = ""
self._train_pass = util.TimeTrainPass(self.global_config)
if not self.global_config['cold_start']:
cost_printer = util.CostPrinter(util.print_cost,
{'master': True, 'log_format': 'load model cost %s sec',
'stdout': stdout_str})
self.print_log("going to load model %s" % self._train_pass._checkpoint_model_path, {'master': True})
cost_printer = util.CostPrinter(util.print_cost, {
'master': True,
'log_format': 'load model cost %s sec',
'stdout': stdout_str
})
self.print_log("going to load model %s" %
self._train_pass._checkpoint_model_path,
{'master': True})
# if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= self._train_pass.date()
# and config.reqi_dnn_plugin_pass >= self._pass_id:
# fleet.load_one_table(0, self._train_pass._checkpoint_model_path)
......@@ -340,9 +392,12 @@ class CtrTrainer(Trainer):
cost_printer.done()
if self.global_config['save_first_base']:
self.print_log("save_first_base=True", {'master': True})
self.print_log("going to save xbox base model", {'master': True, 'stdout': stdout_str})
self.print_log("going to save xbox base model",
{'master': True,
'stdout': stdout_str})
self._train_pass._base_key = int(time.time())
stdout_str += self.save_xbox_model(self._train_pass.date(), 0, self._train_pass._base_key, "")
stdout_str += self.save_xbox_model(self._train_pass.date(), 0,
self._train_pass._base_key, "")
context['status'] = 'begin_day'
def begin_day(self, context):
......@@ -353,7 +408,9 @@ class CtrTrainer(Trainer):
context['is_exit'] = True
day = self._train_pass.date()
pass_id = self._train_pass._pass_id
self.print_log("======== BEGIN DAY:%s ========" % day, {'master': True, 'stdout': stdout_str})
self.print_log("======== BEGIN DAY:%s ========" % day,
{'master': True,
'stdout': stdout_str})
if pass_id == self._train_pass.max_pass_num_day():
context['status'] = 'end_day'
else:
......@@ -368,8 +425,10 @@ class CtrTrainer(Trainer):
context['status'] = 'begin_day'
util.rank0_print("shrink table")
cost_printer = util.CostPrinter(util.print_cost,
{'master': True, 'log_format': 'shrink table done, cost %s sec'})
cost_printer = util.CostPrinter(util.print_cost, {
'master': True,
'log_format': 'shrink table done, cost %s sec'
})
fleet.shrink_sparse_table()
for executor in self._exector_context:
self._exector_context[executor]['model'].shrink({
......@@ -394,7 +453,9 @@ class CtrTrainer(Trainer):
pass_id = self._train_pass._pass_id
base_key = self._train_pass._base_key
pass_time = self._train_pass._current_train_time.strftime("%Y%m%d%H%M")
self.print_log(" ==== begin delta:%s ========" % pass_id, {'master': True, 'stdout': stdout_str})
self.print_log(" ==== begin delta:%s ========" % pass_id,
{'master': True,
'stdout': stdout_str})
train_begin_time = time.time()
cost_printer = util.CostPrinter(util.print_cost, \
......@@ -403,35 +464,46 @@ class CtrTrainer(Trainer):
current_dataset = {}
for name in self._dataset:
current_dataset[name] = self._dataset[name].load_dataset({
'node_num': fleet.worker_num(), 'node_idx': fleet.worker_index(),
'begin_time': pass_time, 'time_window_min': self._train_pass._interval_per_pass
'node_num': fleet.worker_num(),
'node_idx': fleet.worker_index(),
'begin_time': pass_time,
'time_window_min': self._train_pass._interval_per_pass
})
fleet._role_maker._barrier_worker()
cost_printer.done()
util.rank0_print("going to global shuffle")
cost_printer = util.CostPrinter(util.print_cost, {
'master': True, 'stdout': stdout_str,
'log_format': 'global shuffle done, cost %s sec'})
'master': True,
'stdout': stdout_str,
'log_format': 'global shuffle done, cost %s sec'
})
for name in current_dataset:
current_dataset[name].global_shuffle(fleet, self.global_config['dataset']['shuffle_thread'])
current_dataset[name].global_shuffle(
fleet, self.global_config['dataset']['shuffle_thread'])
cost_printer.done()
# str(dataset.get_shuffle_data_size(fleet))
fleet._role_maker._barrier_worker()
if self.global_config['prefetch_data']:
next_pass_time = (self._train_pass._current_train_time +
datetime.timedelta(minutes=self._train_pass._interval_per_pass)).strftime("%Y%m%d%H%M")
next_pass_time = (
self._train_pass._current_train_time + datetime.timedelta(
minutes=self._train_pass._interval_per_pass)
).strftime("%Y%m%d%H%M")
for name in self._dataset:
self._dataset[name].preload_dataset({
'node_num': fleet.worker_num(), 'node_idx': fleet.worker_index(),
'begin_time': next_pass_time, 'time_window_min': self._train_pass._interval_per_pass
'node_num': fleet.worker_num(),
'node_idx': fleet.worker_index(),
'begin_time': next_pass_time,
'time_window_min': self._train_pass._interval_per_pass
})
fleet._role_maker._barrier_worker()
pure_train_begin = time.time()
for executor in self.global_config['executor']:
self.run_executor(executor, current_dataset[executor['dataset_name']], stdout_str)
self.run_executor(executor,
current_dataset[executor['dataset_name']],
stdout_str)
cost_printer = util.CostPrinter(util.print_cost, \
{'master': True, 'log_format': 'release_memory cost %s sec'})
for name in current_dataset:
......@@ -444,9 +516,11 @@ class CtrTrainer(Trainer):
train_end_time = time.time()
train_cost = train_end_time - train_begin_time
other_cost = train_cost - pure_train_cost
log_str = "finished train day %s pass %s time cost:%s sec job time cost:" % (day, pass_id, train_cost)
log_str = "finished train day %s pass %s time cost:%s sec job time cost:" % (
day, pass_id, train_cost)
for executor in self._exector_context:
log_str += '[' + executor + ':' + str(self._exector_context[executor]['cost']) + ']'
log_str += '[' + executor + ':' + str(self._exector_context[
executor]['cost']) + ']'
log_str += '[other_cost:' + str(other_cost) + ']'
util.rank0_print(log_str)
stdout_str += util.now_time_str() + log_str
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -44,11 +43,14 @@ class OnlineLearningTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader":
if envs.get_platform() == "LINUX" and envs.get_global_env(
"dataset_class", None, "train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train)
else:
self.regist_context_processor(
'train_pass', self.dataloader_train)
self.regist_context_processor('train_pass',
self.dataloader_train)
self.regist_context_processor('infer_pass', self.infer)
self.regist_context_processor('terminal_pass', self.terminal)
......@@ -110,27 +112,27 @@ class OnlineLearningTrainer(TranspileTrainer):
if state == "TRAIN":
inputs = self.model.get_inputs()
namespace = "train.reader"
train_data_path = envs.get_global_env(
"train_data_path", None, namespace)
train_data_path = envs.get_global_env("train_data_path", None,
namespace)
else:
inputs = self.model.get_infer_inputs()
namespace = "evaluate.reader"
train_data_path = envs.get_global_env(
"test_data_path", None, namespace)
train_data_path = envs.get_global_env("test_data_path", None,
namespace)
threads = int(envs.get_runtime_environ("train.trainer.threads"))
batch_size = envs.get_global_env("batch_size", None, namespace)
reader_class = envs.get_global_env("class", None, namespace)
abs_dir = os.path.dirname(os.path.abspath(__file__))
reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
pipe_cmd = "python {} {} {} {}".format(
reader, reader_class, state, self._config_yaml)
pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state,
self._config_yaml)
if train_data_path.startswith("paddlerec::"):
package_base = envs.get_runtime_environ("PACKAGE_BASE")
assert package_base is not None
train_data_path = os.path.join(
package_base, train_data_path.split("::")[1])
train_data_path = os.path.join(package_base,
train_data_path.split("::")[1])
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(inputs)
......@@ -166,14 +168,16 @@ class OnlineLearningTrainer(TranspileTrainer):
ins = self._get_dataset_ins()
begin_time = time.time()
self._exe.train_from_dataset(program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
self._exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time()
times = end_time-begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times))
times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(
i, times, ins / times))
self.save(i, "train", is_fleet=True)
fleet.stop_worker()
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -36,8 +35,9 @@ class SingleTrainer(TranspileTrainer):
self.regist_context_processor('uninit', self.instance)
self.regist_context_processor('init_pass', self.init)
self.regist_context_processor('startup_pass', self.startup)
if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None,
"train.reader") != "DataLoader":
if envs.get_platform() == "LINUX" and envs.get_global_env(
"dataset_class", None, "train.reader") != "DataLoader":
self.regist_context_processor('train_pass', self.dataset_train)
else:
self.regist_context_processor('train_pass', self.dataloader_train)
......@@ -73,9 +73,8 @@ class SingleTrainer(TranspileTrainer):
reader = self._get_dataloader("TRAIN")
epochs = envs.get_global_env("train.epochs")
program = fluid.compiler.CompiledProgram(
fluid.default_main_program()).with_data_parallel(
loss_name=self.model.get_avg_cost().name)
program = fluid.compiler.CompiledProgram(fluid.default_main_program(
)).with_data_parallel(loss_name=self.model.get_avg_cost().name)
metrics_varnames = []
metrics_format = []
......@@ -94,9 +93,8 @@ class SingleTrainer(TranspileTrainer):
batch_id = 0
try:
while True:
metrics_rets = self._exe.run(
program=program,
fetch_list=metrics_varnames)
metrics_rets = self._exe.run(program=program,
fetch_list=metrics_varnames)
metrics = [epoch, batch_id]
metrics.extend(metrics_rets)
......@@ -117,14 +115,16 @@ class SingleTrainer(TranspileTrainer):
epochs = envs.get_global_env("train.epochs")
for i in range(epochs):
begin_time = time.time()
self._exe.train_from_dataset(program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
self._exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=dataset,
fetch_list=self.fetch_vars,
fetch_info=self.fetch_alias,
print_period=self.fetch_period)
end_time = time.time()
times = end_time - begin_time
print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times))
print("epoch {} using time {}, speed {:.2f} lines/s".format(
i, times, ins / times))
self.save(i, "train", is_fleet=False)
context['status'] = 'infer_pass'
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -36,8 +35,8 @@ special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info"]
class TDMClusterTrainer(ClusterTrainer):
def server(self, context):
namespace = "train.startup"
init_model_path = envs.get_global_env(
"cluster.init_model_path", "", namespace)
init_model_path = envs.get_global_env("cluster.init_model_path", "",
namespace)
assert init_model_path != "", "Cluster train must has init_model for TDM"
fleet.init_server(init_model_path)
logger.info("TDM: load model from {}".format(init_model_path))
......@@ -48,24 +47,27 @@ class TDMClusterTrainer(ClusterTrainer):
self._exe.run(fleet.startup_program)
namespace = "train.startup"
load_tree = envs.get_global_env(
"tree.load_tree", True, namespace)
self.tree_layer_path = envs.get_global_env(
"tree.tree_layer_path", "", namespace)
self.tree_travel_path = envs.get_global_env(
"tree.tree_travel_path", "", namespace)
self.tree_info_path = envs.get_global_env(
"tree.tree_info_path", "", namespace)
save_init_model = envs.get_global_env(
"cluster.save_init_model", False, namespace)
init_model_path = envs.get_global_env(
"cluster.init_model_path", "", namespace)
load_tree = envs.get_global_env("tree.load_tree", True, namespace)
self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "",
namespace)
self.tree_travel_path = envs.get_global_env("tree.tree_travel_path",
"", namespace)
self.tree_info_path = envs.get_global_env("tree.tree_info_path", "",
namespace)
save_init_model = envs.get_global_env("cluster.save_init_model", False,
namespace)
init_model_path = envs.get_global_env("cluster.init_model_path", "",
namespace)
if load_tree:
# covert tree to tensor, set it into Fluid's variable.
for param_name in special_param:
param_t = fluid.global_scope().find_var(param_name).get_tensor()
param_t = fluid.global_scope().find_var(param_name).get_tensor(
)
param_array = self._tdm_prepare(param_name)
param_t.set(param_array.astype('int32'), self._place)
......@@ -93,8 +95,8 @@ class TDMClusterTrainer(ClusterTrainer):
def _tdm_travel_prepare(self):
"""load tdm tree param from npy/list file"""
travel_array = np.load(self.tree_travel_path)
logger.info("TDM Tree leaf node nums: {}".format(
travel_array.shape[0]))
logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[
0]))
return travel_array
def _tdm_layer_prepare(self):
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with one node only.
"""
......@@ -27,33 +26,38 @@ from paddlerec.core.utils import envs
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer",
"TDM_Tree_Info", "TDM_Tree_Emb"]
special_param = [
"TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info", "TDM_Tree_Emb"
]
class TDMSingleTrainer(SingleTrainer):
def startup(self, context):
namespace = "train.startup"
load_persistables = envs.get_global_env(
"single.load_persistables", False, namespace)
load_persistables = envs.get_global_env("single.load_persistables",
False, namespace)
persistables_model_path = envs.get_global_env(
"single.persistables_model_path", "", namespace)
load_tree = envs.get_global_env(
"tree.load_tree", False, namespace)
self.tree_layer_path = envs.get_global_env(
"tree.tree_layer_path", "", namespace)
self.tree_travel_path = envs.get_global_env(
"tree.tree_travel_path", "", namespace)
self.tree_info_path = envs.get_global_env(
"tree.tree_info_path", "", namespace)
self.tree_emb_path = envs.get_global_env(
"tree.tree_emb_path", "", namespace)
save_init_model = envs.get_global_env(
"single.save_init_model", False, namespace)
init_model_path = envs.get_global_env(
"single.init_model_path", "", namespace)
load_tree = envs.get_global_env("tree.load_tree", False, namespace)
self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "",
namespace)
self.tree_travel_path = envs.get_global_env("tree.tree_travel_path",
"", namespace)
self.tree_info_path = envs.get_global_env("tree.tree_info_path", "",
namespace)
self.tree_emb_path = envs.get_global_env("tree.tree_emb_path", "",
namespace)
save_init_model = envs.get_global_env("single.save_init_model", False,
namespace)
init_model_path = envs.get_global_env("single.init_model_path", "",
namespace)
self._exe.run(fluid.default_startup_program())
if load_persistables:
......@@ -68,7 +72,8 @@ class TDMSingleTrainer(SingleTrainer):
if load_tree:
# covert tree to tensor, set it into Fluid's variable.
for param_name in special_param:
param_t = fluid.global_scope().find_var(param_name).get_tensor()
param_t = fluid.global_scope().find_var(param_name).get_tensor(
)
param_array = self._tdm_prepare(param_name)
if param_name == 'TDM_Tree_Emb':
param_t.set(param_array.astype('float32'), self._place)
......@@ -102,15 +107,15 @@ class TDMSingleTrainer(SingleTrainer):
def _tdm_travel_prepare(self):
"""load tdm tree param from npy/list file"""
travel_array = np.load(self.tree_travel_path)
logger.info("TDM Tree leaf node nums: {}".format(
travel_array.shape[0]))
logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[
0]))
return travel_array
def _tdm_emb_prepare(self):
"""load tdm tree param from npy/list file"""
emb_array = np.load(self.tree_emb_path)
logger.info("TDM Tree node nums from emb: {}".format(
emb_array.shape[0]))
logger.info("TDM Tree node nums from emb: {}".format(emb_array.shape[
0]))
return emb_array
def _tdm_layer_prepare(self):
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training use fluid with DistributeTranspiler
"""
......@@ -23,6 +22,7 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import f
from paddlerec.core.trainer import Trainer
from paddlerec.core.utils import envs
from paddlerec.core.utils import dataloader_instance
from paddlerec.core.reader import SlotReader
class TranspileTrainer(Trainer):
......@@ -38,9 +38,12 @@ class TranspileTrainer(Trainer):
self.increment_models = []
def processor_register(self):
print("Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first")
print(
"Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first"
)
def _get_dataloader(self, state="TRAIN"):
if state == "TRAIN":
dataloader = self.model._data_loader
namespace = "train.reader"
......@@ -50,14 +53,24 @@ class TranspileTrainer(Trainer):
namespace = "evaluate.reader"
class_name = "EvaluateReader"
sparse_slots = envs.get_global_env("sparse_slots", None, namespace)
dense_slots = envs.get_global_env("dense_slots", None, namespace)
batch_size = envs.get_global_env("batch_size", None, namespace)
reader_class = envs.get_global_env("class", None, namespace)
print("batch_size: {}".format(batch_size))
reader = dataloader_instance.dataloader(
reader_class, state, self._config_yaml)
reader_class = envs.lazy_instance_by_fliename(reader_class, class_name)
reader_ins = reader_class(self._config_yaml)
if sparse_slots is None and dense_slots is None:
reader_class = envs.get_global_env("class", None, namespace)
reader = dataloader_instance.dataloader(reader_class, state,
self._config_yaml)
reader_class = envs.lazy_instance_by_fliename(reader_class,
class_name)
reader_ins = reader_class(self._config_yaml)
else:
reader = dataloader_instance.slotdataloader("", state,
self._config_yaml)
reader_ins = SlotReader(self._config_yaml)
if hasattr(reader_ins, 'generate_batch_from_trainfiles'):
dataloader.set_sample_list_generator(reader)
else:
......@@ -85,27 +98,37 @@ class TranspileTrainer(Trainer):
if state == "TRAIN":
inputs = self.model.get_inputs()
namespace = "train.reader"
train_data_path = envs.get_global_env(
"train_data_path", None, namespace)
train_data_path = envs.get_global_env("train_data_path", None,
namespace)
else:
inputs = self.model.get_infer_inputs()
namespace = "evaluate.reader"
train_data_path = envs.get_global_env(
"test_data_path", None, namespace)
train_data_path = envs.get_global_env("test_data_path", None,
namespace)
sparse_slots = envs.get_global_env("sparse_slots", None, namespace)
dense_slots = envs.get_global_env("dense_slots", None, namespace)
threads = int(envs.get_runtime_environ("train.trainer.threads"))
batch_size = envs.get_global_env("batch_size", None, namespace)
reader_class = envs.get_global_env("class", None, namespace)
abs_dir = os.path.dirname(os.path.abspath(__file__))
reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
pipe_cmd = "python {} {} {} {}".format(
reader, reader_class, state, self._config_yaml)
if sparse_slots is None and dense_slots is None:
pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state,
self._config_yaml)
else:
padding = envs.get_global_env("padding", 0, namespace)
pipe_cmd = "python {} {} {} {} {} {} {} {}".format(
reader, "slot", "slot", self._config_yaml, namespace, \
sparse_slots.replace(" ", "#"), dense_slots.replace(" ", "#"), str(padding))
if train_data_path.startswith("paddlerec::"):
package_base = envs.get_runtime_environ("PACKAGE_BASE")
assert package_base is not None
train_data_path = os.path.join(
package_base, train_data_path.split("::")[1])
train_data_path = os.path.join(package_base,
train_data_path.split("::")[1])
dataset = fluid.DatasetFactory().create_dataset()
dataset.set_use_var(inputs)
......@@ -121,11 +144,11 @@ class TranspileTrainer(Trainer):
debug_mode = envs.get_global_env("reader_debug_mode", False, namespace)
if debug_mode:
print(
"--- Dataset Debug Mode Begin , show pre 10 data of {}---".format(file_list[0]))
print("--- Dataset Debug Mode Begin , show pre 10 data of {}---".
format(file_list[0]))
os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd))
print(
"--- Dataset Debug Mode End , show pre 10 data of {}---".format(file_list[0]))
print("--- Dataset Debug Mode End , show pre 10 data of {}---".
format(file_list[0]))
exit(0)
return dataset
......@@ -147,30 +170,29 @@ class TranspileTrainer(Trainer):
if not need_save(epoch_id, save_interval, False):
return
# print("save inference model is not supported now.")
# return
feed_varnames = envs.get_global_env(
"save.inference.feed_varnames", None, namespace)
feed_varnames = envs.get_global_env("save.inference.feed_varnames",
None, namespace)
fetch_varnames = envs.get_global_env(
"save.inference.fetch_varnames", None, namespace)
if feed_varnames is None or fetch_varnames is None:
return
fetch_vars = [fluid.default_main_program().global_block().vars[varname]
for varname in fetch_varnames]
dirname = envs.get_global_env(
"save.inference.dirname", None, namespace)
fetch_vars = [
fluid.default_main_program().global_block().vars[varname]
for varname in fetch_varnames
]
dirname = envs.get_global_env("save.inference.dirname", None,
namespace)
assert dirname is not None
dirname = os.path.join(dirname, str(epoch_id))
if is_fleet:
fleet.save_inference_model(
self._exe, dirname, feed_varnames, fetch_vars)
fleet.save_inference_model(self._exe, dirname, feed_varnames,
fetch_vars)
else:
fluid.io.save_inference_model(
dirname, feed_varnames, fetch_vars, self._exe)
fluid.io.save_inference_model(dirname, feed_varnames,
fetch_vars, self._exe)
self.inference_models.append((epoch_id, dirname))
def save_persistables():
......@@ -180,8 +202,8 @@ class TranspileTrainer(Trainer):
if not need_save(epoch_id, save_interval, False):
return
dirname = envs.get_global_env(
"save.increment.dirname", None, namespace)
dirname = envs.get_global_env("save.increment.dirname", None,
namespace)
assert dirname is not None
dirname = os.path.join(dirname, str(epoch_id))
......@@ -259,10 +281,9 @@ class TranspileTrainer(Trainer):
batch_id = 0
try:
while True:
metrics_rets = self._exe.run(
program=program,
fetch_list=metrics_varnames,
return_numpy=is_return_numpy)
metrics_rets = self._exe.run(program=program,
fetch_list=metrics_varnames,
return_numpy=is_return_numpy)
metrics = [epoch, batch_id]
metrics.extend(metrics_rets)
......
......@@ -18,6 +18,7 @@ import os
from paddlerec.core.utils.envs import lazy_instance_by_fliename
from paddlerec.core.utils.envs import get_global_env
from paddlerec.core.utils.envs import get_runtime_environ
from paddlerec.core.reader import SlotReader
def dataloader(readerclass, train, yaml_file):
......@@ -62,3 +63,49 @@ def dataloader(readerclass, train, yaml_file):
if hasattr(reader, 'generate_batch_from_trainfiles'):
return gen_batch_reader()
return gen_reader
def slotdataloader(readerclass, train, yaml_file):
if train == "TRAIN":
reader_name = "SlotReader"
namespace = "train.reader"
data_path = get_global_env("train_data_path", None, namespace)
else:
reader_name = "SlotReader"
namespace = "evaluate.reader"
data_path = get_global_env("test_data_path", None, namespace)
if data_path.startswith("paddlerec::"):
package_base = get_runtime_environ("PACKAGE_BASE")
assert package_base is not None
data_path = os.path.join(package_base, data_path.split("::")[1])
files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
sparse = get_global_env("sparse_slots", None, namespace)
dense = get_global_env("dense_slots", None, namespace)
padding = get_global_env("padding", 0, namespace)
reader = SlotReader(yaml_file)
reader.init(sparse, dense, int(padding))
def gen_reader():
for file in files:
with open(file, 'r') as f:
for line in f:
line = line.rstrip('\n')
iter = reader.generate_sample(line)
for parsed_line in iter():
if parsed_line is None:
continue
else:
values = []
for pased in parsed_line:
values.append(pased[1])
yield values
def gen_batch_reader():
return reader.generate_batch_from_trainfiles(files)
if hasattr(reader, 'generate_batch_from_trainfiles'):
return gen_batch_reader()
return gen_reader
......@@ -24,7 +24,7 @@ from paddlerec.core.utils import util as util
class DatasetHolder(object):
"""
Dataset Base
Dataset Holder
"""
__metaclass__ = abc.ABCMeta
......@@ -74,11 +74,17 @@ class TimeSplitDatasetHolder(DatasetHolder):
Dataset.__init__(self, config)
if 'data_donefile' not in config or config['data_donefile'] is None:
config['data_donefile'] = config['data_path'] + "/to.hadoop.done"
self._path_generator = util.PathGenerator({'templates': [
{'name': 'data_path', 'template': config['data_path']},
{'name': 'donefile_path', 'template': config['data_donefile']}
]})
self._split_interval = config['split_interval'] # data split N mins per dir
self._path_generator = util.PathGenerator({
'templates': [{
'name': 'data_path',
'template': config['data_path']
}, {
'name': 'donefile_path',
'template': config['data_donefile']
}]
})
self._split_interval = config[
'split_interval'] # data split N mins per dir
self._data_file_handler = fs.FileHandler(config)
def _format_data_time(self, daytime_str, time_window_mins):
......@@ -91,7 +97,8 @@ class TimeSplitDatasetHolder(DatasetHolder):
return None, 0
if mins_of_day % self._split_interval != 0:
skip_mins = self._split_interval - (mins_of_day % self._split_interval)
skip_mins = self._split_interval - (mins_of_day %
self._split_interval)
data_time = data_time + datetime.timedelta(minutes=skip_mins)
time_window_mins = time_window_mins - skip_mins
return data_time, time_window_mins
......@@ -106,17 +113,24 @@ class TimeSplitDatasetHolder(DatasetHolder):
True/False
"""
is_ready = True
data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins)
data_time, windows_mins = self._format_data_time(daytime_str,
time_window_mins)
while time_window_mins > 0:
file_path = self._path_generator.generate_path('donefile_path', {'time_format': data_time})
file_path = self._path_generator.generate_path(
'donefile_path', {'time_format': data_time})
if not self._data_file_handler.is_exist(file_path):
is_ready = False
break
time_window_mins = time_window_mins - self._split_interval
data_time = data_time + datetime.timedelta(minutes=self._split_interval)
data_time = data_time + datetime.timedelta(
minutes=self._split_interval)
return is_ready
def get_file_list(self, daytime_str, time_window_mins, node_num=1, node_idx=0):
def get_file_list(self,
daytime_str,
time_window_mins,
node_num=1,
node_idx=0):
"""
data in [daytime_str, daytime_str + time_window_mins], random shard to node_num, return shard[node_idx]
Args:
......@@ -128,26 +142,32 @@ class TimeSplitDatasetHolder(DatasetHolder):
list, data_shard[node_idx]
"""
data_file_list = []
data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins)
data_time, windows_mins = self._format_data_time(daytime_str,
time_window_mins)
while time_window_mins > 0:
file_path = self._path_generator.generate_path('data_path', {'time_format': data_time})
file_path = self._path_generator.generate_path(
'data_path', {'time_format': data_time})
sub_file_list = self._data_file_handler.ls(file_path)
for sub_file in sub_file_list:
sub_file_name = self._data_file_handler.get_file_name(sub_file)
if not sub_file_name.startswith(self._config['filename_prefix']):
if not sub_file_name.startswith(self._config[
'filename_prefix']):
continue
if hash(sub_file_name) % node_num == node_idx:
data_file_list.append(sub_file)
time_window_mins = time_window_mins - self._split_interval
data_time = data_time + datetime.timedelta(minutes=self._split_interval)
data_time = data_time + datetime.timedelta(
minutes=self._split_interval)
return data_file_list
def _alloc_dataset(self, file_list):
""" """
dataset = fluid.DatasetFactory().create_dataset(self._config['dataset_type'])
dataset = fluid.DatasetFactory().create_dataset(self._config[
'dataset_type'])
dataset.set_batch_size(self._config['batch_size'])
dataset.set_thread(self._config['load_thread'])
dataset.set_hdfs_config(self._config['fs_name'], self._config['fs_ugi'])
dataset.set_hdfs_config(self._config['fs_name'],
self._config['fs_ugi'])
dataset.set_pipe_command(self._config['data_converter'])
dataset.set_filelist(file_list)
dataset.set_use_var(self._config['data_vars'])
......@@ -163,7 +183,9 @@ class TimeSplitDatasetHolder(DatasetHolder):
while self.check_ready(begin_time, windown_min) == False:
print("dataset not ready, time:" + begin_time)
time.sleep(30)
file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx'])
file_list = self.get_file_list(begin_time, windown_min,
params['node_num'],
params['node_idx'])
self._datasets[begin_time] = self._alloc_dataset(file_list)
self._datasets[begin_time].load_into_memory()
else:
......@@ -176,9 +198,12 @@ class TimeSplitDatasetHolder(DatasetHolder):
windown_min = params['time_window_min']
if begin_time not in self._datasets:
if self.check_ready(begin_time, windown_min):
file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx'])
file_list = self.get_file_list(begin_time, windown_min,
params['node_num'],
params['node_idx'])
self._datasets[begin_time] = self._alloc_dataset(file_list)
self._datasets[begin_time].preload_into_memory(self._config['preload_thread'])
self._datasets[begin_time].preload_into_memory(self._config[
'preload_thread'])
return True
return False
......
......@@ -16,19 +16,34 @@ from __future__ import print_function
import sys
from paddlerec.core.utils.envs import lazy_instance_by_fliename
from paddlerec.core.reader import SlotReader
if len(sys.argv) != 4:
raise ValueError("reader only accept 3 argument: 1. reader_class 2.train/evaluate 3.yaml_abs_path")
if len(sys.argv) < 4:
raise ValueError(
"reader only accept 3 argument: 1. reader_class 2.train/evaluate/slotreader 3.yaml_abs_path"
)
reader_package = sys.argv[1]
if sys.argv[2] == "TRAIN":
if sys.argv[2].upper() == "TRAIN":
reader_name = "TrainReader"
else:
elif sys.argv[2].upper() == "EVALUATE":
reader_name = "EvaluateReader"
else:
reader_name = "SlotReader"
namespace = sys.argv[4]
sparse_slots = sys.argv[5].replace("#", " ")
dense_slots = sys.argv[6].replace("#", " ")
padding = int(sys.argv[7])
yaml_abs_path = sys.argv[3]
reader_class = lazy_instance_by_fliename(reader_package, reader_name)
reader = reader_class(yaml_abs_path)
reader.init()
reader.run_from_stdin()
if reader_name != "SlotReader":
reader_class = lazy_instance_by_fliename(reader_package, reader_name)
reader = reader_class(yaml_abs_path)
reader.init()
reader.run_from_stdin()
else:
reader = SlotReader(yaml_abs_path)
reader.init(sparse_slots, dense_slots, padding)
reader.run_from_stdin()
......@@ -95,7 +95,7 @@ def path_adapter(path):
l_p = path.split("paddlerec.")[1].replace(".", "/")
return os.path.join(package, l_p)
else:
return path
return path
def windows_path_converter(path):
......@@ -159,8 +159,8 @@ def pretty_print_envs(envs, header=None):
def lazy_instance_by_package(package, class_name):
models = get_global_env("train.model.models")
model_package = __import__(
package, globals(), locals(), package.split("."))
model_package = __import__(package,
globals(), locals(), package.split("."))
instance = getattr(model_package, class_name)
return instance
......@@ -170,8 +170,8 @@ def lazy_instance_by_fliename(abs, class_name):
sys.path.append(dirname)
package = os.path.splitext(os.path.basename(abs))[0]
model_package = __import__(
package, globals(), locals(), package.split("."))
model_package = __import__(package,
globals(), locals(), package.split("."))
instance = getattr(model_package, class_name)
return instance
......@@ -189,8 +189,7 @@ def get_platform():
def find_free_port():
def __free_port():
with closing(socket.socket(socket.AF_INET,
socket.SOCK_STREAM)) as s:
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
return s.getsockname()[1]
......
......@@ -18,7 +18,7 @@ from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
def is_afs_path(path):
"""R
"""is_afs_path
"""
if path.startswith("afs") or path.startswith("hdfs"):
return True
......@@ -133,8 +133,9 @@ class FileHandler(object):
if mode.find('a') >= 0:
org_content = self._hdfs_client.cat(dest_path)
content = content + org_content
self._local_fs_client.write(content, temp_local_file,
mode) # fleet hdfs_client only support upload, so write tmp file
self._local_fs_client.write(
content, temp_local_file, mode
) # fleet hdfs_client only support upload, so write tmp file
self._hdfs_client.delete(dest_path + ".tmp")
self._hdfs_client.upload(dest_path + ".tmp", temp_local_file)
self._hdfs_client.delete(dest_path + ".bak")
......@@ -158,7 +159,8 @@ class FileHandler(object):
files = []
if is_afs_path(path):
files = self._hdfs_client.ls(path)
files = [path + '/' + self.get_file_name(fi) for fi in files] # absulte path
files = [path + '/' + self.get_file_name(fi)
for fi in files] # absulte path
else:
files = self._local_fs_client.ls(path)
files = [path + '/' + fi for fi in files] # absulte path
......
......@@ -22,6 +22,7 @@ from paddlerec.core.utils import fs as fs
def save_program_proto(path, program=None):
if program is None:
_program = fluid.default_main_program()
else:
......@@ -175,7 +176,8 @@ class PathGenerator(object):
"""
if template_name in self._templates:
if 'time_format' in param:
str = param['time_format'].strftime(self._templates[template_name])
str = param['time_format'].strftime(self._templates[
template_name])
return str.format(**param)
return self._templates[template_name].format(**param)
else:
......@@ -198,31 +200,39 @@ class TimeTrainPass(object):
self._begin_day = make_datetime(day_fields[0].strip())
if len(day_fields) == 1 or len(day_fields[1]) == 0:
# 100 years, meaning to continuous running
self._end_day = self._begin_day + datetime.timedelta(days=36500)
self._end_day = self._begin_day + datetime.timedelta(
days=36500)
else:
# example: 2020212+10
run_day = int(day_fields[1].strip())
self._end_day = self._begin_day + datetime.timedelta(days=run_day)
self._end_day = self._begin_day + datetime.timedelta(
days=run_day)
else:
# example: {20191001..20191031}
days = os.popen("echo -n " + self._config['days']).read().split(" ")
days = os.popen("echo -n " + self._config['days']).read().split(
" ")
self._begin_day = make_datetime(days[0])
self._end_day = make_datetime(days[len(days) - 1])
self._checkpoint_interval = self._config['checkpoint_interval']
self._dump_inference_interval = self._config['dump_inference_interval']
self._interval_per_pass = self._config['train_time_interval'] # train N min data per pass
self._interval_per_pass = self._config[
'train_time_interval'] # train N min data per pass
self._pass_id = 0
self._inference_pass_id = 0
self._pass_donefile_handler = None
if 'pass_donefile_name' in self._config:
self._train_pass_donefile = global_config['output_path'] + '/' + self._config['pass_donefile_name']
self._train_pass_donefile = global_config[
'output_path'] + '/' + self._config['pass_donefile_name']
if fs.is_afs_path(self._train_pass_donefile):
self._pass_donefile_handler = fs.FileHandler(global_config['io']['afs'])
self._pass_donefile_handler = fs.FileHandler(global_config[
'io']['afs'])
else:
self._pass_donefile_handler = fs.FileHandler(global_config['io']['local_fs'])
self._pass_donefile_handler = fs.FileHandler(global_config[
'io']['local_fs'])
last_done = self._pass_donefile_handler.cat(self._train_pass_donefile).strip().split('\n')[-1]
last_done = self._pass_donefile_handler.cat(
self._train_pass_donefile).strip().split('\n')[-1]
done_fileds = last_done.split('\t')
if len(done_fileds) > 4:
self._base_key = done_fileds[1]
......@@ -236,15 +246,18 @@ class TimeTrainPass(object):
"""
return 24 * 60 / self._interval_per_pass
def save_train_progress(self, day, pass_id, base_key, model_path, is_checkpoint):
def save_train_progress(self, day, pass_id, base_key, model_path,
is_checkpoint):
"""R
"""
if is_checkpoint:
self._checkpoint_pass_id = pass_id
self._checkpoint_model_path = model_path
done_content = "%s\t%s\t%s\t%s\t%d\n" % (day, base_key,
self._checkpoint_model_path, self._checkpoint_pass_id, pass_id)
self._pass_donefile_handler.write(done_content, self._train_pass_donefile, 'a')
done_content = "%s\t%s\t%s\t%s\t%d\n" % (
day, base_key, self._checkpoint_model_path,
self._checkpoint_pass_id, pass_id)
self._pass_donefile_handler.write(done_content,
self._train_pass_donefile, 'a')
pass
def init_pass_by_id(self, date_str, pass_id):
......@@ -286,12 +299,14 @@ class TimeTrainPass(object):
if self._pass_id < 1:
self.init_pass_by_time(self._begin_day.strftime("%Y%m%d%H%M"))
else:
next_time = self._current_train_time + datetime.timedelta(minutes=self._interval_per_pass)
next_time = self._current_train_time + datetime.timedelta(
minutes=self._interval_per_pass)
if (next_time - self._end_day).total_seconds() > 0:
has_next = False
else:
self.init_pass_by_time(next_time.strftime("%Y%m%d%H%M"))
if has_next and (self._inference_pass_id < self._pass_id or self._pass_id < old_pass_id):
if has_next and (self._inference_pass_id < self._pass_id or
self._pass_id < old_pass_id):
self._inference_pass_id = self._pass_id - 1
return has_next
......@@ -319,9 +334,11 @@ class TimeTrainPass(object):
Return:
date(current_train_time + delta_day)
"""
return (self._current_train_time + datetime.timedelta(days=delta_day)).strftime("%Y%m%d")
return (self._current_train_time + datetime.timedelta(days=delta_day)
).strftime("%Y%m%d")
def timestamp(self, delta_day=0):
"""R
"""
return (self._current_train_time + datetime.timedelta(days=delta_day)).timestamp()
return (self._current_train_time + datetime.timedelta(days=delta_day)
).timestamp()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# PaddleRec 贡献代码
> 占位
\ No newline at end of file
> 占位
# PaddleRec 推荐数据集格式
当你的数据集格式为[slot:feasign]*这种模式,或者可以预处理为这种格式时,可以直接使用PaddleRec内置的Reader。
好处是不用自己写Reader了,各个model之间的数据格式也都可以统一成一样的格式。
## 数据格式说明
假如你的原始数据格式为
```bash
<label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>
```
其中```<label>```表示广告是否被点击,点击用1表示,未点击用0表示。```<integer feature>```代表数值特征(连续特征),共有13个连续特征。
并且每个特征有一个特征值。
```<categorical feature>```代表分类特征(离散特征),共有26个离散特征。相邻两个特征用```\t```分隔。
假设这13个连续特征(dense slot)的name如下:
```
D1 D2 D3 D4 D4 D6 D7 D8 D9 D10 D11 D12 D13
```
这26个离散特征(sparse slot)的name如下:
```
S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 S11 S12 S13 S14 S15 S16 S17 S18 S19 S20 S21 S22 S23 S24 S25 S26
```
那么下面这条样本(1个label + 13个dense值 + 26个feasign)
```
1 0.1 0.4 0.2 0.3 0.5 0.8 0.3 0.2 0.1 0.5 0.6 0.3 0.9 60 16 91 50 52 52 28 69 63 33 87 69 48 59 27 12 95 36 37 41 17 3 86 19 88 60
```
可以转换成:
```
label:1 D1:0.1 D2:0.4 D3:0.2 D4:0.3 D5:0.5 D6:0.8 D7:0.3 D8:0.2 D9:0.1 D10:0.5 D11:0.6 D12:0.3 D13:0.9 S14:60 S15:16 S16:91 S17:50 S18:52 S19:52 S20:28 S21:69 S22:63 S23:33 S24:87 S25:69 S26:48 S27:59 S28:27 S29:12 S30:95 S31:36 S32:37 S33:41 S34:17 S35:3 S36:86 S37:19 S38:88 S39:60
```
注意:上面各个slot:feasign字段之间的顺序没有要求,比如```D1:0.1 D2:0.4```改成```D2:0.4 D1:0.1```也可以。
## 配置
reader中需要配置```sparse_slots```与```dense_slots```,例如
```
workspace: xxxx
reader:
batch_size: 2
train_data_path: "{workspace}/data/train_data"
sparse_slots: "label S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 S11 S12 S13 S14 S15 S16 S17 S18 S19 S20 S21 S22 S23 S24 S25 S26"
dense_slots: "D1:1 D2:1 D3:1 D4:1 D4:1 D6:1 D7:1 D8:1 D9:1 D10:1 D11:1 D12:1 D13:1"
model:
xxxxx
```
sparse_slots表示稀疏特征的列表,以空格分开。
dense_slots表示稠密特征的列表,以空格分开。每个字段的格式是```[dense_slot_name]:[dim1,dim2,dim3...]```,其中```dim1,dim2,dim3...```表示shape
配置好了之后,这些slot对应的variable就可以在model中的如下变量啦:
```
self._sparse_data_var
self._dense_data_var
```
# PaddleRec 自定义数据集及Reader
用户自定义数据集及配置异步Reader,需要关注以下几个步骤:
......
......@@ -279,4 +279,4 @@ class Metric(object):
pass
```
全局指标的计算及输出,需要分别继承并实现以上四个成员函数。具体实现的例子,可以参考[auc_metric.py](../core/metrics/auc_metrics.py)
\ No newline at end of file
全局指标的计算及输出,需要分别继承并实现以上四个成员函数。具体实现的例子,可以参考[auc_metric.py](../core/metrics/auc_metrics.py)
......@@ -7,5 +7,3 @@
### K8S集群运行分布式
> 占位
# 常见问题FAQ
> 占位
\ No newline at end of file
> 占位
# PaddleRec 单机训练
> 占位
\ No newline at end of file
> 占位
......@@ -12,4 +12,3 @@
| 多任务 | [ESMM]() | ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 | [DSSM]() | ✓ | x | ✓ | x | ✓ | ✓ |
| 匹配 | [Multiview-Simnet]() | ✓ | x | ✓ | x | ✓ | ✓ |
# PaddleRec 模型调参
> 占位
\ No newline at end of file
> 占位
# PaddleRec 离线预测
\ No newline at end of file
# PaddleRec 离线预测
......@@ -5,4 +5,3 @@
## [参数服务器训练](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -37,4 +37,3 @@ train:
dirname: "inference"
epoch_interval: 100
save_last: True
......@@ -31,7 +31,8 @@ class Model(ModelBase):
def train_net(self):
""" network definition """
data = fluid.data(name="input", shape=[None, self.max_len], dtype='int64')
data = fluid.data(
name="input", shape=[None, self.max_len], dtype='int64')
label = fluid.data(name="label", shape=[None, 1], dtype='int64')
seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
......@@ -51,7 +52,9 @@ class Model(ModelBase):
# full connect layer
fc_1 = fluid.layers.fc(input=[conv], size=self.hid_dim)
# softmax layer
prediction = fluid.layers.fc(input=[fc_1], size=self.class_dim, act="softmax")
prediction = fluid.layers.fc(input=[fc_1],
size=self.class_dim,
act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from paddlerec.core.reader import Reader
......@@ -38,7 +37,8 @@ class TrainReader(Reader):
data = [int(i) for i in data]
label = [int(i) for i in label]
seq_len = [int(i) for i in seq_len]
print >> sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)])
print >> sys.stderr, str(
[('data', data), ('label', label), ('seq_len', seq_len)])
yield [('data', data), ('label', label), ('seq_len', seq_len)]
return data_iter
......@@ -71,13 +71,13 @@ python text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test
### 训练
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification -d cpu -e single
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
```
### 预测
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification -d cpu -e single
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
```
## 效果对比
......@@ -87,19 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
| :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: |
| ag news dataset | TagSpace | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- |
## 分布式
### 模型训练性能 (样本/s)
| 数据集 | 模型 | 单机 | 同步 (4节点) | 同步 (8节点) | 同步 (16节点) | 同步 (32节点) |
| :------------------: | :--------------------: | :---------: |:---------: |:---------: |:---------: |:---------: |
| -- | TagSpace | -- | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- | -- |
----
| 数据集 | 模型 | 单机 | 异步 (4节点) | 异步 (8节点) | 异步 (16节点) | 异步 (32节点) |
| :------------------: | :--------------------: | :---------: |:---------: |:---------: |:---------: |:---------: |
| -- | TagSpace | -- | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- | -- |
......@@ -47,4 +47,3 @@ train:
dirname: "inference"
epoch_interval: 100
save_last: True
......@@ -26,8 +26,10 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
self.cost = None
self.metrics = {}
self.vocab_text_size = envs.get_global_env("vocab_text_size", None, self._namespace)
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, self._namespace)
self.vocab_text_size = envs.get_global_env("vocab_text_size", None,
self._namespace)
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None,
self._namespace)
self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace)
self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace)
self.win_size = envs.get_global_env("win_size", None, self._namespace)
......@@ -35,8 +37,9 @@ class Model(ModelBase):
self.neg_size = envs.get_global_env("neg_size", None, self._namespace)
def train_net(self):
""" network definition """
text = fluid.data(name="text", shape=[None, 1], lod_level=1, dtype='int64')
""" network"""
text = fluid.data(
name="text", shape=[None, 1], lod_level=1, dtype='int64')
pos_tag = fluid.data(
name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64')
neg_tag = fluid.data(
......@@ -45,13 +48,19 @@ class Model(ModelBase):
self._data_var = [text, pos_tag, neg_tag]
text_emb = fluid.embedding(
input=text, size=[self.vocab_text_size, self.emb_dim], param_attr="text_emb")
input=text,
size=[self.vocab_text_size, self.emb_dim],
param_attr="text_emb")
text_emb = fluid.layers.squeeze(input=text_emb, axes=[1])
pos_tag_emb = fluid.embedding(
input=pos_tag, size=[self.vocab_tag_size, self.emb_dim], param_attr="tag_emb")
input=pos_tag,
size=[self.vocab_tag_size, self.emb_dim],
param_attr="tag_emb")
pos_tag_emb = fluid.layers.squeeze(input=pos_tag_emb, axes=[1])
neg_tag_emb = fluid.embedding(
input=neg_tag, size=[self.vocab_tag_size, self.emb_dim], param_attr="tag_emb")
input=neg_tag,
size=[self.vocab_tag_size, self.emb_dim],
param_attr="tag_emb")
neg_tag_emb = fluid.layers.squeeze(input=neg_tag_emb, axes=[1])
conv_1d = fluid.nets.sequence_conv_pool(
......@@ -65,7 +74,8 @@ class Model(ModelBase):
size=self.emb_dim,
param_attr="text_hid")
cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb)
mul_text_hid = fluid.layers.sequence_expand_as(
x=text_hid, y=neg_tag_emb)
mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid)
cos_neg_all = fluid.layers.sequence_reshape(
input=mul_cos_neg, new_dim=self.neg_size)
......@@ -74,7 +84,10 @@ class Model(ModelBase):
#calculate hinge loss
loss_part1 = nn.elementwise_sub(
tensor.fill_constant_batch_size_like(
input=cos_pos, shape=[-1, 1], value=self.margin, dtype='float32'),
input=cos_pos,
shape=[-1, 1],
value=self.margin,
dtype='float32'),
cos_pos)
loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
loss_part3 = nn.elementwise_max(
......@@ -85,7 +98,7 @@ class Model(ModelBase):
less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
correct = nn.reduce_sum(less)
self.cost = avg_cost
self.metrics["correct"] = correct
self.metrics["cos_pos"] = cos_pos
......@@ -96,7 +109,8 @@ class Model(ModelBase):
return self.metrics
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.base_lr", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.base_lr", None,
self._namespace)
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
return sgd_optimizer
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import numpy as np
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -23,13 +23,26 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def input(self):
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace)
Neg = envs.get_global_env("hyper_parameters.NEG", None, self._namespace)
self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i
in range(Neg)]
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
self._namespace)
Neg = envs.get_global_env("hyper_parameters.NEG", None,
self._namespace)
self.query = fluid.data(
name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(
name="doc_pos",
shape=[-1, TRIGRAM_D],
dtype='float32',
lod_level=0)
self.doc_negs = [
fluid.data(
name="doc_neg_" + str(i),
shape=[-1, TRIGRAM_D],
dtype="float32",
lod_level=0) for i in range(Neg)
]
self._data_var.append(self.query)
self._data_var.append(self.doc_pos)
for input in self.doc_negs:
......@@ -37,16 +50,24 @@ class Model(ModelBase):
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def net(self, is_infer=False):
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None, self._namespace)
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
self._namespace)
hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None,
self._namespace)
def fc(data, hidden_layers, hidden_acts, names):
fc_inputs = [data]
for i in range(len(hidden_layers)):
xavier = fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i])
xavier = fluid.initializer.Xavier(
uniform=True,
fan_in=fc_inputs[-1].shape[1],
fan_out=hidden_layers[i])
out = fluid.layers.fc(input=fc_inputs[-1],
size=hidden_layers[i],
act=hidden_acts[i],
......@@ -56,8 +77,10 @@ class Model(ModelBase):
fc_inputs.append(out)
return fc_inputs[-1]
query_fc = fc(self.query, hidden_layers, hidden_acts, ['query_l1', 'query_l2', 'query_l3'])
doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts, ['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
query_fc = fc(self.query, hidden_layers, hidden_acts,
['query_l1', 'query_l2', 'query_l3'])
doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts,
['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)
if is_infer:
......@@ -65,13 +88,17 @@ class Model(ModelBase):
R_Q_D_ns = []
for i, doc_neg in enumerate(self.doc_negs):
doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts,
['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)])
doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, [
'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
'doc_neg_l3_' + str(i)
])
R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
concat_Rs = fluid.layers.concat(input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
concat_Rs = fluid.layers.concat(
input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
hit_prob = fluid.layers.slice(
prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
self.avg_cost = fluid.layers.mean(x=loss)
......@@ -91,18 +118,28 @@ class Model(ModelBase):
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.SGD(learning_rate)
return optimizer
def infer_input(self):
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace)
self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
self._namespace)
self.query = fluid.data(
name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(
name="doc_pos",
shape=[-1, TRIGRAM_D],
dtype='float32',
lod_level=0)
self._infer_data_var = [self.query, self.doc_pos]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
......
......@@ -22,4 +22,3 @@ mkdir -p data/train
mkdir -p data/test
python generate_synthetic_data.py
......@@ -18,8 +18,10 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
def init(self):
self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model")
self.query_slots = envs.get_global_env("hyper_parameters.query_slots",
None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots",
None, "train.model")
self.all_slots = []
for i in range(self.query_slots):
......
......@@ -21,7 +21,11 @@ class Dataset:
class SyntheticDataset(Dataset):
def __init__(self, sparse_feature_dim, query_slot_num, title_slot_num, dataset_size=10000):
def __init__(self,
sparse_feature_dim,
query_slot_num,
title_slot_num,
dataset_size=10000):
# ids are randomly generated
self.ids_per_slot = 10
self.sparse_feature_dim = sparse_feature_dim
......@@ -46,14 +50,20 @@ class SyntheticDataset(Dataset):
for i in range(self.title_slot_num):
pt_slot = generate_ids(self.ids_per_slot,
self.sparse_feature_dim)
pt_slot = [str(fea) + ':' + str(i + self.query_slot_num) for fea in pt_slot]
pt_slot = [
str(fea) + ':' + str(i + self.query_slot_num)
for fea in pt_slot
]
pos_title_slots += pt_slot
if is_train:
for i in range(self.title_slot_num):
nt_slot = generate_ids(self.ids_per_slot,
self.sparse_feature_dim)
nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in
nt_slot]
nt_slot = [
str(fea) + ':' +
str(i + self.query_slot_num + self.title_slot_num)
for fea in nt_slot
]
neg_title_slots += nt_slot
yield query_slots + pos_title_slots + neg_title_slots
else:
......@@ -76,7 +86,8 @@ if __name__ == '__main__':
query_slots = 1
title_slots = 1
dataset_size = 10
dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots, dataset_size)
dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots,
dataset_size)
train_reader = dataset.train()
test_reader = dataset.test()
......
......@@ -103,12 +103,18 @@ class Model(ModelBase):
def init_config(self):
self._fetch_interval = 1
query_encoder = envs.get_global_env("hyper_parameters.query_encoder", None, self._namespace)
title_encoder = envs.get_global_env("hyper_parameters.title_encoder", None, self._namespace)
query_encode_dim = envs.get_global_env("hyper_parameters.query_encode_dim", None, self._namespace)
title_encode_dim = envs.get_global_env("hyper_parameters.title_encode_dim", None, self._namespace)
query_slots = envs.get_global_env("hyper_parameters.query_slots", None, self._namespace)
title_slots = envs.get_global_env("hyper_parameters.title_slots", None, self._namespace)
query_encoder = envs.get_global_env("hyper_parameters.query_encoder",
None, self._namespace)
title_encoder = envs.get_global_env("hyper_parameters.title_encoder",
None, self._namespace)
query_encode_dim = envs.get_global_env(
"hyper_parameters.query_encode_dim", None, self._namespace)
title_encode_dim = envs.get_global_env(
"hyper_parameters.title_encode_dim", None, self._namespace)
query_slots = envs.get_global_env("hyper_parameters.query_slots", None,
self._namespace)
title_slots = envs.get_global_env("hyper_parameters.title_slots", None,
self._namespace)
factory = SimpleEncoderFactory()
self.query_encoders = [
factory.create(query_encoder, query_encode_dim)
......@@ -119,10 +125,13 @@ class Model(ModelBase):
for i in range(title_slots)
]
self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim", None, self._namespace)
self.emb_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim",
None, self._namespace)
self.emb_shape = [self.emb_size, self.emb_dim]
self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace)
self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size",
None, self._namespace)
self.margin = 0.1
def input(self, is_train=True):
......@@ -133,8 +142,10 @@ class Model(ModelBase):
]
self.pt_slots = [
fluid.data(
name="%d" % (i + len(self.query_encoders)), shape=[None, 1], lod_level=1, dtype='int64')
for i in range(len(self.title_encoders))
name="%d" % (i + len(self.query_encoders)),
shape=[None, 1],
lod_level=1,
dtype='int64') for i in range(len(self.title_encoders))
]
if is_train == False:
......@@ -142,9 +153,11 @@ class Model(ModelBase):
self.nt_slots = [
fluid.data(
name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1,
dtype='int64')
for i in range(len(self.title_encoders))
name="%d" %
(i + len(self.query_encoders) + len(self.title_encoders)),
shape=[None, 1],
lod_level=1,
dtype='int64') for i in range(len(self.title_encoders))
]
return self.q_slots + self.pt_slots + self.nt_slots
......@@ -153,11 +166,15 @@ class Model(ModelBase):
res = self.input()
self._data_var = res
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace)
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
False, self._namespace)
if self._platform != "LINUX" or use_dataloader:
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=256, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=256,
use_double_buffer=False,
iterable=False)
def get_acc(self, x, y):
less = tensor.cast(cf.less_than(x, y), dtype='float32')
......@@ -190,10 +207,12 @@ class Model(ModelBase):
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
]
pt_encodes = [
self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
]
nt_encodes = [
self.title_encoders[i].forward(emb) for i, emb in enumerate(nt_embs)
self.title_encoders[i].forward(emb)
for i, emb in enumerate(nt_embs)
]
# concat multi view for query, pos_title, neg_title
......@@ -252,7 +271,8 @@ class Model(ModelBase):
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
return optimizer
......@@ -261,7 +281,10 @@ class Model(ModelBase):
self._infer_data_var = res
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
......@@ -281,7 +304,8 @@ class Model(ModelBase):
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
]
pt_encodes = [
self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
]
# concat multi view for query, pos_title, neg_title
q_concat = fluid.layers.concat(q_encodes)
......
......@@ -18,8 +18,10 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model")
self.query_slots = envs.get_global_env("hyper_parameters.query_slots",
None, "train.model")
self.title_slots = envs.get_global_env("hyper_parameters.title_slots",
None, "train.model")
self.all_slots = []
for i in range(self.query_slots):
......
......@@ -37,4 +37,3 @@
python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -20,9 +20,11 @@ from paddlerec.core.reader import Reader
class EvaluateReader(Reader):
def init(self):
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128',
'129',
'205', '206', '207', '210', '216', '508', '509', '702', '853', '301']
all_field_id = [
'101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
'125', '126', '127', '128', '129', '205', '206', '207', '210',
'216', '508', '509', '702', '853', '301'
]
self.all_field_id_dict = defaultdict(int)
for i, field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False, i]
......
......@@ -21,9 +21,11 @@ from paddlerec.core.reader import Reader
class TrainReader(Reader):
def init(self):
all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128',
'129',
'205', '206', '207', '210', '216', '508', '509', '702', '853', '301']
all_field_id = [
'101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
'125', '126', '127', '128', '129', '205', '206', '207', '210',
'216', '508', '509', '702', '853', '301'
]
self.all_field_id_dict = defaultdict(int)
for i, field_id in enumerate(all_field_id):
self.all_field_id_dict[field_id] = [False, i]
......
......@@ -28,11 +28,13 @@ class Model(ModelBase):
init_stddev = 1.0
scales = 1.0 / np.sqrt(data.shape[1])
p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag,
initializer=fluid.initializer.NormalInitializer(loc=0.0,
scale=init_stddev * scales))
p_attr = fluid.param_attr.ParamAttr(
name='%s_weight' % tag,
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=init_stddev * scales))
b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
b_attr = fluid.ParamAttr(
name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
out = fluid.layers.fc(input=data,
size=out_dim,
......@@ -44,7 +46,11 @@ class Model(ModelBase):
def input_data(self):
sparse_input_ids = [
fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1) for i in range(0, 23)
fluid.data(
name="field_" + str(i),
shape=[-1, 1],
dtype="int64",
lod_level=1) for i in range(0, 23)
]
label_ctr = fluid.data(name="ctr", shape=[-1, 1], dtype="int64")
label_cvr = fluid.data(name="cvr", shape=[-1, 1], dtype="int64")
......@@ -55,19 +61,23 @@ class Model(ModelBase):
def net(self, inputs, is_infer=False):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
embed_size = envs.get_global_env("hyper_parameters.embed_size", None, self._namespace)
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
self._namespace)
embed_size = envs.get_global_env("hyper_parameters.embed_size", None,
self._namespace)
emb = []
for data in inputs[0:-2]:
feat_emb = fluid.embedding(input=data,
size=[vocab_size, embed_size],
param_attr=fluid.ParamAttr(name='dis_emb',
learning_rate=5,
initializer=fluid.initializer.Xavier(
fan_in=embed_size, fan_out=embed_size)
),
is_sparse=True)
field_emb = fluid.layers.sequence_pool(input=feat_emb, pool_type='sum')
feat_emb = fluid.embedding(
input=data,
size=[vocab_size, embed_size],
param_attr=fluid.ParamAttr(
name='dis_emb',
learning_rate=5,
initializer=fluid.initializer.Xavier(
fan_in=embed_size, fan_out=embed_size)),
is_sparse=True)
field_emb = fluid.layers.sequence_pool(
input=feat_emb, pool_type='sum')
emb.append(field_emb)
concat_emb = fluid.layers.concat(emb, axis=1)
......@@ -85,14 +95,20 @@ class Model(ModelBase):
ctr_clk = inputs[-2]
ctcvr_buy = inputs[-1]
ctr_prop_one = fluid.layers.slice(ctr_out, axes=[1], starts=[1], ends=[2])
cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2])
ctr_prop_one = fluid.layers.slice(
ctr_out, axes=[1], starts=[1], ends=[2])
cvr_prop_one = fluid.layers.slice(
cvr_out, axes=[1], starts=[1], ends=[2])
ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one)
ctcvr_prop = fluid.layers.concat(input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)
ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one,
cvr_prop_one)
ctcvr_prop = fluid.layers.concat(
input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)
auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk)
auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy)
auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(
input=ctr_out, label=ctr_clk)
auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(
input=ctcvr_prop, label=ctcvr_buy)
if is_infer:
self._infer_results["AUC_ctr"] = auc_ctr
......@@ -100,7 +116,8 @@ class Model(ModelBase):
return
loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk)
loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
loss_ctcvr = fluid.layers.cross_entropy(
input=ctcvr_prop, label=ctcvr_buy)
cost = loss_ctr + loss_ctcvr
avg_cost = fluid.layers.mean(cost)
......@@ -117,5 +134,8 @@ class Model(ModelBase):
def infer_net(self):
self._infer_data_var = self.input_data()
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self.net(self._infer_data_var, is_infer=True)
......@@ -19,6 +19,7 @@ from paddlerec.core.reader import Reader
class EvaluateReader(Reader):
def init(self):
pass
def generate_sample(self, line):
......
......@@ -24,6 +24,7 @@ class TrainReader(Reader):
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
"""
def reader():
......
......@@ -23,44 +23,58 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def MMOE(self, is_infer=False):
feature_size = envs.get_global_env("hyper_parameters.feature_size", None, self._namespace)
expert_num = envs.get_global_env("hyper_parameters.expert_num", None, self._namespace)
gate_num = envs.get_global_env("hyper_parameters.gate_num", None, self._namespace)
expert_size = envs.get_global_env("hyper_parameters.expert_size", None, self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None, self._namespace)
input_data = fluid.data(name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
feature_size = envs.get_global_env("hyper_parameters.feature_size",
None, self._namespace)
expert_num = envs.get_global_env("hyper_parameters.expert_num", None,
self._namespace)
gate_num = envs.get_global_env("hyper_parameters.gate_num", None,
self._namespace)
expert_size = envs.get_global_env("hyper_parameters.expert_size", None,
self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None,
self._namespace)
input_data = fluid.data(
name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(
name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(
name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
if is_infer:
self._infer_data_var = [input_data, label_income, label_marital]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self._data_var.extend([input_data, label_income, label_marital])
# f_{i}(x) = activation(W_{i} * x + b), where activation is ReLU according to the paper
expert_outputs = []
for i in range(0, expert_num):
expert_output = fluid.layers.fc(input=input_data,
size=expert_size,
act='relu',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='expert_' + str(i))
expert_output = fluid.layers.fc(
input=input_data,
size=expert_size,
act='relu',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='expert_' + str(i))
expert_outputs.append(expert_output)
expert_concat = fluid.layers.concat(expert_outputs, axis=1)
expert_concat = fluid.layers.reshape(expert_concat, [-1, expert_num, expert_size])
expert_concat = fluid.layers.reshape(expert_concat,
[-1, expert_num, expert_size])
# g^{k}(x) = activation(W_{gk} * x + b), where activation is softmax according to the paper
output_layers = []
for i in range(0, gate_num):
cur_gate = fluid.layers.fc(input=input_data,
size=expert_num,
act='softmax',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='gate_' + str(i))
cur_gate = fluid.layers.fc(
input=input_data,
size=expert_num,
act='softmax',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='gate_' + str(i))
# f^{k}(x) = sum_{i=1}^{n}(g^{k}(x)_{i} * f_{i}(x))
cur_gate_expert = fluid.layers.elementwise_mul(expert_concat, cur_gate, axis=0)
cur_gate_expert = fluid.layers.elementwise_mul(
expert_concat, cur_gate, axis=0)
cur_gate_expert = fluid.layers.reduce_sum(cur_gate_expert, dim=1)
# Build tower layer
cur_tower = fluid.layers.fc(input=cur_gate_expert,
......@@ -74,25 +88,33 @@ class Model(ModelBase):
output_layers.append(out)
pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income,
label=fluid.layers.cast(x=label_income_1,
dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
label=fluid.layers.cast(x=label_marital_1,
dtype='int64'))
pred_income = fluid.layers.clip(
output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(
output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(
label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(
label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
input=pred_income,
label=fluid.layers.cast(
x=label_income_1, dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
input=pred_marital,
label=fluid.layers.cast(
x=label_marital_1, dtype='int64'))
if is_infer:
self._infer_results["AUC_income"] = auc_income
self._infer_results["AUC_marital"] = auc_marital
return
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)
cost_income = fluid.layers.cross_entropy(
input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(
input=pred_marital, label=label_marital, soft_label=True)
avg_cost_income = fluid.layers.mean(x=cost_income)
avg_cost_marital = fluid.layers.mean(x=cost_marital)
......
......@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
| Census-income Data | Share-Bottom | -- | 0.93120/0.99256 |
| Census-income Data | MMoE | -- | 0.94465/0.99324 |
| Ali-CCP | ESMM | -- | 0.97181/0.49967 |
......@@ -24,27 +24,38 @@ class Model(ModelBase):
def model(self, is_infer=False):
feature_size = envs.get_global_env("hyper_parameters.feature_size", None, self._namespace)
bottom_size = envs.get_global_env("hyper_parameters.bottom_size", None, self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None, self._namespace)
tower_nums = envs.get_global_env("hyper_parameters.tower_nums", None, self._namespace)
input_data = fluid.data(name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
feature_size = envs.get_global_env("hyper_parameters.feature_size",
None, self._namespace)
bottom_size = envs.get_global_env("hyper_parameters.bottom_size", None,
self._namespace)
tower_size = envs.get_global_env("hyper_parameters.tower_size", None,
self._namespace)
tower_nums = envs.get_global_env("hyper_parameters.tower_nums", None,
self._namespace)
input_data = fluid.data(
name="input", shape=[-1, feature_size], dtype="float32")
label_income = fluid.data(
name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
label_marital = fluid.data(
name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
if is_infer:
self._infer_data_var = [input_data, label_income, label_marital]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self._data_var.extend([input_data, label_income, label_marital])
bottom_output = fluid.layers.fc(input=input_data,
size=bottom_size,
act='relu',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='bottom_output')
bottom_output = fluid.layers.fc(
input=input_data,
size=bottom_size,
act='relu',
bias_attr=fluid.ParamAttr(learning_rate=1.0),
name='bottom_output')
# Build tower layer from bottom layer
output_layers = []
......@@ -59,26 +70,34 @@ class Model(ModelBase):
name='output_layer_' + str(index))
output_layers.append(output_layer)
pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income,
label=fluid.layers.cast(x=label_income_1,
dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
label=fluid.layers.cast(x=label_marital_1,
dtype='int64'))
pred_income = fluid.layers.clip(
output_layers[0], min=1e-15, max=1.0 - 1e-15)
pred_marital = fluid.layers.clip(
output_layers[1], min=1e-15, max=1.0 - 1e-15)
label_income_1 = fluid.layers.slice(
label_income, axes=[1], starts=[1], ends=[2])
label_marital_1 = fluid.layers.slice(
label_marital, axes=[1], starts=[1], ends=[2])
auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
input=pred_income,
label=fluid.layers.cast(
x=label_income_1, dtype='int64'))
auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
input=pred_marital,
label=fluid.layers.cast(
x=label_marital_1, dtype='int64'))
if is_infer:
self._infer_results["AUC_income"] = auc_income
self._infer_results["AUC_marital"] = auc_marital
return
cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)
cost_income = fluid.layers.cross_entropy(
input=pred_income, label=label_income, soft_label=True)
cost_marital = fluid.layers.cross_entropy(
input=pred_marital, label=label_marital, soft_label=True)
cost = fluid.layers.elementwise_add(cost_income, cost_marital, axis=1)
avg_cost = fluid.layers.mean(x=cost)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -22,9 +22,10 @@ train:
reader:
batch_size: 2
class: "{workspace}/criteo_reader.py"
train_data_path: "{workspace}/data/train"
train_data_path: "{workspace}/data/slot_train"
feat_dict_name: "{workspace}/data/vocab"
sparse_slots: "label C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26"
dense_slots: "I1:1 I2:1 I3:1 I4:1 I5:1 I6:1 I7:1 I8:1 I9:1 I10:1 I11:1 I12:1 I13:1"
model:
models: "{workspace}/model.py"
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import io
......
......@@ -11,21 +11,32 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import math
import sys
import yaml
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
import math
import os
try:
import cPickle as pickle
except ImportError:
import pickle
from collections import Counter
import os
import paddle.fluid.incubate.data_generator as dg
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
if os.path.isfile(config):
with open(config, 'r') as rb:
_config = yaml.load(rb.read(), Loader=yaml.FullLoader)
else:
raise ValueError("reader config only support yaml")
class TrainReader(Reader):
def init(self):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [
......@@ -48,7 +59,7 @@ class TrainReader(Reader):
self.cat_feat_idx_dict_list = [{} for _ in range(26)]
# TODO: set vocabulary dictionary
vocab_dir = envs.get_global_env("feat_dict_name", None, "train.reader")
vocab_dir = "./vocab/"
for i in range(26):
lookup_idx = 1 # remain 0 for default value
for line in open(
......@@ -72,11 +83,11 @@ class TrainReader(Reader):
if idx == 2 else math.log(1 + float(features[idx])))
for idx in self.cat_idx_:
if features[idx] == '' or features[
idx] not in self.cat_feat_idx_dict_list[idx - 14]:
idx] not in self.cat_feat_idx_dict_list[idx - 14]:
label_feat_list[idx].append(0)
else:
label_feat_list[idx].append(self.cat_feat_idx_dict_list[
idx - 14][features[idx]])
idx - 14][features[idx]])
label_feat_list[0].append(int(features[0]))
return label_feat_list
......@@ -87,6 +98,18 @@ class TrainReader(Reader):
def data_iter():
label_feat_list = self._process_line(line)
yield list(zip(self.label_feat_names, label_feat_list))
s = ""
for i in list(zip(self.label_feat_names, label_feat_list)):
k = i[0]
v = i[1]
for j in v:
s += " " + k + ":" + str(j)
print s.strip()
yield None
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, absolute_import, division
import os
......
python download.py
python preprocess.py
mkdir slot_train
for i in `ls ./train`
do
cat train/$i | python get_slot_data.py > slot_train/$i
done
mkdir slot_test_valid
for i in `ls ./test_valid`
do
cat test_valid/$i | python get_slot_data.py > slot_test_valid/$i
done
......@@ -25,12 +25,23 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def init_network(self):
self.cross_num = envs.get_global_env("hyper_parameters.cross_num", None, self._namespace)
self.dnn_hidden_units = envs.get_global_env("hyper_parameters.dnn_hidden_units", None, self._namespace)
self.l2_reg_cross = envs.get_global_env("hyper_parameters.l2_reg_cross", None, self._namespace)
self.dnn_use_bn = envs.get_global_env("hyper_parameters.dnn_use_bn", None, self._namespace)
self.clip_by_norm = envs.get_global_env("hyper_parameters.clip_by_norm", None, self._namespace)
cat_feat_num = envs.get_global_env("hyper_parameters.cat_feat_num", None, self._namespace)
self.cross_num = envs.get_global_env("hyper_parameters.cross_num",
None, self._namespace)
self.dnn_hidden_units = envs.get_global_env(
"hyper_parameters.dnn_hidden_units", None, self._namespace)
self.l2_reg_cross = envs.get_global_env(
"hyper_parameters.l2_reg_cross", None, self._namespace)
self.dnn_use_bn = envs.get_global_env("hyper_parameters.dnn_use_bn",
None, self._namespace)
self.clip_by_norm = envs.get_global_env(
"hyper_parameters.clip_by_norm", None, self._namespace)
cat_feat_num = envs.get_global_env("hyper_parameters.cat_feat_num",
None, self._namespace)
self.sparse_inputs = self._sparse_data_var[1:]
self.dense_inputs = self._dense_data_var
self.target_input = self._sparse_data_var[0]
cat_feat_dims_dict = OrderedDict()
for line in open(cat_feat_num):
spls = line.strip().split()
......@@ -38,10 +49,11 @@ class Model(ModelBase):
cat_feat_dims_dict[spls[0]] = int(spls[1])
self.cat_feat_dims_dict = cat_feat_dims_dict if cat_feat_dims_dict else OrderedDict(
)
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", None, self._namespace)
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
None, self._namespace)
self.dense_feat_names = ['I' + str(i) for i in range(1, 14)]
self.sparse_feat_names = ['C' + str(i) for i in range(1, 27)]
self.dense_feat_names = [i.name for i in self.dense_inputs]
self.sparse_feat_names = [i.name for i in self.sparse_inputs]
# {feat_name: dims}
self.feat_dims_dict = OrderedDict(
......@@ -51,21 +63,20 @@ class Model(ModelBase):
self.net_input = None
self.loss = None
def _create_embedding_input(self, data_dict):
def _create_embedding_input(self):
# sparse embedding
sparse_emb_dict = OrderedDict((name, fluid.embedding(
input=fluid.layers.cast(
data_dict[name], dtype='int64'),
size=[
self.feat_dims_dict[name] + 1,
6 * int(pow(self.feat_dims_dict[name], 0.25))
],
is_sparse=self.is_sparse)) for name in self.sparse_feat_names)
sparse_emb_dict = OrderedDict()
for var in self.sparse_inputs:
sparse_emb_dict[var.name] = fluid.embedding(
input=var,
size=[
self.feat_dims_dict[var.name] + 1,
6 * int(pow(self.feat_dims_dict[var.name], 0.25))
],
is_sparse=self.is_sparse)
# combine dense and sparse_emb
dense_input_list = [
data_dict[name] for name in data_dict if name.startswith('I')
]
dense_input_list = self.dense_inputs
sparse_emb_list = list(sparse_emb_dict.values())
sparse_input = fluid.layers.concat(sparse_emb_list, axis=-1)
......@@ -111,17 +122,13 @@ class Model(ModelBase):
return fluid.layers.reduce_sum(fluid.layers.square(w))
def train_net(self):
self.model._init_slots()
self.init_network()
self.target_input = fluid.data(
name='label', shape=[None, 1], dtype='float32')
data_dict = OrderedDict()
for feat_name in self.feat_dims_dict:
data_dict[feat_name] = fluid.data(
name=feat_name, shape=[None, 1], dtype='float32')
self.net_input = self._create_embedding_input(data_dict)
self.net_input = self._create_embedding_input()
deep_out = self._deep_net(self.net_input, self.dnn_hidden_units, self.dnn_use_bn, False)
deep_out = self._deep_net(self.net_input, self.dnn_hidden_units,
self.dnn_use_bn, False)
cross_out, l2_reg_cross_loss = self._cross_net(self.net_input,
self.cross_num)
......@@ -130,9 +137,6 @@ class Model(ModelBase):
logit = fluid.layers.fc(last_out, 1)
self.prob = fluid.layers.sigmoid(logit)
self._data_var = [self.target_input] + [
data_dict[dense_name] for dense_name in self.dense_feat_names
] + [data_dict[sparse_name] for sparse_name in self.sparse_feat_names]
# auc
prob_2d = fluid.layers.concat([1 - self.prob, self.prob], 1)
......@@ -143,7 +147,9 @@ class Model(ModelBase):
self._metrics["BATCH_AUC"] = batch_auc_var
# logloss
logloss = fluid.layers.log_loss(self.prob, self.target_input)
logloss = fluid.layers.log_loss(
self.prob, fluid.layers.cast(
self.target_input, dtype='float32'))
self.avg_logloss = fluid.layers.reduce_mean(logloss)
# reg_coeff * l2_reg_cross
......@@ -152,9 +158,11 @@ class Model(ModelBase):
self._cost = self.loss
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
def infer_net(self, parameter_list):
self.model._init_slots()
self.deepfm_net()
......@@ -22,9 +22,10 @@ train:
reader:
batch_size: 2
class: "{workspace}/criteo_reader.py"
train_data_path: "{workspace}/data/train_data"
feat_dict_name: "{workspace}/data/aid_data/feat_dict_10.pkl2"
train_data_path: "{workspace}/data/slot_train_data"
feat_dict_name: "{workspace}/data/feat_dict_10.pkl2"
sparse_slots: "label feat_idx"
dense_slots: "feat_value:39"
model:
models: "{workspace}/model.py"
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import sys
......
......@@ -12,18 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import yaml
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
import cPickle as pickle
except ImportError:
import pickle
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
if os.path.isfile(config):
with open(config, 'r') as rb:
_config = yaml.load(rb.read(), Loader=yaml.FullLoader)
else:
raise ValueError("reader config only support yaml")
class TrainReader(Reader):
def init(self):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [
......@@ -37,7 +44,7 @@ class TrainReader(Reader):
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
# load preprocessed feature dict
self.feat_dict_name = envs.get_global_env("feat_dict_name", None, "train.reader")
self.feat_dict_name = "aid_data/feat_dict_10.pkl2"
self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
def _process_line(self, line):
......@@ -70,6 +77,19 @@ class TrainReader(Reader):
def data_iter():
feat_idx, feat_value, label = self._process_line(line)
yield [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]
s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
s += " " + k + ":" + str(j)
print s.strip()
yield None
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy
from collections import Counter
......
python download_preprocess.py
mkdir slot_train_data
for i in `ls ./train_data`
do
cat train_data/$i | python get_slot_data.py > slot_train_data/$i
done
mkdir slot_test_data
for i in `ls ./test_data`
do
cat test_data/$i | python get_slot_data.py > slot_test_data/$i
done
......@@ -27,31 +27,26 @@ class Model(ModelBase):
def deepfm_net(self):
init_value_ = 0.1
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
# ------------------------- network input --------------------------
num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace)
raw_feat_idx = fluid.data(name='feat_idx', shape=[None, num_field],
dtype='int64') # None * num_field(defalut:39)
raw_feat_value = fluid.data(name='feat_value', shape=[None, num_field], dtype='float32') # None * num_field
self.label = fluid.data(name='label', shape=[None, 1], dtype='float32') # None * 1
feat_idx = fluid.layers.reshape(raw_feat_idx, [-1, 1]) # (None * num_field) * 1
feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
num_field = envs.get_global_env("hyper_parameters.num_field", None,
self._namespace)
# ------------------------- set _data_var --------------------------
raw_feat_idx = self._sparse_data_var[1]
raw_feat_value = self._dense_data_var[0]
self.label = self._sparse_data_var[0]
self._data_var.append(raw_feat_idx)
self._data_var.append(raw_feat_value)
self._data_var.append(self.label)
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
feat_idx = raw_feat_idx
feat_value = fluid.layers.reshape(
raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
# ------------------------- first order term --------------------------
reg = envs.get_global_env("hyper_parameters.reg", 1e-4, self._namespace)
reg = envs.get_global_env("hyper_parameters.reg", 1e-4,
self._namespace)
first_weights_re = fluid.embedding(
input=feat_idx,
is_sparse=True,
......@@ -65,7 +60,8 @@ class Model(ModelBase):
regularizer=fluid.regularizer.L1DecayRegularizer(reg)))
first_weights = fluid.layers.reshape(
first_weights_re, shape=[-1, num_field, 1]) # None * num_field * 1
y_first_order = fluid.layers.reduce_sum((first_weights * feat_value), 1)
y_first_order = fluid.layers.reduce_sum((first_weights * feat_value),
1)
# ------------------------- second order term --------------------------
......@@ -78,7 +74,8 @@ class Model(ModelBase):
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormalInitializer(
loc=0.0, scale=init_value_ / math.sqrt(float(sparse_feature_dim)))))
loc=0.0,
scale=init_value_ / math.sqrt(float(sparse_feature_dim)))))
feat_embeddings = fluid.layers.reshape(
feat_embeddings_re,
shape=[-1, num_field,
......@@ -86,8 +83,8 @@ class Model(ModelBase):
feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size
# sum_square part
summed_features_emb = fluid.layers.reduce_sum(feat_embeddings,
1) # None * embedding_size
summed_features_emb = fluid.layers.reduce_sum(
feat_embeddings, 1) # None * embedding_size
summed_features_emb_square = fluid.layers.square(
summed_features_emb) # None * embedding_size
......@@ -98,13 +95,16 @@ class Model(ModelBase):
squared_features_emb, 1) # None * embedding_size
y_second_order = 0.5 * fluid.layers.reduce_sum(
summed_features_emb_square - squared_sum_features_emb, 1,
summed_features_emb_square - squared_sum_features_emb,
1,
keep_dim=True) # None * 1
# ------------------------- DNN --------------------------
layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
act = envs.get_global_env("hyper_parameters.act", None, self._namespace)
layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None,
self._namespace)
act = envs.get_global_env("hyper_parameters.act", None,
self._namespace)
y_dnn = fluid.layers.reshape(feat_embeddings,
[-1, num_field * sparse_feature_dim])
for s in layer_sizes:
......@@ -131,14 +131,17 @@ class Model(ModelBase):
# ------------------------- DeepFM --------------------------
self.predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn)
self.predict = fluid.layers.sigmoid(y_first_order + y_second_order +
y_dnn)
def train_net(self):
self.model._init_slots()
self.deepfm_net()
# ------------------------- Cost(logloss) --------------------------
cost = fluid.layers.log_loss(input=self.predict, label=self.label)
cost = fluid.layers.log_loss(
input=self.predict, label=fluid.layers.cast(self.label, "float32"))
avg_cost = fluid.layers.reduce_sum(cost)
self._cost = avg_cost
......@@ -154,9 +157,11 @@ class Model(ModelBase):
self._metrics["BATCH_AUC"] = batch_auc_var
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
def infer_net(self, parameter_list):
self.model._init_slots()
self.deepfm_net()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import random
import pickle
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import pickle
import pandas as pd
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import random
import pickle
......
......@@ -21,14 +21,14 @@ from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def config_read(self, config_path):
with open(config_path, "r") as fin:
user_count = int(fin.readline().strip())
item_count = int(fin.readline().strip())
cat_count = int(fin.readline().strip())
return user_count, item_count, cat_count
def din_attention(self, hist, target_expand, mask):
"""activation weight"""
......@@ -58,56 +58,66 @@ class Model(ModelBase):
out = fluid.layers.matmul(weight, hist)
out = fluid.layers.reshape(x=out, shape=[0, hidden_size])
return out
def train_net(self):
seq_len = -1
self.item_emb_size = envs.get_global_env("hyper_parameters.item_emb_size", 64, self._namespace)
self.cat_emb_size = envs.get_global_env("hyper_parameters.cat_emb_size", 64, self._namespace)
self.act = envs.get_global_env("hyper_parameters.act", "sigmoid", self._namespace)
self.item_emb_size = envs.get_global_env(
"hyper_parameters.item_emb_size", 64, self._namespace)
self.cat_emb_size = envs.get_global_env(
"hyper_parameters.cat_emb_size", 64, self._namespace)
self.act = envs.get_global_env("hyper_parameters.act", "sigmoid",
self._namespace)
#item_emb_size = 64
#cat_emb_size = 64
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", False, self._namespace)
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
False, self._namespace)
#significant for speeding up the training process
self.config_path = envs.get_global_env("hyper_parameters.config_path", "data/config.txt", self._namespace)
self.use_DataLoader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace)
self.config_path = envs.get_global_env(
"hyper_parameters.config_path", "data/config.txt", self._namespace)
self.use_DataLoader = envs.get_global_env(
"hyper_parameters.use_DataLoader", False, self._namespace)
user_count, item_count, cat_count = self.config_read(self.config_path)
item_emb_attr = fluid.ParamAttr(name="item_emb")
cat_emb_attr = fluid.ParamAttr(name="cat_emb")
hist_item_seq = fluid.data(
name="hist_item_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(hist_item_seq)
hist_cat_seq = fluid.data(
name="hist_cat_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(hist_cat_seq)
target_item = fluid.data(name="target_item", shape=[None], dtype="int64")
target_item = fluid.data(
name="target_item", shape=[None], dtype="int64")
self._data_var.append(target_item)
target_cat = fluid.data(name="target_cat", shape=[None], dtype="int64")
self._data_var.append(target_cat)
label = fluid.data(name="label", shape=[None, 1], dtype="float32")
self._data_var.append(label)
mask = fluid.data(name="mask", shape=[None, seq_len, 1], dtype="float32")
mask = fluid.data(
name="mask", shape=[None, seq_len, 1], dtype="float32")
self._data_var.append(mask)
target_item_seq = fluid.data(
name="target_item_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(target_item_seq)
target_cat_seq = fluid.data(
name="target_cat_seq", shape=[None, seq_len], dtype="int64")
self._data_var.append(target_cat_seq)
if self.use_DataLoader:
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=10000, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=10000,
use_double_buffer=False,
iterable=False)
hist_item_emb = fluid.embedding(
input=hist_item_seq,
size=[item_count, self.item_emb_size],
......@@ -149,7 +159,8 @@ class Model(ModelBase):
size=[item_count, 1],
param_attr=fluid.initializer.Constant(value=0.0))
hist_seq_concat = fluid.layers.concat([hist_item_emb, hist_cat_emb], axis=2)
hist_seq_concat = fluid.layers.concat(
[hist_item_emb, hist_cat_emb], axis=2)
target_seq_concat = fluid.layers.concat(
[target_item_seq_emb, target_cat_seq_emb], axis=2)
target_concat = fluid.layers.concat(
......@@ -157,21 +168,22 @@ class Model(ModelBase):
out = self.din_attention(hist_seq_concat, target_seq_concat, mask)
out_fc = fluid.layers.fc(name="out_fc",
input=out,
size=self.item_emb_size + self.cat_emb_size,
num_flatten_dims=1)
input=out,
size=self.item_emb_size + self.cat_emb_size,
num_flatten_dims=1)
embedding_concat = fluid.layers.concat([out_fc, target_concat], axis=1)
fc1 = fluid.layers.fc(name="fc1",
input=embedding_concat,
size=80,
act=self.act)
input=embedding_concat,
size=80,
act=self.act)
fc2 = fluid.layers.fc(name="fc2", input=fc1, size=40, act=self.act)
fc3 = fluid.layers.fc(name="fc3", input=fc2, size=1)
logit = fc3 + item_b
loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logit, label=label)
loss = fluid.layers.sigmoid_cross_entropy_with_logits(
x=logit, label=label)
avg_loss = fluid.layers.mean(loss)
self._cost = avg_loss
......@@ -179,14 +191,14 @@ class Model(ModelBase):
predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
label_int = fluid.layers.cast(label, 'int64')
auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d,
label=label_int,
slide_steps=0)
label=label_int,
slide_steps=0)
self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc_var
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
......
......@@ -29,13 +29,15 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.train_data_path = envs.get_global_env("train_data_path", None, "train.reader")
self.train_data_path = envs.get_global_env("train_data_path", None,
"train.reader")
self.res = []
self.max_len = 0
data_file_list = os.listdir(self.train_data_path)
for i in range(0, len(data_file_list)):
train_data_file = os.path.join(self.train_data_path, data_file_list[i])
train_data_file = os.path.join(self.train_data_path,
data_file_list[i])
with open(train_data_file, "r") as fin:
for line in fin:
line = line.strip().split(';')
......@@ -78,11 +80,13 @@ class TrainReader(Reader):
len_array = [len(x[0]) for x in b]
mask = np.array(
[[0] * x + [-1e9] * (max_len - x) for x in len_array]).reshape(
[-1, max_len, 1])
[-1, max_len, 1])
target_item_seq = np.array(
[[x[2]] * max_len for x in b]).astype("int64").reshape([-1, max_len])
[[x[2]] * max_len for x in b]).astype("int64").reshape(
[-1, max_len])
target_cat_seq = np.array(
[[x[3]] * max_len for x in b]).astype("int64").reshape([-1, max_len])
[[x[3]] * max_len for x in b]).astype("int64").reshape(
[-1, max_len])
res = []
for i in range(len(b)):
res.append([
......@@ -127,4 +131,5 @@ class TrainReader(Reader):
def generate_batch_from_trainfiles(self, files):
data_set = self.base_read(files)
random.shuffle(data_set)
return self.batch_reader(data_set, self.batch_size, self.batch_size * 20)
return self.batch_reader(data_set, self.batch_size,
self.batch_size * 20)
......@@ -23,9 +23,10 @@ train:
reader:
batch_size: 2
class: "{workspace}/../criteo_reader.py"
train_data_path: "{workspace}/data/train"
train_data_path: "{workspace}/data/slot_train_data"
reader_debug_mode: False
sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
dense_slots: "dense_var:13"
model:
models: "{workspace}/model.py"
......
wget --no-check-certificate https://fleet.bj.bcebos.com/ctr_data.tar.gz
tar -zxvf ctr_data.tar.gz
mv ./raw_data ./train_data_full
mkdir train_data && cd train_data
cp ../train_data_full/part-0 ../train_data_full/part-1 ./ && cd ..
mv ./test_data ./test_data_full
mkdir test_data && cd test_data
cp ../test_data_full/part-220 ./ && cd ..
echo "Complete data download."
echo "Full Train data stored in ./train_data_full "
echo "Full Test data stored in ./test_data_full "
echo "Rapid Verification train data stored in ./train_data "
echo "Rapid Verification test data stored in ./test_data "
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,20 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.fluid.incubate.data_generator as dg
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
cont_max_ = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
cont_diff_ = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
hash_dim_ = 1000001
continuous_range_ = range(1, 14)
categorical_range_ = range(14, 40)
class TrainReader(Reader):
def init(self):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
self.cont_diff_ = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
self.hash_dim_ = envs.get_global_env("hyper_parameters.sparse_feature_number", None, "train.model")
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
class CriteoDataset(dg.MultiSlotDataGenerator):
"""
DacDataset: inheritance MultiSlotDataGeneratior, Implement data reading
Help document: http://wiki.baidu.com/pages/viewpage.action?pageId=728820675
"""
def generate_sample(self, line):
"""
......@@ -37,25 +38,34 @@ class TrainReader(Reader):
This function needs to be implemented by the user, based on data format
"""
features = line.rstrip('\n').split('\t')
dense_feature = []
sparse_feature = []
for idx in self.continuous_range_:
for idx in continuous_range_:
if features[idx] == "":
dense_feature.append(0.0)
else:
dense_feature.append(
(float(features[idx]) - self.cont_min_[idx - 1]) /
self.cont_diff_[idx - 1])
for idx in self.categorical_range_:
(float(features[idx]) - cont_min_[idx - 1]) /
cont_diff_[idx - 1])
for idx in categorical_range_:
sparse_feature.append(
[hash(str(idx) + features[idx]) % self.hash_dim_])
[hash(str(idx) + features[idx]) % hash_dim_])
label = [int(features[0])]
feature_name = ["D"]
for idx in self.categorical_range_:
feature_name.append("S" + str(idx - 13))
process_line = dense_feature, sparse_feature, label
feature_name = ["dense_feature"]
for idx in categorical_range_:
feature_name.append("C" + str(idx - 13))
feature_name.append("label")
yield zip(feature_name, [dense_feature] + sparse_feature + [label])
s = "click:" + str(label[0])
for i in dense_feature:
s += " dense_feature:" + str(i)
for i in range(1, 1 + len(categorical_range_)):
s += " " + str(i) + ":" + str(sparse_feature[i - 1][0])
print s.strip()
yield None
return reader
d = CriteoDataset()
d.run_from_stdin()
sh download.sh
mkdir slot_train_data_full
for i in `ls ./train_data_full`
do
cat train_data_full/$i | python get_slot_data.py > slot_train_data_full/$i
done
mkdir slot_test_data_full
for i in `ls ./test_data_full`
do
cat test_data_full/$i | python get_slot_data.py > slot_test_data_full/$i
done
mkdir slot_train_data
for i in `ls ./train_data`
do
cat train_data/$i | python get_slot_data.py > slot_train_data/$i
done
mkdir slot_test_data
for i in `ls ./test_data`
do
cat test_data/$i | python get_slot_data.py > slot_test_data/$i
done
0 1 1 26 30 0 4 2 37 152 1 2 2 05db9164 38d50e09 ed5e4936 612ccfd4 25c83c98 38eb9cf4 1f89b562 a73ee510 2462946f 7f8ffe57 1d5d5b6e 46f42a63 b28479f6 7501d6be 6083e1d5 07c540c4 f855e3f0 21ddcdc9 5840adea 782e846e 32c7478e b2f178a3 001f3601 c4304c4b
0 20 3 4 40479 444 0 1 157 0 4 68fd1e64 09e68b86 aa8c1539 85dd697c 25c83c98 fe6b92e5 e56a4862 5b392875 a73ee510 3b08e48b 5e183c58 d8c29807 1eb0f8f0 8ceecbc8 d2f03b75 c64d548f 07c540c4 63cdbb21 cf99e5de 5840adea 5f957280 55dd3565 1793a828 e8b83407 b7d9c3bc
0 6 70 1 22 312 25 52 44 144 1 3 1 22 05db9164 04e09220 b1ecc6c4 5dff9b29 4cf72387 7e0ccccf d5f62b87 1f89b562 a73ee510 ce92c282 434d6c13 2436ff75 7301027a 07d13a8f f6b23a53 f4ead43c 3486227d 6fc84bfb 4f1aa25f c9d4222a 55dd3565 ded4aac9
0 0 0 110 7 3251 44 1 32 39 0 1 31 05db9164 80e26c9b ba1947d0 85dd697c 25c83c98 85f287b3 0b153874 a73ee510 89270478 7c53dc69 34a238e0 4fd35e8f 1adce6ef 0f942372 da441c7e d4bb7bd8 005c6740 21ddcdc9 5840adea 8717ea07 423fab69 1793a828 e8b83407 9904c656
0 0 29 19490 0 68fd1e64 287130e0 ba4559ea 33a72095 25c83c98 fbad5c96 ffdbd799 5b392875 a73ee510 60badee3 c72ca7a4 ebfb225c b9be5035 cfef1c29 655fad18 a9dcda12 d4bb7bd8 891589e7 419b4cef 5840adea 76ef8858 32c7478e 135c8b41 ea9a246c e3a60438
0 2 2 20 43197 0 26 23 0 25 05db9164 9b6b8959 9c1c85e7 fd4d6dc3 25c83c98 7e0ccccf d2d741ca 0b153874 a73ee510 4e2d1b78 ea4adb47 cc239583 05781932 64c94865 de781d57 efd92064 e5ba7672 cac48684 4b0ac19f c9d4222a 3a171ecb 22dd4e42
1 5 12 871 0 27 1 21 1 4 0 05db9164 e112a9de 29bb7bea d3e15e1a 25c83c98 7e0ccccf fd3483f3 0b153874 a73ee510 880e2781 9d7e66c3 bd5829ab df957573 07d13a8f 290e3042 390b7737 8efede7f 808e7bc3 af16dda0 ad3062eb 423fab69 a0ab2ce0
0 6 263 41 53 0 44 42 0 42 05db9164 71ca0a25 ad876a43 0481f0ba 4cf72387 7e0ccccf bb0f47fb 5b392875 a73ee510 3b08e48b da3f45ff fde18531 a9fda8f5 07d13a8f a8e0f0c6 06f4ae56 776ce399 9bf8ffef 21ddcdc9 5840adea f5f07930 be7c41b4 62aa24c6 001f3601 1d5d3a57
0 14 4301 48 2 3 51 2 68fd1e64 95e2d337 95c48c52 30b862e7 25c83c98 7e0ccccf b06857f8 0b153874 a73ee510 8228dde1 e4eb05d4 f0d5cc59 a4c5d6dd 1adce6ef 559cd202 e9194f3c 07c540c4 7b06fafe 21ddcdc9 a458ea53 cb105f80 423fab69 16bb3de8 2bf691b1 d3b2f8c3
0 58 42 39 100 0 40 40 0 40 05db9164 207b2d81 25c37040 e8b2aee5 25c83c98 fe6b92e5 6e6e841b 1f89b562 a73ee510 3b08e48b dcc0e16b a04dc78a b093e98d b28479f6 c6438ddb 31da84fc 776ce399 fa0643ee 21ddcdc9 b1252a9d 931d653d c9d4222a be7c41b4 46f5e7df 001f3601 0e25d9c4
0 0 2 3 46065 0 5 9 0 0 3 68fd1e64 b961056b 05eefcc3 65e58ae6 25c83c98 fbad5c96 68fbb662 0b153874 7cc72ec2 4aead435 922bbb91 10239ea6 ad61640d 1adce6ef 8187184a 551eb463 e5ba7672 5a6878f5 00018438 32c7478e 71292dbb
1 1 0 224 0 4 0 3 4 27 1 2 0 05db9164 09e68b86 aa8c1539 85dd697c 25c83c98 7e0ccccf a4a8fd5a 0b153874 a73ee510 43a9b300 d13e1160 d8c29807 45820f61 b28479f6 2d49999f c64d548f e5ba7672 63cdbb21 cf99e5de 5840adea 5f957280 bcdee96c 1793a828 e8b83407 b7d9c3bc
1 10 310 6 5 3 75 4 702 2 21 3 68fd1e64 3f0d3f28 4cf72387 7e0ccccf a097ff18 062b5529 a73ee510 ae07e31d 3407cf7b f0fe287d 1adce6ef 14108df6 27c07bd6 88416823 ad3062eb 3a171ecb
0 0 0 19 2898 145 4 20 370 0 2 43 05db9164 38a947a1 0797f900 1da94763 25c83c98 fbad5c96 ba0ca6c5 64523cfa a73ee510 56ae5fb0 7ca01a9d c8ea9acc 97d749c9 1adce6ef a3dc522e d1079e54 e5ba7672 492bb129 828187a0 32c7478e 171ccf3e
0 16 4 2 46248 0 2 49 0 2 05db9164 942f9a8d feafff7d d7b693da 25c83c98 7e0ccccf d9aa9d97 5b392875 7cc72ec2 3b08e48b c4adf918 4ebd8ffe 85dbe138 b28479f6 ac182643 48292aa0 776ce399 1f868fdd 21ddcdc9 b1252a9d be7cac53 32c7478e e3edc57b 9d93af03 7dfad416
1 66 136 11 12 15 12 963 26 258 3 73 0 12 05db9164 89ddfee8 c314b537 e88cbfb4 4cf72387 7e0ccccf 1c86e0eb 0b153874 a73ee510 e9c971a2 755e4a50 bc8b54c7 5978055e b28479f6 25753fb1 fadc3903 e5ba7672 5bb2ec8e 5b1d6ed9 b1252a9d 8a903c79 32c7478e 7cb5b4d7 e8b83407 ec01bf7b
0 1 34 29 2 10 2 1 2 2 1 1 2 05db9164 b80912da 7b467545 d0cbe447 0942e0a7 fbad5c96 fc8f52a9 0b153874 a73ee510 3b08e48b ad39ba86 dd94da95 751c7a99 b28479f6 79fcb5cb 169d489d e5ba7672 7119e567 3014a4b1 5840adea 23fcd679 3a171ecb de1e9c76 e8b83407 ce0bf6fc
1 1 1 22 1 138 7 16 22 114 1 8 0 7 7e5c2ff4 287130e0 67fa93b5 1fa34039 43b19349 13718bbd f828f7fb 0b153874 a73ee510 b883655e ab066900 2eb927aa 5d4198ed 07d13a8f 10040656 6f930046 e5ba7672 891589e7 21ddcdc9 5840adea fce0d6a4 3a171ecb 1793a828 e8b83407 63093459
0 2 8 4 4 2 4 8 6 55 2 5 0 4 05db9164 b80912da 02391f51 b9c629a9 b0530c50 7e0ccccf fd10f30e 0b153874 a73ee510 bfc44ba9 e3ee9d2e 2397259a 0d60a93e 07d13a8f ee76936d d37efe8c e5ba7672 30f25b5e 21ddcdc9 5840adea b6119319 423fab69 45ab94c8 ce62e669 b13f4ade
1 15 2 88 27 4 1 21 49 124 1 3 1 5a9ed9b0 4f25e98b aee80afd ae78390d 25c83c98 fbad5c96 f00bddf8 6c41e35e a73ee510 16a81a6c 55795b33 12d1b214 39795005 1adce6ef fb2772ea 121f992b e5ba7672 bc5a0ff7 dfc341f8 a458ea53 b4847d32 32c7478e e7bc1058 001f3601 6b208992
1 0 8 14551 26 2 0 22 2 0 87552397 80e26c9b 431913c5 85dd697c 25c83c98 fbad5c96 b46e01f1 0b153874 a73ee510 39cda501 7c53dc69 5798519c 4fd35e8f 07d13a8f e8f4b767 2d0bbe92 3486227d 005c6740 21ddcdc9 5840adea 91404954 3a171ecb 1793a828 e8b83407 b9809574
0 0 12 9 4430 21 2 11 11 1 9 05db9164 333137d9 22fbf56a b92573a3 25c83c98 fe6b92e5 ad9b2639 0b153874 a73ee510 9c4dd39e e4034ebf 878d3428 ea089f5d b28479f6 a46bf7c6 7401a802 07c540c4 c61e82d7 21ddcdc9 a458ea53 634363f7 c9d4222a 32c7478e a2752662 445bbe3b fc1f43e7
0 21 904 7 30 79 39 87 20 251 2 8 0 39 05db9164 f0cf0024 20009f96 73fec7fb 4cf72387 fbad5c96 a98972ab 0b153874 a73ee510 06363d2d a523f48a 57c08194 5cc21877 b28479f6 fdb1071f 054b386f 3486227d cc693e93 21ddcdc9 b1252a9d 0dd41d11 c9d4222a 32c7478e f9f7eb22 f0f449dd a3a8e8f4
1 1 1 9 35 5 6 17 10 912 1 9 6 05db9164 09e68b86 21f56260 7cc584ad 89ff5705 fbad5c96 69b885a7 5b392875 a73ee510 b6900243 208d9dd6 252752f5 59dd51b4 07d13a8f 36721ddc e20cfabe e5ba7672 5aed7436 db0b20dc b1252a9d 3572f92c 423fab69 869261fd f0f449dd fb52e815
0 0 3 47 10 1494 153 6 11 269 0 4 10 5a9ed9b0 39dfaa0d 86d9f7e6 77b5e5ed b2241560 7e0ccccf afa309bd 0b153874 a73ee510 c54560e0 77212bd7 04d776a9 7203f04e 07d13a8f 60fa10e5 465ae0d6 e5ba7672 df4fffb7 21ddcdc9 5840adea 8b9756be c9d4222a c7dc6720 c88bdcee 010f6491 4e7af834
0 1 0 44 24 4 24 6 43 232 1 4 24 05db9164 c44e8a72 93655629 1b9f91ce 25c83c98 fbad5c96 a25cceac 67b76963 a73ee510 0b16773a 5bee5497 f0f6a9c1 a57cffd3 1adce6ef d6c04afa 6dc8c52c e5ba7672 456d734d 05e4794e a458ea53 dc1b605a bcdee96c 79fc7b8a 724b04da 0cc1543a
1 18 0 37 20 28 20 18 19 20 1 1 0 20 05db9164 ad61f1c8 b64ac9a3 1df4d824 25c83c98 7e0ccccf ac2d4799 0b153874 a73ee510 da500e68 434d6c13 71d55d49 7301027a b28479f6 3403e98c ed6d847a e5ba7672 84eb7a34 1d0aeb7a ad3062eb c7dc6720 786a0db5
1 4 88 4 20 14 27 357 31 874 2 41 13 05db9164 0eb070fa e75647d9 50912373 43b19349 7e0ccccf 1c86e0eb 0b153874 a73ee510 e7ba2569 755e4a50 a2337f7c 5978055e 07d13a8f 733cd612 2873175e e5ba7672 7ba9340b b4c77ec9 32c7478e 55cf97a5
0 0 0 20 9 7441 13 4 9 12 0 1 9 05db9164 46320fff de0cea78 66d81227 25c83c98 604312d3 0b153874 a73ee510 3b08e48b 0f6f1a80 51f94b83 9077501d 07d13a8f 4b572351 3ec13e49 e5ba7672 d981a095 21ddcdc9 5840adea b1bb8218 32c7478e 4f272e57 c9f3bea7 25ae1dcc
0 1 1 7 2 331 62 2 5 72 1 2 0 2 05db9164 8947f767 59f8a22b 16e92bee 25c83c98 7e0ccccf b471ac4f 1f89b562 a73ee510 4e56c58e e1ba038b 92352c1e e65a5fc3 07d13a8f 2c14c412 57ac7fda e5ba7672 bd17c3da 4b367914 b1252a9d e68624bc 3a171ecb c77fdeda 010f6491 0a798839
1 25 16 11 11545 56 1 20 51 1 11 05db9164 8f5b4275 b009d929 c7043c4b 5a3e1872 fbad5c96 e76a087f 0b153874 a73ee510 3b08e48b 50349a3f 3563ab62 370eceb9 1adce6ef a6bf53df b688c8cc d4bb7bd8 65c9624a 21ddcdc9 5840adea 2754aaf1 93bad2c0 3b183c5c e8b83407 adb5d234
0 1 20 3 6 1 2 2 8 8 1 2 2 5a9ed9b0 e5fb1af3 77f9d96e bc87885b 25c83c98 3bf701e7 6772d022 0b153874 a73ee510 9f7517e0 e0c3cae0 4ce8091c e8df3343 1adce6ef 60403b20 8fb0be40 07c540c4 13145934 21ddcdc9 b1252a9d c3f827f4 423fab69 f0f123e9 c243e98b 63ef9236
0 -1 8 5 11535 32 0 7 13 0 5 5a9ed9b0 f8c8e8f8 74e1a23a 9a6888fb 25c83c98 fe6b92e5 93955fc0 1f89b562 a73ee510 7dab1649 5215184e fb8fab62 b8ae7766 07d13a8f d4696a42 c6b1e1b2 07c540c4 d2f0bce2 21ddcdc9 5840adea 99c09e97 3a171ecb 335a6a1e f55c04b6 68a2a837
0 0 1 1 1 3755 124 6 8 198 0 3 1 5a9ed9b0 9819deea 533b1a61 f922efad 25c83c98 fe6b92e5 a4bbd4f4 0b153874 a73ee510 3b76bfa9 8d5ad79c b99ddbc8 4809d853 b28479f6 1150f5ed 87acb535 e5ba7672 7e32f7a4 a4b7004c 93bad2c0 b34f3128
0 0 15 7 1 2652 57 5 40 55 0 1 1 8cf07265 8947f767 37722a24 8802788f 25c83c98 fda1a50f 0b153874 a73ee510 3b08e48b d2b7c44b e3caf087 68637c0d 64c94865 d120f347 42bc62e3 e5ba7672 bd17c3da 21ddcdc9 a458ea53 1891824e 32c7478e b7bf6986 010f6491 a6115607
0 5 176 1 1 627 61 109 17 118 2 11 1 05db9164 38a947a1 1646cf1d fcdc5174 25c83c98 fe6b92e5 6fa3c1a7 1f89b562 a73ee510 5f50c86b b8deab54 c30bbcd1 efbb2435 07d13a8f 927edf61 ffb61047 e5ba7672 e73433e0 122d6055 423fab69 d8e17d82
1 108 20 403 0 1 0 109 0 7 1 2 0 05db9164 942f9a8d 871b4299 25dd4760 4cf72387 7e0ccccf d70c05b1 7b6fecd5 a73ee510 7edea927 c4adf918 2f1be242 85dbe138 1adce6ef ae97ecc3 c637ec94 e5ba7672 1f868fdd 2e30f394 a458ea53 140ec002 ad3062eb bcdee96c b50e18f9 001f3601 f99af3bd
0 15 0 14 10 609 35 29 12 419 1 3 3 10 05db9164 09e68b86 c86b9e6a e4fd0a5b 25c83c98 7e0ccccf a90a99c5 0b153874 a73ee510 e6003298 e9561d8b 906b3727 1cc9ac51 b28479f6 6f73304a a10da4c7 8efede7f 479030a6 7a1c9aad 5840adea c06c3736 32c7478e 41be4766 e8b83407 d8a062c4
0 8 0 10 12 46 12 8 10 12 1 1 12 05db9164 b7ca2abd ee96fc95 68ad052c 25c83c98 7e0ccccf 968a6688 5b392875 a73ee510 e851ff7b f25fe7e9 ce875433 dd183b4c 64c94865 5f2d5a3a 5f92b84a e5ba7672 4771e483 95b757a6 3a171ecb 41be4766
0 0 5 6 2 3021 151 6 10 18 0 1 2 be589b51 207b2d81 d0484442 68637816 25c83c98 7e0ccccf 12c61956 45f7c2dd a73ee510 29e50671 94d2aad8 3b9ae062 f23a3825 07d13a8f 0c67c4ca 3a1a0a65 07c540c4 395856b0 21ddcdc9 a458ea53 1720a38e 32c7478e 4de83b96 001f3601 8f16a3b8
0 4 7954 19 2 6 17 1 68fd1e64 78ccd99e 0a1435c1 bdcfffba 25c83c98 7e0ccccf c4939891 0b153874 a73ee510 fbbf2c95 7d4bba07 5a276398 2fad1153 8ceecbc8 d5adea3d 4da40ea2 07c540c4 e7e991cb 21ddcdc9 5840adea 290c14f6 3a171ecb ded4aac9 2bf691b1 bdf46dce
1 7 89 14 3 2 2 47 31 341 2 10 0 2 05db9164 421b43cd ced9477f 29998ed1 25c83c98 7e0ccccf 6bf83cdb 0b153874 a73ee510 89ff09ee 60adb56e 6aaba33c 53b60829 b28479f6 2d0bb053 b041b04a e5ba7672 2804effd 723b4dfd dbb486d7 b34f3128
1 -1 27180 12 2 0 5 1 05db9164 46b01795 4cf72387 1dcabd2a 0b153874 a73ee510 1d56e466 9cf09d42 f66b043c 1adce6ef c830dc5e 07c540c4 e3a5430f 32c7478e
0 1 1 39 15 119 18 1 18 15 1 1 15 05db9164 4f25e98b 01fefe29 e86b1560 25c83c98 7e0ccccf 0038e65c 0b153874 a73ee510 3b08e48b 7e728ed1 4676ac97 1ddad6aa 1adce6ef 17d9b759 3581aa7f d4bb7bd8 7ef5affa 9437f62f b1252a9d 745c79e6 bcdee96c 3fdb382b 001f3601 49d68486
0 0 2 5 1284 0 23 24 0 5 05db9164 8084ee93 02cf9876 c18be181 0942e0a7 7e0ccccf 0b72a0e8 5b392875 a73ee510 3b08e48b 4950c85b 8fe001f4 1d27b635 b28479f6 16d2748c 36103458 776ce399 003d4f4f e587c466 bcdee96c 3b183c5c
1 0 74 36 4 36375 8 0 4 68fd1e64 0468d672 08266a1d a3fc4871 4cf72387 7e0ccccf 5fd3419b 37e4aa92 a73ee510 972359d0 f69fd509 692521c3 c7176043 b28479f6 234191d3 dc3c41ba d4bb7bd8 9880032b 21ddcdc9 5840adea 10738086 3a171ecb e43a3efc ea9a246c 4e7af834
1 4 5 8 35 1398 64 19 9 703 1 4 59 05db9164 2a69d406 30b6e3ea 13508380 4cf72387 7e0ccccf 579c293b 0b153874 a73ee510 b38bac58 f66047e5 4551eab3 13c89cc4 07d13a8f 3b2d8705 48f5ae81 e5ba7672 642f2610 55dd3565 b1252a9d de95351a c9d4222a 423fab69 45ab94c8 2bf691b1 c84c4aec
0 7 48 41035 3 05db9164 6e638bbc 49a1cd79 cca79e1e 25c83c98 fe6b92e5 8f4478fe 0b153874 a73ee510 8ba6af1c 1cd8b8ae 0acdf55c 86b6351d b28479f6 c11477f0 f541ee61 d4bb7bd8 f6a2fc70 21ddcdc9 b1252a9d 1afb7d8e bcdee96c 75cfed80 445bbe3b e2f05ce0
1 -1 14752 0 2 4 0 5bfa8ab5 38a947a1 e710f9eb ae6e2a08 25c83c98 fe6b92e5 56f361f1 0b153874 a73ee510 3b08e48b 6d91e005 d0649cfd 34098dd6 b28479f6 7160a164 6ffcab68 776ce399 82103027 9487db01 be7c41b4 f57138a8
0 210 6 2 9072 0 2 12 0 2 05db9164 a07503cc 5d260103 13508380 25c83c98 987da766 0b153874 a73ee510 a9271c40 f37be5c0 519590f0 a59ea816 07d13a8f 77660bba 884b33b5 e5ba7672 912c7e21 1d1eb838 b1252a9d 353846c9 c7dc6720 45ab94c8 445bbe3b c84c4aec
0 3 45 6 7 18 6 52 7 177 1 9 0 6 f5796c5b 80e26c9b 6e5bddab d3e92866 25c83c98 7e0ccccf 24e8ca9f 0b153874 a73ee510 5fd7dd92 94a1f0fa bf413137 153f0382 07d13a8f f3635baf af6fc4b8 3486227d f54016b9 21ddcdc9 5840adea a3405885 423fab69 b0fb6a50 e8b83407 61556511
0 0 38 2 3 11664 0 6 3 0 0 0 3 68fd1e64 2c16a946 849cf586 b180f466 25c83c98 7e0ccccf 5547e1f4 0b153874 a73ee510 5db9788f 087dfcfd 48fc0800 5317f239 07d13a8f 18231224 9fbd58f8 e5ba7672 74ef3502 51c0191c 3a171ecb 9117a34a
0 11 6 18 1632 0 19 21 0 19 5a9ed9b0 58e67aaf 381d8ea3 76bbce8c 25c83c98 7e0ccccf 9b7f373a 7b6fecd5 a73ee510 597e2a48 ec2b795a 732c8db2 a5975b1d 07d13a8f 10935a85 03f89a73 1e88c74f c21c3e4c 21ddcdc9 a458ea53 d83181ad c7dc6720 3fdb382b b9266ff0 25bf05c2
0 180 35 1 31780 0 1 1 0 1 8cf07265 421b43cd bc27bcef 29998ed1 f281d2a7 fbad5c96 1d94dd40 0b153874 a73ee510 efea433b ccfdca2f 6aaba33c d76cea6e b28479f6 e1ac77f7 b041b04a d4bb7bd8 2804effd 723b4dfd 32c7478e b34f3128
1 2 4 0 4 0 12 0 49 1 3 0 68fd1e64 38a947a1 cc9e717b 9ca2c15d 25c83c98 d5141a06 5b392875 a73ee510 af94b16c f2a5d7d2 37dfef2b a3b89afc b28479f6 a5118040 1cb7075e e5ba7672 b6b880ec 42dbeba8 32c7478e 88422d4d
1 -1 6223 2 22 0 20 3 68fd1e64 38a947a1 6847b3c1 6cd6e51f 25c83c98 fbad5c96 93ec533b f0298c3c a73ee510 3b08e48b 9ffb3655 eed4a04f a0874a81 1adce6ef 4a591230 d4ca38be e5ba7672 e3c6d69d ba703820 32c7478e c50d808e
1 3 153 3 3 1 0 4 4 13 1 2 0 05db9164 421b43cd 24146df6 29998ed1 25c83c98 7e0ccccf 4aa938fc 5b392875 a73ee510 451bd4e4 2b9c7071 6aaba33c 1aa94af3 b28479f6 e1ac77f7 b041b04a e5ba7672 2804effd 723b4dfd 3a171ecb b34f3128
0 4 45 41 31 5 11 156 32 185 1 25 0 11 68fd1e64 89ddfee8 9732b11b 4c0dcfee 25c83c98 fbad5c96 1c86e0eb 5b392875 a73ee510 e7ba2569 755e4a50 ccb8af7d 5978055e b28479f6 25753fb1 19637c17 e5ba7672 5bb2ec8e ae44ba4c b1252a9d 0db71b18 32c7478e 5c960292 f0f449dd 45b5a9e7
1 1 21 13 12 8 5 8 20 69 1 4 5 05db9164 e3db0bac 9cc6a4f1 9cd2a845 25c83c98 ab1ad103 0b153874 a73ee510 63c8d3d5 859b343f e68fa129 20819d96 07d13a8f 618b0ee5 3004a5f2 e5ba7672 a7ccaded 21ddcdc9 5840adea dc135e3f 8ec974f4 423fab69 08b0ce98 b9266ff0 b29c74dc
0 2 3 14 9 5 9 2 10 9 1 1 9 8c6ba407 09e68b86 b976df14 0b839026 25c83c98 fbad5c96 cc5ed2f1 5b392875 a73ee510 3b08e48b e216a695 ab02884f 9f16a973 b28479f6 52baadf5 5fa439a6 e5ba7672 5aed7436 2aa4575d b1252a9d 32dcf845 32c7478e f8d85724 e8b83407 f643b6c5
0 88 73 41 4420 0 46 47 0 46 05db9164 73a46ff0 c19a1e7a b7802d6b 25c83c98 fe6b92e5 28639f10 0b153874 a73ee510 3b08e48b 3a5bf2d6 0761d1a2 155ff7d9 b28479f6 4f648a87 079f48c0 776ce399 da507f45 21ddcdc9 b1252a9d a1fdd170 c9d4222a 3a171ecb a455dffb ea9a246c aa99435d
0 2644 4 1 26246 0 1 14 0 1 05db9164 80e26c9b 7df8ac19 42cc30a8 25c83c98 fbad5c96 d2d741ca 0b153874 a73ee510 3b08e48b ea4adb47 6cf704b2 05781932 1adce6ef 8ba8b39a dbdb2c16 e5ba7672 f54016b9 21ddcdc9 a458ea53 a92be8d2 c9d4222a 3a171ecb 3037ff6a e8b83407 b112057a
0 139 1 13556 79 1 13 59 1 0 1 68fd1e64 38a947a1 4fc317a6 6a14f9b9 25c83c98 fbad5c96 282b88fc 0b153874 a73ee510 0f1a2599 3e2feacf 9ff86c51 0e5bc979 07d13a8f 46df822a f8b34416 3486227d c9ac134a f3ddd519 32c7478e b34f3128
0 1 13 2 12026 535 8 26 308 3 3 05db9164 90081f33 36e97f3a e96617b3 25c83c98 fbad5c96 7f9907fe 5b392875 a73ee510 a3e2e7a5 a7b606c4 ba5aae2e eae197fd 64c94865 eec7af60 23b497d2 d4bb7bd8 ef981aa1 36a4f6c3 3a171ecb 3e022f4d
1 2 10 14 20 577 142 3 39 42 1 2 26 05db9164 08d6d899 9143c832 f56b7dd5 25c83c98 7e0ccccf dc7659bd 0b153874 a73ee510 efea433b e51ddf94 ae1bb660 3516f6e6 b28479f6 bfef54b3 bad5ee18 e5ba7672 87c6f83c 0429f84b 32c7478e c0d61a5c
1 0 45 6 1584 37 10 28 228 0 6 11 5a9ed9b0 bce95927 b46f1f1d 13508380 25c83c98 fbad5c96 737174dc 0b153874 a73ee510 3b08e48b 3b0a3499 35dfe2c5 c8e4b0c1 07d13a8f fec218c0 9720e154 e5ba7672 04d863d5 b7380686 b1252a9d 2b0e5756 c9d4222a 32c7478e 45ab94c8 e8b83407 c84c4aec
1 0 1214 4 20 2131 159 4 11 580 0 3 0 72 05db9164 4f25e98b 2d1ef417 68a5fcbb 4cf72387 7e0ccccf 5e64ce5f 0b153874 a73ee510 3ccfe0c0 4618e030 975c1c17 025225f2 b28479f6 8ab5b746 6720b72e 27c07bd6 7ef5affa 21ddcdc9 b1252a9d 722d167c 32c7478e 3fdb382b e8b83407 49d68486
0 0 3 4553 49 1 0 0 1 5a9ed9b0 38a947a1 a16966ab 65803e5f 43b19349 fbad5c96 3b16ebba 0b153874 a73ee510 8edcd037 6803595d fc0ad095 2a2faae1 b28479f6 b593a63b fd97a107 d4bb7bd8 1263c077 392cde4b 32c7478e af55e227
0 316 5 234 0 0 0 0 05db9164 38a947a1 3f5a37fe 1032bac8 25c83c98 7e0ccccf 1760a525 37e4aa92 a73ee510 3b08e48b 2d6f299a ce406f01 f0e0f335 b28479f6 77ef1e58 67f512fb 776ce399 b6b880ec c2b62b88 be7c41b4 c86755ff
1 2040 14 54675 0 2 6 0 da4eff0f 09e68b86 5b8662c6 5bad2804 25c83c98 8c28e5b5 6a698541 7cc72ec2 feccf887 ae4c531b 8ee18973 01c2bbc7 b28479f6 52baadf5 d93ba614 e5ba7672 5aed7436 75916440 a458ea53 2554eed2 32c7478e 47577e42 e8b83407 89fa8140
0 0 0 15 6 1512 18 15 10 215 0 6 6 05db9164 09e68b86 aa8c1539 85dd697c 43b19349 7e0ccccf af84702c c8ddd494 a73ee510 fa7d0797 ae19a197 d8c29807 7f0d7407 b28479f6 2d49999f c64d548f e5ba7672 63cdbb21 cf99e5de 5840adea 5f957280 3a171ecb 1793a828 e8b83407 b7d9c3bc
0 39 9 9 3814 82 1 9 82 1 9 68fd1e64 421b43cd 3983c24c 29998ed1 4cf72387 fe6b92e5 dcc1b63d 1f89b562 a73ee510 d04aae7d 731cd88c 6aaba33c 34d253f7 b28479f6 2d0bb053 b041b04a d4bb7bd8 2804effd 723b4dfd 3a171ecb b34f3128
0 0 32 13 35317 0 15 30 0 13 5a9ed9b0 09e68b86 39cbb726 afc54bd9 25c83c98 13718bbd d2d741ca 5b392875 a73ee510 3b08e48b ea4adb47 4f5c5791 05781932 07d13a8f 36721ddc 2f6bcbc0 d4bb7bd8 5aed7436 2442feac a458ea53 b215bc2d 3a171ecb 1793a828 e8b83407 02fa3dea
0 45 11 15 40 44 1 15 44 1 15 64e77ae7 38d50e09 92eb3174 88e439d9 25c83c98 6f6d9be8 fc6b47d9 5b392875 a73ee510 5080de78 b3410e99 604f499b 0d2cad4c 07d13a8f e2275836 8e662061 d4bb7bd8 fffe2a63 21ddcdc9 b1252a9d 872c22d6 32c7478e df487a73 001f3601 c27f155b
1 1122 41211 499 0 0 10 0 05db9164 207b2d81 d0484442 68637816 f281d2a7 12c61956 0b153874 a73ee510 48af2ba2 94d2aad8 3b9ae062 f23a3825 07d13a8f 0c67c4ca 3a1a0a65 d4bb7bd8 395856b0 21ddcdc9 a458ea53 1720a38e 32c7478e 4de83b96 001f3601 8f16a3b8
1 1 -1 696 1 22 1 81 1 7 0 68fd1e64 537e899b 5037b88e 9dde01fd 25c83c98 7e0ccccf 17024f49 f504a6f4 a73ee510 f2a8242b ba0f9e8a 680d7261 4e4dd817 07d13a8f 6d68e99c c0673b44 e5ba7672 b34aa802 e049c839 c7dc6720 6095f986
0 18 3 1480 340 9 3 26 2 0 3 05db9164 a796837e 08de7b18 97ce69e9 30903e74 7e0ccccf 12343fcc 0b153874 a73ee510 547c0ffe 9bcaeafe c5011072 46f42a63 cfef1c29 98eddd86 5a9431f3 27c07bd6 e90118d1 e754c5e1 3a171ecb 8fc66e78
0 2 59 3 3 11 3 2 3 3 1 1 3 05db9164 09e68b86 27685115 a35ea34f 25c83c98 7e0ccccf 9b4ad590 1f89b562 a73ee510 3b08e48b 75b8e15e 92e9af0d ed43e458 1adce6ef dbc5e126 dc52e604 07c540c4 5aed7436 21ddcdc9 5840adea e5835dfb bcdee96c f89ffef1 e8b83407 a9637a08
0 0 -1 5937 29 1 1 60 0 1 05db9164 09e68b86 d49019a8 8d5aa295 43b19349 13718bbd 89391314 0b153874 a73ee510 9372d502 608452cc 615e62e7 cbb8fa8b b28479f6 52baadf5 e606c6b3 e5ba7672 5aed7436 2b558521 b1252a9d 7440d805 32c7478e 18038694 e8b83407 7048bfb1
1 0 0 2 2875 245 2 2 243 0 2 0 05db9164 86d4fccc 697f4e85 f2159098 4cf72387 fbad5c96 dc7659bd 5b392875 a73ee510 efea433b e51ddf94 35641a0a 3516f6e6 07d13a8f e87e1df4 c1eba210 e5ba7672 e727949e 21ddcdc9 5840adea 47e2c032 32c7478e 3b183c5c 001f3601 afd260f5
0 1 0 70 6 135 27 14 2 45 1 2 0 6 68fd1e64 80e26c9b ba1947d0 85dd697c 4cf72387 16a2e9cb 1f89b562 a73ee510 1ce1e29d 44fa9a7f 34a238e0 f27ed3ab 1adce6ef 0f942372 da441c7e e5ba7672 005c6740 21ddcdc9 5840adea 8717ea07 423fab69 1793a828 e8b83407 9904c656
1 80 25 2 3 2 3 80 3 3 1 1 1 3 05db9164 0b8e9caf 9b9cd1bb 5974d6bc 25c83c98 fbad5c96 4b815add 0b153874 a73ee510 3b08e48b 7cb56051 7364e701 1ac91ec9 b28479f6 5340cb84 1ab2aab4 3486227d ca6a63cf 91311aa2 bcdee96c 08b0ce98
0 0 1 1 1801 14 0 0 05db9164 5dac953d d032c263 c18be181 384874ce 7e0ccccf 8363bee7 0b153874 a73ee510 efea433b bf09be0e dfbb09fb 3516f6e6 1adce6ef 32330105 84898b2a e5ba7672 24de59c1 0014c32a 32c7478e 3b183c5c
0 1 0 6 8 18 8 1 8 8 1 1 8 5a9ed9b0 0468d672 c48cd8f8 24d89f30 25c83c98 24a360aa 5b392875 a73ee510 c8a342b9 2c9174a6 f25a8037 7eda22c5 b28479f6 234191d3 9ca51d92 d4bb7bd8 9880032b 21ddcdc9 5840adea 17b90ef0 32c7478e da89b7d5 ea9a246c 984e0db0
1 7 1 5 1 1311 58 50 2 200 1 6 0 1 05db9164 6887a43c 9b792af9 9c6d05a0 25c83c98 7e0ccccf f367d44f 0b153874 a73ee510 3e3375c9 f68c5128 6532318c d86616b0 1adce6ef ef6b7bdf 2c9d222f 3486227d 8f0f692f 21ddcdc9 a458ea53 cc6a9262 ad3062eb 423fab69 a5862ce8 445bbe3b 0b89ae9f
1 1 0 1 378 41 4 16 100 1 2 68fd1e64 38a947a1 75df6d36 b1c1e580 25c83c98 7e0ccccf 14ad5567 1f89b562 a73ee510 9dc8b302 9ddd72e9 6fbed051 37e99bb7 07d13a8f 6d74487d f10a7996 07c540c4 b3e92443 c576dc74 3a171ecb 67d37917
0 183 3 3 27395 0 3 67 0 3 be589b51 f3139f76 1c8c8a04 bf0b19a8 30903e74 7e0ccccf 6d389dca 0b153874 a73ee510 98bd7a24 e4eb05d4 5b5ab0a8 a4c5d6dd b28479f6 28c50c84 5131d930 e5ba7672 df5475ca 3b226dea 3a171ecb 4fcc135f
0 1 17 3 0 7 3 0 3 05db9164 083aa75b 88bd9da3 c235950d 25c83c98 7e0ccccf 0697a6a6 0b153874 7cc72ec2 3b08e48b 7fb7db93 f3ba84a1 208257bb 1adce6ef 84203dfc 30129ae3 2005abd1 06747363 21ddcdc9 b1252a9d 9ad721d6 be7c41b4 993d6982 f0f449dd 7eaed4be
0 6 7 2 3003 0 42 8 0 0 9 241546e0 a796837e 42db3232 e3cc371a 25c83c98 7e0ccccf 11ffbf5b 37e4aa92 a73ee510 7ad4ea2c f2313205 c9669737 9c7a975e cfef1c29 f0bf9094 c4de5bba 8efede7f 1cdbd1c5 288eaded ad3062eb 3a171ecb 8fc66e78
0 1 36771 112 1 0 77 1 05db9164 f3139f76 9d3adacf 28d926b8 43b19349 fe6b92e5 0cd2f08f 0b153874 a73ee510 3b08e48b 7592da6b 7b93a4c7 18f84563 b28479f6 28c50c84 fc53f85c d4bb7bd8 df5475ca ed35ed93 32c7478e 4fcc135f
0 20 1 1 4841 20 3 5 16 2 1 68fd1e64 38d50e09 948ee031 b7ab56a2 4cf72387 fbad5c96 7d733ece 0b153874 a73ee510 3753b9eb 30b2a438 42bee2f2 aebdb575 b28479f6 06373944 67b3c631 07c540c4 fffe2a63 21ddcdc9 b1252a9d bd074856 32c7478e df487a73 001f3601 c27f155b
1 7 1095 3 37 1 7 3 3 2 2 1 05db9164 85af3139 d032c263 c18be181 384874ce fe6b92e5 7195046d 1f89b562 a73ee510 f1b45aab 4d8549da dfbb09fb 51b97b8f b28479f6 af8db00e 84898b2a e5ba7672 d4328054 0014c32a bcdee96c 3b183c5c
1 0 0 19 7 2193 41 9 18 199 0 4 0 9 05db9164 ef69887a 7007f08d f6131df0 4cf72387 7e0ccccf e8fc728b 0b153874 a73ee510 603ff749 e7ce7f20 2d936711 f522015f 07d13a8f b98be2c0 1c332795 e5ba7672 4bcc9449 abfaf938 a458ea53 caad4ae9 32c7478e 3fdb382b e8b83407 49d68486
1 0 0 1 1 7571 57 19 1 16 0 7 0 1 05db9164 38a947a1 72e5eac0 eee0e446 25c83c98 fbad5c96 66a728c4 0b153874 a73ee510 d0ff5b05 dab547a5 673768e2 7aab7990 07d13a8f 613de492 d617f1ff 3486227d 7abb2837 72a8c407 ad3062eb 423fab69 375c3609
0 156 2 25905 0 11 39 0 2 05db9164 08d6d899 9143c832 f56b7dd5 25c83c98 7e0ccccf 8ce3a35f 0b153874 a73ee510 3b08e48b c8e7f509 ae1bb660 6e8ef725 b28479f6 bffbd637 bad5ee18 776ce399 bbf70d82 0429f84b 32c7478e c0d61a5c
0 0 102404 0 9a89b36c 38a947a1 b89c82b4 c10a6e59 25c83c98 7e0ccccf 04679a14 0b153874 7cc72ec2 975342c2 19a2ded8 15820680 90c7f9d1 64c94865 fd056e92 911ebe1c 07c540c4 b2e570f5 00cfee60 ad3062eb 3a171ecb 4904c5a1
1 46 614 210 0 10 0 71 0 257 1 5 4 0 5a9ed9b0 942f9a8d d61e0f0a c2fcecf6 4cf72387 7e0ccccf 3f4ec687 45f7c2dd a73ee510 0e9ead52 c4adf918 f6f14c38 85dbe138 07d13a8f a8e962af 64c4c290 27c07bd6 1f868fdd 21ddcdc9 b1252a9d 06316f4c ad3062eb 32c7478e 38be899f e8b83407 9bef54fd
1 0 9 2 1576 29 3 4 14 0 1 05db9164 6887a43c bce3f26f 1d8a14d0 43b19349 fe6b92e5 675e81f6 0b153874 a73ee510 a5bb26cf 4a77ddca 381dd9fd dc1d72e4 64c94865 004dd4ed c26ce5c1 1e88c74f 36a1d942 21ddcdc9 b1252a9d e22e102f c9d4222a 32c7478e 47c5aea3 445bbe3b 12d4e9a4
0 -1 101295 0 05db9164 2ae0a573 b7810abb 65b2bfc7 25c83c98 fe6b92e5 ccbac4d9 0b153874 7cc72ec2 3b08e48b c012107d 82665b78 c8dca410 07d13a8f 413cc8c6 6399ea39 07c540c4 f2fc99b1 ea03ca8b ad3062eb be7c41b4 d91ea8bd
0 -1 0 0 32 0 87552397 a8b6b751 25c83c98 7e0ccccf d9aa9d97 5b392875 7cc72ec2 3b08e48b 6e647667 85dbe138 b28479f6 694e45e3 2005abd1 d787f192 21ddcdc9 5840adea 32c7478e 001f3601 99f4f64c
\ No newline at end of file
0 0 5 4 13275 14 35 4 41 4 0 4 05db9164 f0cf0024 6f67f7e5 41274cd7 25c83c98 fbad5c96 25c8362c 0b153874 a73ee510 0e97bb27 ba0f9e8a 623049e6 4e4dd817 b28479f6 e6c5b5cd c92f3b61 3486227d b04e4670 21ddcdc9 b1252a9d 60f6221e 32c7478e 43f13e8b ea9a246c 731c3655
1 0 559 2 7 2532 164 98 6 943 0 18 0 7 68fd1e64 bc478804 b96e826a 13508380 43b19349 7e0ccccf 8363bee7 0b153874 a73ee510 f322117a bf09be0e f53c5949 3516f6e6 07d13a8f 0af7c64c 170db6b2 e5ba7672 65a2ac26 21ddcdc9 b1252a9d f0ce5c73 c7dc6720 45ab94c8 001f3601 c84c4aec
0 2 8 3 9040 38 5 11 104 2 3 05db9164 e5fb1af3 2c003e73 6eaa3680 25c83c98 7e0ccccf 860f347d 1f89b562 a73ee510 4b8a7639 9f0003f4 0962e10a 5afd9e51 f862f261 2a079683 59c31b64 e5ba7672 13145934 21ddcdc9 a458ea53 24a384ae bcdee96c f11826cf 3a6f6b59 25cb8912
0 72 2 2 0 4 12 0 2 8cf07265 b0660259 31567fba 1a1efaf8 25c83c98 fbad5c96 88002ee1 0b153874 7cc72ec2 3b08e48b f1b78ab4 b6d5a886 6e5da64f 1adce6ef bd5431ee 7b977dd1 2005abd1 8ec3405f f0474b68 ad3062eb 32c7478e 53c37c32
0 2 6 34 16 1051 49 4 48 101 1 2 16 5a9ed9b0 80e26c9b 09275b26 f2ee08c0 25c83c98 7e0ccccf 372a0c4c 0b153874 a73ee510 a08eee5a ec88dd34 4e99cf84 94881fc3 1adce6ef 91f5e393 6bc40863 e5ba7672 ce25450e 21ddcdc9 b1252a9d 5dc70c60 423fab69 1793a828 e8b83407 91116abe
0 2 13 2 15757 54 5 2 15 1 0 2 05db9164 09e68b86 eb76bef2 804f7741 43b19349 13718bbd cc5ed2f1 0b153874 a73ee510 3b08e48b facf05cc f282fc98 9f16a973 b28479f6 52baadf5 eb62e551 07c540c4 5aed7436 c361c1be b1252a9d be7ab5d2 32c7478e 1793a828 e8b83407 e9938fed
0 0 14 1572 8 4 6 8 0 1 05db9164 e112a9de 9db30a48 b3dbc908 4cf72387 fbad5c96 f2530a89 0b153874 a73ee510 671ae88f 2181d913 2598d8eb 1e750733 ad1cc976 f1e1df0a 9ab4d6b1 e5ba7672 fdbdefe6 bbf96cac c3dc6cef 8f079aa5
0 0 195 14 5941 285 6 20 200 2 20 5bfa8ab5 80e26c9b 36984eba 85dd697c 4cf72387 fbad5c96 f6619575 1f89b562 a73ee510 73be1da8 d5cf9352 db02a7b5 09e3bbd5 07d13a8f e8f4b767 2d0bbe92 e5ba7672 005c6740 21ddcdc9 b1252a9d 6e55e022 ad3062eb 3a171ecb 1793a828 e8b83407 9904c656
0 0 52 1 1 4240 9 5 3 49 0 4 2 5a9ed9b0 b961056b 2fe61b6b 3642dc05 4cf72387 fe6b92e5 81bb0302 062b5529 a73ee510 8b7e21f6 b7094596 4ab3cda1 1f9d2c38 b28479f6 7eaf5074 a4b0914f e5ba7672 5742e45c 789fddf7 32c7478e 3b047130
1 1 1 1 1378 46 5 34 236 3 1 05db9164 38a947a1 a50fea16 0a8cd7bc 25c83c98 a601d936 0b153874 a73ee510 3fb38a44 348e21cb 2b2be35d 1d8cfec6 b28479f6 66106852 31cf393e e5ba7672 0458e647 58d08d44 32c7478e 355b6af8
0 0 0 5 2 5512 49 15 3 114 0 2 2 8cf07265 78ccd99e 01d1b993 20bb14e7 4cf72387 fbad5c96 a1eeac3d 0b153874 a73ee510 5f49e872 2e9d5aa6 500d0b9a 0a9ac04c 07d13a8f 162f3329 f24599ab e5ba7672 e7e991cb 4b1019ff a458ea53 b49094cd 423fab69 dc73316d fd2fe0bd 60a86ddf
1 2 -1 2 1 50 1 73 4 127 1 14 0 1 68fd1e64 2aee75a8 32c8cb11 c04614ba 25c83c98 3bf701e7 407438c8 0b153874 a73ee510 213889cd 755e4a50 fa20173a 5978055e 32813e21 6aa1d799 de53b24a 3486227d ad19d8d8 64c766b8 3a171ecb 747559ec
0 83 4 5 5666 14 1 5 14 1 5 05db9164 85af3139 d032c263 c18be181 25c83c98 fbad5c96 7195046d 0b153874 a73ee510 686e97b9 4d8549da dfbb09fb 51b97b8f b28479f6 af8db00e 84898b2a d4bb7bd8 d4328054 0014c32a ad3062eb bcdee96c 3b183c5c
0 3 11 1612 0 40 91 0 42 05db9164 537e899b 5037b88e 9dde01fd 25c83c98 3bf701e7 ac07b602 0b153874 a73ee510 3b08e48b 7ce882d2 680d7261 f5ff33d9 1adce6ef c535a0ec c0673b44 776ce399 b34aa802 e049c839 423fab69 6095f986
0 6 52 5 400098 0 15 15 0 5 5a9ed9b0 38d50e09 d032c263 c18be181 384874ce 7e0ccccf 6cd97108 0b153874 7cc72ec2 3b08e48b 62cdafdf dfbb09fb 2e551bbe 1adce6ef e2c18d5a 84898b2a 776ce399 582152eb 21ddcdc9 5840adea 0014c32a be7c41b4 3b183c5c 001f3601 99f4f64c
0 26 12 11 34669 531 1 12 27 1 0 11 05db9164 98159f6d 3cc4baf5 7b110c65 25c83c98 fe6b92e5 c03eb803 0b153874 a73ee510 3b08e48b d700703a 169e9533 bccbbffe b28479f6 b2db654e 16c48bd2 3486227d 4854928e 114ff696 3a171ecb 3599e91f
0 7 6 13416 0 0 45 0 05db9164 247a1a11 896e7bb3 c2fcecf6 25c83c98 fbad5c96 c31847f5 0b153874 a73ee510 3b08e48b a12fca95 7fa9c0a1 9b9e44d2 07d13a8f 2559d9b6 ef01918c 776ce399 51360aab 5cc5adb2 c9d4222a be7c41b4 38be899f
0 1 5 7 14509 60 5 7 56 1 7 75ac2fe6 3e25b403 7c7b6098 f00503da 25c83c98 fe6b92e5 ef0d76b7 51d76abe a73ee510 82bb4986 529e8447 c1b3491a 50a56f08 07d13a8f ae1edc05 ab50786f e5ba7672 1c381aea f6801a20 c7dc6720 1793a828
0 0 2 30 10 1363 415 20 28 561 0 5 0 10 68fd1e64 95e2d337 8d85271d 69040d07 25c83c98 7e0ccccf 3603d925 0b153874 a73ee510 0065486b 7934c105 8b7685bd 4840c1ab 64c94865 7de4908b b1f23afa e5ba7672 701d695d 712d530c a458ea53 da0adeef c9d4222a 423fab69 4921c033 2bf691b1 80b0aeb9
1 1 6 3 12 0 4 40 31 410 1 14 4 68fd1e64 38a947a1 8962afa9 28625509 25c83c98 7e0ccccf 5fbd9170 0b153874 a73ee510 dc9f749b 2bcfb78f 662d25fe e6fc496d 07d13a8f 022e018a 2dd4e74f e5ba7672 f5508183 c5cea7f6 32c7478e 38255568
0 0 0 11 4 8657 213 6 3 210 0 1 4 05db9164 80e26c9b 0bd844ef aae30d38 25c83c98 7e0ccccf d2d741ca 0b153874 a73ee510 18139a78 ea4adb47 38a37d81 05781932 07d13a8f 856b2bc1 00c7a1bf 07c540c4 fdf644e0 21ddcdc9 a458ea53 45d05ca3 dbb486d7 3e1eed85 e8b83407 6a4b2388
0 47 35575 159 3 0 10 1 68fd1e64 999aae35 79bc99b4 e5e453f3 4cf72387 7e0ccccf c88e8d4f 0b153874 a73ee510 3b08e48b a21d2994 424e28fe 2e94d3f7 243a4e68 39a6addf a6a69939 07c540c4 63aa00dd 424af181 3a171ecb 869caea3
1 1 512 1 2 11 2 1 2 2 1 1 2 05db9164 b26462db 9d1d0933 ebdba02b f281d2a7 fbad5c96 12343fcc 0b153874 a73ee510 f6f942d1 7f8ffe57 c6a076d2 46f42a63 64c94865 f93f84eb d9e8fb80 d4bb7bd8 195c811d 306c202e 3a171ecb 340d03c3
0 -1 28922 24 1 8 22 1 05db9164 c8687797 5c7d8ff6 902872c9 4cf72387 fbad5c96 3833f734 0b153874 a73ee510 3b08e48b c05bd0b8 79b87c55 e411c4db b28479f6 dc96c4b0 5627d7e0 d4bb7bd8 a7e06874 21ddcdc9 b1252a9d 4063500f ad3062eb be7c41b4 54baf4d1 010f6491 ba676e3c
0 1 51 17212 0 1 3 0 5a9ed9b0 4f25e98b b5044e29 a311963e 307e775a fe6b92e5 fe4dce68 a6d156f4 a73ee510 75542289 68357db6 1290fbf4 768f6658 07d13a8f dfab705f d5a1b8fe 1e88c74f 7ef5affa 2b558521 b1252a9d b5074db5 c9d4222a 32c7478e c832486f 001f3601 f0353f67
0 0 162 4253 26 4 3 5 0 1 05db9164 b961056b 502bedec 81b1d519 384874ce fe6b92e5 c52b5f8e 5b392875 a73ee510 8b349795 419d31d4 e23a52b4 08961fd0 1adce6ef addd37ac b4df7a81 e5ba7672 43de85d3 fdb27279 423fab69 71640730
0 67 43 18 61 0 18 18 0 18 05db9164 38d50e09 c4205697 bbc8d361 25c83c98 fe6b92e5 165cb289 5b392875 a73ee510 3b08e48b b94c0f2d d8c2300e b9fa764b b28479f6 7501d6be bf300501 776ce399 f855e3f0 21ddcdc9 5840adea b59344cd 3a171ecb 17f458f7 001f3601 984e0db0
0 51 18 1 19 93 23 111 22 1156 2 11 0 23 287e684f a796837e 42db3232 e3cc371a 25c83c98 fe6b92e5 ff493eb4 25239412 a73ee510 efea433b 0983d89c c9669737 1aa94af3 cfef1c29 0d054fb9 c4de5bba e5ba7672 70e5bba7 288eaded 32c7478e 8fc66e78
0 0 84 43 11 198 75 14 27 76 0 2 1 11 05db9164 4f25e98b 23edf366 e4889f1e 25c83c98 7e0ccccf ac28d9ec 0b153874 a73ee510 6cb0e696 bc0819f7 92107e36 c9059ff0 cfef1c29 dddd963f 4e447cf7 3486227d 7ef5affa 55dd3565 a458ea53 9a91ae21 c9d4222a 32c7478e 54a607b7 001f3601 d568f27d
0 5 40 15 23322 746 7 15 524 3 15 05db9164 dda1caf9 f83418e0 a44d75e2 25c83c98 7e0ccccf 7d09e065 0b153874 a73ee510 3b08e48b bf2008fa a9165671 c9ae71af 07d13a8f 24c5daaf 839572dd e5ba7672 65cebfa5 cd746367 3a171ecb a9a2ac1a
0 4 1 0 8cf07265 38c81d1a 27388f4d 539558b1 25c83c98 ce8217f8 0b153874 7cc72ec2 3b08e48b 9d12ce9b cc83e10f 9dfda2b9 b28479f6 558590b3 ed956dff 2005abd1 a5ac4b1e 21ddcdc9 b1252a9d 1061dd07 be7c41b4 a79557ea b9266ff0 6ddc02f9
1 18 3 1 4233 3 17 1 118 5 1 5a9ed9b0 78ccd99e ced2e736 13508380 25c83c98 fbad5c96 c8b3d034 0b153874 a73ee510 3275d09a 80da9312 c7fe806a d14c9212 07d13a8f 162f3329 d274b433 e5ba7672 e7e991cb 55dd3565 b1252a9d b46cb608 c7dc6720 45ab94c8 e8b83407 c84c4aec
0 0 2788 7 1451 0 1 0 0 0 1 1464facd 8947f767 64a350ad 5e369129 25c83c98 fe6b92e5 a13be9ad 0b153874 a73ee510 4e56c58e 62aedd5c 70aaa25e e65a5fc3 b28479f6 a473257f 9bb1dfa5 d4bb7bd8 bd17c3da 083e89d9 b1252a9d d3a891c1 ad3062eb 3a171ecb b6b5bc47 010f6491 c4510344
0 1 6 1 4402 22 1 11 22 1 1 05db9164 207b2d81 d52980aa b66d15e3 25c83c98 fbad5c96 6ce84868 1f89b562 a73ee510 3b08e48b 609032c1 b519c595 437a58d6 b28479f6 3c767806 7c8ae841 07c540c4 395856b0 21ddcdc9 b1252a9d 605305ee 32c7478e f090fae7 001f3601 6024c307
0 125 2 14 7259 30 2 14 97 2 14 8cf07265 04e09220 b1ecc6c4 5dff9b29 4cf72387 7e0ccccf 543f351f 1f89b562 a73ee510 3b08e48b be8a7bc2 2436ff75 7d1f1fa0 07d13a8f cae64906 f4ead43c d4bb7bd8 e161d23a 4f1aa25f ad3062eb 3a171ecb ded4aac9
0 610 1 1 7526 40 2 1 12 1 1 5a9ed9b0 207b2d81 8a48553d 1e10bd9f 25c83c98 fe6b92e5 12343fcc 0b153874 a73ee510 547c0ffe bc8c9f21 6803e296 46f42a63 64c94865 11b2ae92 ff48ade9 e5ba7672 395856b0 21ddcdc9 b1252a9d c3d093fb ad3062eb 3a171ecb 84a27184 001f3601 8d2deb5a
0 0 1 59 3 2766 96 2 4 7 0 1 3 5a9ed9b0 38a947a1 cf1b3029 36b520dc 4cf72387 7e0ccccf 5e64ce5f 0b153874 a73ee510 d4a82fb9 8b94178b 0d74ab27 025225f2 b28479f6 77ef1e58 5dcf110f 07c540c4 b6b880ec dd70b3ec 32c7478e 8f282db5
0 34 18859 106 26 0 17 1 0 05db9164 c1c79489 66fa4409 bdc253c8 5a3e1872 fbad5c96 8d51595c 0b153874 a73ee510 216f775a 7110c233 7e4627d4 bb7a2c12 32813e21 59b212e4 5f231427 e5ba7672 7549f127 50798fce c7dc6720 0f9697f0
1 1 321 1 1189 8 16 11 96 1 3 1 05db9164 a796837e 5c05f1ab 97ce69e9 25c83c98 fe6b92e5 81b62616 0b153874 a73ee510 06ee81ba fa1b06e6 50ec33a6 0eb69562 07d13a8f 47a431f5 5a9431f3 e5ba7672 f1a8f10f e9672021 ad3062eb 423fab69 8fc66e78
0 2 1 16 13 326 61 3 47 263 1 1 0 55 8cf07265 8947f767 999b4cd3 f862f65d 25c83c98 7e0ccccf 9e8dab66 0b153874 a73ee510 fbbf2c95 46febd4d ea486dc7 949ea585 07d13a8f 2c14c412 e51f35a7 e5ba7672 bd17c3da 83236299 b1252a9d 19bea55f 32c7478e 75aae369 010f6491 08e0e995
0 0 9 8 7182 255 9 24 44 4 8 05db9164 38a947a1 7a2ffaba 8dcfa982 25c83c98 7e0ccccf c519c54d 0b153874 a73ee510 19fd5a0e 59cd5ae7 842e9873 8b216f7b b28479f6 67596d53 1e8e1075 e5ba7672 3fb55a52 8b5b9b68 32c7478e 10edf4e4
0 107 1 1 7 489 1 1 31 1 1 24eda356 2c16a946 adf23330 17a25a2e 25c83c98 7e0ccccf 12343fcc 0b153874 a73ee510 f6f942d1 7f8ffe57 8a390857 46f42a63 b28479f6 3628a186 64f2ada9 07c540c4 e4ca448c 467f6a77 3a171ecb 9117a34a
0 0 280 20 10 4512 51 5 11 97 0 1 0 10 68fd1e64 6c713117 f6f030bc 19d6ddb8 25c83c98 fe6b92e5 7c59aadb 5b392875 a73ee510 c5a978c5 ff78732c 7bea4a04 9b656adc b28479f6 73b98472 7afa5706 3486227d bf6b118a 21ddcdc9 b1252a9d aef05b30 c7dc6720 1caea946 445bbe3b 69a06689
0 41 6 2 359699 3 2 87552397 4f25e98b 16958dc8 8dfe2376 25c83c98 7e0ccccf 8025502e 0b153874 7cc72ec2 b118f931 29e4ad33 ea50fad8 80467802 b28479f6 8ab5b746 0e4c86f8 d4bb7bd8 7ef5affa 5b885066 b1252a9d e103da3e 3a171ecb fcb2509d 001f3601 24488670
1 1 13 7 11 1 4 4 18 44 1 3 4 05db9164 09e68b86 aa8c1539 85dd697c 25c83c98 41e1828d 0b153874 a73ee510 3b08e48b b6358cf2 d8c29807 61c65daf 8ceecbc8 d2f03b75 c64d548f 07c540c4 63cdbb21 cf99e5de 5840adea 5f957280 32c7478e 1793a828 e8b83407 b7d9c3bc
1 2 25 2 38 2 2 2 2 1 1 2 5a9ed9b0 207b2d81 fb47f7d0 6c02aa53 4cf72387 7e0ccccf 6fb62f1a 0b153874 a73ee510 4f6357b0 e51ddf94 d9fc673a 3516f6e6 b28479f6 0739b998 7e14b290 07c540c4 934abd6e 21ddcdc9 b1252a9d 0a47a519 ad3062eb 32c7478e 47620345 001f3601 c36f2d3c
0 11 2 3 92 2 36 0 21 1 4 f473b8dc 80e26c9b 1c791144 51d55e9c 384874ce fbad5c96 57b4bd89 0b153874 a73ee510 3b08e48b 71fd20d9 b49c9404 ddd66ce1 1adce6ef 8ba8b39a 2cbed9f7 e5ba7672 f54016b9 21ddcdc9 5840adea 80f3703a 3a171ecb 1793a828 e8b83407 dbd4e512
1 0 1 1 1 5715 181 23 2 169 0 4 0 1 5a9ed9b0 7182b361 b2aa5dce 462749d8 43b19349 7de93965 37e4aa92 a73ee510 28c6ef79 9ba53fcc 05ce35fd 42156eb4 07d13a8f 47367e94 1a5c540a 3486227d ec9b0866 437ad2af c9d4222a c7dc6720 73338ee2
0 54 62 12 7578 184 7 12 72 1 24 05db9164 38a947a1 bc2aea05 ac975db6 25c83c98 13718bbd 80162d04 5b392875 a73ee510 3b08e48b 5b97686e b67ac327 47727147 07d13a8f 22223d6c d388d33c e5ba7672 97b81540 0ac4575d 32c7478e d28d80ac
0 6 0 12 6 185 37 6 7 37 1 1 37 05db9164 09e68b86 9596aa6c b26d2eda 4cf72387 7e0ccccf 1a95b4d0 0b153874 a73ee510 995f172b 507605d4 bb2b1b19 5f3a0c1b 07d13a8f 36721ddc 872b1c96 e5ba7672 5aed7436 338f20de b1252a9d 8f7b9fe2 32c7478e cad46f36 e8b83407 58a43195
0 104 1 1 2679 0 8 18 0 1 9684fd4d 8dbd550a 4cf72387 7e0ccccf 8cf87048 c8ddd494 a73ee510 3b08e48b a12fca95 9b9e44d2 f862f261 b13d160c 776ce399 53d8aa6f be7c41b4
0 3384 5 2803 151 13 11 150 1 11 65aada8c 537e899b 5037b88e 9dde01fd 25c83c98 fbad5c96 7bcc368f 062b5529 a73ee510 f26b2389 60d2afd7 680d7261 155ff7d9 07d13a8f 73c54e3e c0673b44 e5ba7672 86b4fc22 e049c839 3a171ecb 6095f986
1 -1 5967 11 4 1 10 2 0 68fd1e64 510b40a5 d03e7c24 eb1fd928 25c83c98 ac902434 062b5529 a73ee510 e5da7278 1294fec1 951fe4a9 7bbf93ce 07d13a8f 67daf98c 8ec71479 e5ba7672 03364bd3 0e63fca0 32c7478e 0e8fe315
0 78 7 10 0 7 7 0 7 05db9164 9f7e1d07 e3818eb2 a4456f7e 25c83c98 02d72eea 5b392875 a73ee510 c9e11adf e09c447b 3bec5d45 8dab0422 b28479f6 08812651 72d1790f 1e88c74f 6a58e423 21ddcdc9 5840adea 950d91c1 32c7478e 2f7e98de ea9a246c e7ecb821
1 -1 1509 0 4 0 23 3 05db9164 cc8e236e 1c239854 cf3dc9c2 4cf72387 fe6b92e5 81bb0302 0b153874 a73ee510 983552b8 b7094596 98d78b2b 1f9d2c38 07d13a8f 3a8c68b7 da1333b6 e5ba7672 775e80fe 21ddcdc9 5840adea 3ee29a07 ad3062eb c7dc6720 c83e0347 ea9a246c 2fede552
1 2 -1 550 3 155 8 30 2 16 0 ae82ea21 3f0d3f28 c2b2b3f5 77a160bd f281d2a7 fbad5c96 3625ff87 6c41e35e a73ee510 67eea4ef 755e4a50 db21b797 5978055e 32813e21 e8d4033b fae7560f e5ba7672 744ad4a0 a17a10b3 3a171ecb e5fca70a
1 5 113 6 18 0 10 9 21 21 2 3 3 0 05db9164 6887a43c 1e361e58 825b2615 43b19349 fbad5c96 6d0ca8d7 0017bc7c a73ee510 666a1d31 6939835e 9b62c79b dc1d72e4 b28479f6 9cc57c4d fd420402 27c07bd6 2ae4f30d 21ddcdc9 b1252a9d d12542f8 32c7478e 488d4283 445bbe3b d20e4b7a
1 3 21 2 2 1 0 6 2 13 1 3 0 68fd1e64 38a947a1 756e3a77 bd47cb50 25c83c98 fe6b92e5 09e42cac 5b392875 a73ee510 79877583 30b2a438 74a6216c aebdb575 b28479f6 b3547943 9e33c845 e5ba7672 04fdc63f 77cd58fc 3a171ecb 69e7316d
0 0 1 5 4 9947 275 8 12 865 0 3 4 05db9164 6887a43c 9b792af9 9c6d05a0 25c83c98 84c427f0 0b153874 a73ee510 9bc1a7c1 41b3f655 6532318c ce5114a2 8ceecbc8 4e06592a 2c9d222f e5ba7672 8f0f692f 21ddcdc9 b1252a9d cc6a9262 32c7478e a5862ce8 445bbe3b c4c8f547
0 0 21 6 6 7270 275 3 6 93 0 2 6 05db9164 2c16a946 f7ef15ea 6ad68ce1 25c83c98 7e0ccccf 5ff926ae 25239412 a73ee510 4497acf2 864d33c2 bfe72c91 34786fb9 b28479f6 3628a186 2d08259c 07c540c4 e4ca448c 96739728 ad3062eb 32c7478e 9117a34a
1 5 1 3 2 5 0 48 3 2 2 13 0 05db9164 2efdbb44 88f1ca30 aece8ab6 25c83c98 3bf701e7 1c86e0eb 1f89b562 a73ee510 f7ab55a0 755e4a50 4cf7f85a 5978055e 32813e21 ff824c52 5a58ab6d e5ba7672 42076ccd 3991fb63 55dd3565 4721fd29
0 0 1 4 1 7025 101 13 1 39 0 1 1 1 05db9164 207b2d81 d52980aa b66d15e3 25c83c98 7e0ccccf f6d03c1b 5b392875 a73ee510 fe687d88 30b2a438 b519c595 aebdb575 07d13a8f 0c67c4ca 7c8ae841 e5ba7672 395856b0 21ddcdc9 b1252a9d 605305ee 32c7478e f090fae7 001f3601 77e5b96c
0 1 95 28 8 67 14 103 42 267 1 23 14 05db9164 89ddfee8 d8f59a85 f1d06e8a 25c83c98 7e0ccccf 1c86e0eb 5b392875 a73ee510 213889cd 755e4a50 64f3690c 5978055e 1adce6ef 34cce7d2 eeb76d70 e5ba7672 5bb2ec8e 0053530c b1252a9d 7f6bcbee ad3062eb 32c7478e 43fe299c f0f449dd f3b1f00d
0 4 54 2 9 1349 23 52 22 90 1 8 0 11 be589b51 38a947a1 e28faa26 f44af879 25c83c98 fbad5c96 407438c8 0b153874 a73ee510 5df036eb 755e4a50 defeb71b 5978055e 07d13a8f 92b9a831 d4aed6bf 27c07bd6 2a870f7f fecb5e8c c9d4222a 32c7478e a9313cb6
1 3 145 6 12 2 8 6 36 1 4 2 be589b51 0c0567c2 9424724f fa30ea43 25c83c98 fe6b92e5 52b4e012 0b153874 a73ee510 5ba3608f a739bbee f400be52 79128231 b28479f6 1e82594c efbacdc0 e5ba7672 98c4d3e0 cd86ac29 78e2e389 32c7478e 7fb4ff91
0 38 2 1 12810 8 3 1 7 1 1 05db9164 af447d7a fc39fe56 3197d543 25c83c98 7e0ccccf 9aba5215 5b392875 a73ee510 46c32c26 8cfaeec1 ebf6ae0a ef800ef3 b28479f6 f0d27586 38fc4d35 07c540c4 98ff11f4 11e4edec 32c7478e 0ff91809
1 1 0 4 11 1019 11 4 20 91 1 2 0 11 05db9164 09e68b86 b1ffdff4 aff068bc 25c83c98 b87f4a4a 5b392875 a73ee510 e70742b0 319687c9 ee94532f 62036f49 cfef1c29 18847041 17442b68 e5ba7672 5aed7436 21ddcdc9 5840adea acf8bce8 32c7478e 1793a828 e8b83407 63093459
1 3 15 24 3288 0 3 278 0 25 05db9164 38a947a1 4470baf4 8c8a4c47 25c83c98 7e0ccccf 0dbf2675 0b153874 a73ee510 48a94b2e 88196a93 bb669e25 1211c647 b28479f6 59621a99 2b2ce127 e5ba7672 b133fcd4 2b796e4a 32c7478e 8d365d3b
1 0 175 59061 539 0 0 56 0 f473b8dc 1cfdf714 43b964ee f089159e 25c83c98 3bf701e7 c86e8c6b 37e4aa92 7cc72ec2 eab78bab 4d8549da 031b2b3a 51b97b8f 051219e6 af56328b e162466d e5ba7672 e88ffc9d 5b885066 a458ea53 9f7d1d43 ad3062eb 3a171ecb b15e807d e8b83407 fcefd6a4
1 39 2 8 5 1202 22 42 12 179 1 2 1 13 05db9164 942f9a8d 8658d326 2b884b66 4cf72387 7e0ccccf 3f4ec687 0b153874 a73ee510 0e9ead52 c4adf918 cf9c76af 85dbe138 07d13a8f a8e962af 12ff41b8 3486227d 1f868fdd 1d04f4a4 a458ea53 9e55b62d ad3062eb 32c7478e 3fdb382b 9d93af03 49d68486
0 -1 24422 11 18 0 8 1 05db9164 9e5ce894 02391f51 b9c629a9 25c83c98 3bf701e7 22fd2464 0b153874 a73ee510 5aca218f d9085127 2397259a ef7e2c01 07d13a8f 8cf98699 d37efe8c e5ba7672 a5bb7b8a 21ddcdc9 5840adea b6119319 32c7478e 45ab94c8 ea9a246c b13f4ade
0 1 8 4936 0 0 15 0 05db9164 ea3a5818 e33cc329 7a5aa046 25c83c98 fbad5c96 4a45f6c5 0b153874 a73ee510 fe01516c 2a0b79f8 aaa493f6 25512dff b28479f6 0a069322 ba35244c e5ba7672 a1d0cc4f 21ddcdc9 a458ea53 c1a3607e c7dc6720 a7084d70 1575c75f ef4df1dd
0 124 1 2648 0 4 13 0 05db9164 38a947a1 7b9e7a93 49afffac 25c83c98 7e0ccccf bddc9773 0b153874 a73ee510 3b08e48b ff2333c8 5f12b145 140595a0 b28479f6 7c5bcff3 e699400f d4bb7bd8 876521e0 6d6ae2d8 32c7478e b258af68
1 4 -1 282 18 4 15 15 1 1 68fd1e64 38a947a1 840eeb3a f7263320 25c83c98 7e0ccccf 44fb02c7 6c41e35e a73ee510 3b08e48b 2386466b 317bfd7d 45db6793 07d13a8f 6f1ab4eb 1689e4de e5ba7672 5d961bca dc55d6df 3a171ecb aa0115d2
1 -1 14834 111 6 0 204 2 68fd1e64 08d6d899 6a8a1217 14bfebf4 25c83c98 7e0ccccf 9e0ed189 0b153874 a73ee510 f68bc089 c3e44774 5b355b50 c278016c 64c94865 a8e4fe6e 0ded9094 e5ba7672 9dde83ca 831d5286 32c7478e 9e9a60e4
0 3 276 8 11 10 11 3 26 11 1 1 0 11 5e53cc38 b26462db b6025941 06b1cf6e 4cf72387 13718bbd 65ae2219 0b153874 a73ee510 fbbf2c95 447a6784 72e65cea 9be66b48 cfef1c29 fc8350a5 25b075e4 07c540c4 35ee3e9e ad6ee353 3a171ecb 0ff91809
0 -1 11615 30 1 0 28 1 5a9ed9b0 95e2d337 086df0da 6262590b 4cf72387 7e0ccccf 72cf945c 0b153874 a73ee510 ef2fbb20 7b61aa9b 547c3f98 7f5bf282 07d13a8f 4e505ea3 7ac9f411 d4bb7bd8 7b06fafe 21ddcdc9 a458ea53 29ac833e 32c7478e 7c28ef9f 2bf691b1 b288bc0b
0 0 8 306565 0 14 8 0 10 05db9164 38a947a1 4470baf4 8c8a4c47 25c83c98 7e0ccccf 2e85de94 0b153874 7cc72ec2 3b08e48b 8d6d03a0 bb669e25 86c652c6 b28479f6 091737ad 2b2ce127 776ce399 ade68c22 2b796e4a ad3062eb be7c41b4 8d365d3b
0 0 1992 2 1451 31 1 20 31 0 1 2 be589b51 d833535f b00d1501 d16679b9 25c83c98 fbad5c96 9a75d128 0b153874 a73ee510 3b08e48b 90bf7fef e0d76380 a70d1580 b28479f6 a733d362 1203a270 e5ba7672 281769c2 73d06dde c9d4222a 32c7478e aee52b6f
0 2 1 8 13233 164 13 8 88 7 0 8 05db9164 13f25995 0b0f3952 35d9e6fe 25c83c98 7e0ccccf 87e29668 0b153874 a73ee510 3b08e48b 0bc0e6ed ff8c6fd9 abd69a9d 07d13a8f 7cad642c 5015d391 8efede7f c7cf2414 3db17de9 32c7478e 4fe18e82
0 50 15 6661 0 40 49 0 21 5bfa8ab5 08d6d899 77f2f2e5 d16679b9 25c83c98 7e0ccccf 7f2c5a6e a61cc0ef a73ee510 3b08e48b d21494f8 9f32b866 f47f13e4 b28479f6 bffbd637 31ca40b6 1e88c74f bbf70d82 dfcfc3fa c9d4222a 32c7478e aee52b6f
1 0 13 2 10320 72 0 2 45 0 2 05db9164 09e68b86 e95580ff 653ee14f 25c83c98 fe6b92e5 26a81064 5b392875 a73ee510 dcbc7c2b 9e511730 49a381fa 04e4a7e0 1adce6ef dbc5e126 cf6ed269 d4bb7bd8 5aed7436 21ddcdc9 a458ea53 c9fcf5fd 3a171ecb 5e22c595 e8b83407 8e27cf04
0 -1 11066 0 0 1 0 05db9164 38a947a1 a64c7bd9 67a8407c 25c83c98 fe6b92e5 71fd6dcd 0b153874 a73ee510 3b08e48b e5cd3d61 0a8d756f 08ba5c35 b28479f6 b7815e37 ef7d43b0 776ce399 a6bfeb0a 455f53fb 93bad2c0 928e948f
0 107 8939 0 1 2 0 05db9164 a244fe99 25c83c98 7e0ccccf c8e48a82 0b153874 a73ee510 c6c8dd7c ae4c531b 01c2bbc7 07d13a8f 2f5df569 d4bb7bd8 35901cfb ad3062eb 423fab69
0 0 2 15 12 2504 365 1 10 77 0 1 12 68fd1e64 08d6d899 03942b3f afe92929 25c83c98 7e0ccccf f4b9d7ad 0b153874 a73ee510 663eefea c1ee56d0 e977ae2f ebd756bd 07d13a8f 1a277242 82f06a35 d4bb7bd8 87c6f83c 08119c8b 55dd3565 f96a556f
1 13 2153 1 25 37 3 13 9 29 2 2 3 68fd1e64 4f25e98b de211a17 a8925441 25c83c98 5e64ce5f 1f89b562 a73ee510 be630248 8b94178b fcaae253 025225f2 b28479f6 8ab5b746 3b58b07a e5ba7672 7ef5affa 9437f62f b1252a9d ce247dc1 32c7478e 3fdb382b 001f3601 0fd820a6
1 37 72 2 3 4 2 49 42 222 1 5 2 05db9164 3f0d3f28 d73310fa b40012b1 4cf72387 fbad5c96 ad3508b1 0b153874 a73ee510 08658f3b ad757a5a 0e466d8f 93b18cb5 32813e21 3440b690 f4219d4b e5ba7672 7da064fc 0471db05 ad3062eb c7dc6720 e5fca70a
0 0 5 11541 0 0 7 0 05db9164 89ddfee8 15d7420a ff441594 25c83c98 7e0ccccf bdaf7920 0b153874 a73ee510 fbbf2c95 4c074d2a 5f27bc59 f948ca5d 051219e6 d5223973 e2b64862 1e88c74f 5bb2ec8e 0053530c a458ea53 2f4978df 32c7478e 75c8ca05 f0f449dd d21d0b82
0 15 2 2 87297 0 3 23 0 3 05db9164 a8b6b751 3e67fbbb 10056215 25c83c98 7e0ccccf d9aa9d97 5b392875 7cc72ec2 3b08e48b c4adf918 d9f32d8d 85dbe138 b28479f6 694e45e3 345db5a2 776ce399 d787f192 21ddcdc9 5840adea 7463465b ad3062eb 32c7478e 3d236c54 001f3601 984e0db0
0 30 1 12 5 11 5 608 19 286 1 47 1 5 05db9164 89ddfee8 ab2fe4c8 428cff52 43b19349 3bf701e7 407438c8 1f89b562 a73ee510 0a164266 755e4a50 3989acff 5978055e b28479f6 25753fb1 cf445916 8efede7f 5bb2ec8e 21ddcdc9 b1252a9d d64ee25a 78e2e389 32c7478e 0b351a52 e8b83407 b1c17344
1 5 7 2 2 414 21 83 33 925 1 36 2 68fd1e64 421b43cd 06ded108 29998ed1 43b19349 7e0ccccf 4aa938fc 5b392875 a73ee510 03ed27e7 2b9c7071 6aaba33c 1aa94af3 b28479f6 2d0bb053 b041b04a e5ba7672 2804effd 723b4dfd c9d4222a 3a171ecb b34f3128
0 1 6 21905 0 15 49 0 6 05db9164 62e9e9bf 91c52fd6 89085a81 43b19349 fe6b92e5 e88f1cec 45f7c2dd a73ee510 3b08e48b 8f410860 5ad710aa b8eec0b1 cfef1c29 9a7936cb 9decb3fe 776ce399 d2651d6e c7d10c5e be7c41b4 6f90ebe1
0 0 174 5 14718 10 0 5 5a9ed9b0 2fe85f57 b61789da 230aba50 25c83c98 fe6b92e5 3a6d4c08 0b153874 a73ee510 d108fc83 41656eae 24604d0c 66815d59 07d13a8f d8524628 78d9f0d0 e5ba7672 f4373605 ab303097 c9d4222a 32c7478e fab2a151
0 0 7 1 15780 12 6 1 1 1 1 05db9164 8ab240be cedcacac 7967fcf5 25c83c98 7e0ccccf 5f29da0e 0b153874 a73ee510 f476fbe3 0ad37b4b 553e02c3 f9d99d81 1adce6ef 28883800 91a6eec5 1e88c74f ca533012 21ddcdc9 5840adea a97b62ca 423fab69 727a7cc7 445bbe3b 6935065e
0 0 2 1 1540 44 4 4 268 0 4 5 05db9164 68b3edbf 77f2f2e5 d16679b9 25c83c98 7e0ccccf fcf0132a 1f89b562 a73ee510 aed3d80e d650f1bd 9f32b866 863f8f8a b28479f6 f511c49f 31ca40b6 e5ba7672 752d8b8a dfcfc3fa c7dc6720 aee52b6f
0 7 31 1 239 1 8 9 49 1 2 0 1 68fd1e64 8084ee93 d032c263 c18be181 43b19349 fe6b92e5 cee47266 0b153874 a73ee510 14781fa9 87fe3e10 dfbb09fb 3bd6c21d b28479f6 16d2748c 84898b2a 27c07bd6 003d4f4f 0014c32a 32c7478e 3b183c5c
0 -1 12674 4 26 0 73 2 05db9164 09e68b86 eecaacb9 d268ac84 25c83c98 13718bbd 33cca6fa 0b153874 a73ee510 401ced54 683e14e9 ce76d69d 2b9fb512 b28479f6 52baadf5 7bf10350 e5ba7672 5aed7436 55dd3565 b1252a9d 3d7cfd1b 3a171ecb 3fdb382b 3d2bedd7 49d68486
0 259 4 103468 0 0 14 0 05db9164 8947f767 d8ec4c68 ac1667dd 4cf72387 7e0ccccf 3527bb7c 0b153874 7cc72ec2 3b08e48b 2b9f131d 2a63b3ee aca10c14 07d13a8f 2c14c412 11b43c2e 8efede7f bd17c3da 21ddcdc9 a458ea53 79a05ba5 32c7478e 4fb9fee0 010f6491 004f1180
1 3 145 4 108 6 4 4 31 1 2 4 8cf07265 6c2cbbdc a42bd759 8b3b6b2e 25c83c98 f00bddf8 062b5529 a73ee510 0d538fca 55795b33 6bb7b021 39795005 64c94865 af094307 c3815fe3 e5ba7672 fb299884 987d0b7a 32c7478e 145ae095
1 147 1 159966 0 1 1 0 1 68fd1e64 38d50e09 c86b2d8d 657dc3b9 25c83c98 7e0ccccf bc324536 1f89b562 7cc72ec2 474773a7 2bcfb78f 1ca7a526 e6fc496d b28479f6 06373944 ba46c3a1 e5ba7672 fffe2a63 21ddcdc9 b1252a9d eb0fc6f8 ad3062eb 32c7478e df487a73 001f3601 c27f155b
\ No newline at end of file
......@@ -25,48 +25,16 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def input(self):
def sparse_inputs():
ids = envs.get_global_env("hyper_parameters.sparse_inputs_slots", None, self._namespace)
sparse_input_ids = [
fluid.layers.data(name="S" + str(i),
shape=[1],
lod_level=1,
dtype="int64") for i in range(1, ids)
]
return sparse_input_ids
def dense_input():
dim = envs.get_global_env("hyper_parameters.dense_input_dim", None, self._namespace)
dense_input_var = fluid.layers.data(name="D",
shape=[dim],
dtype="float32")
return dense_input_var
def label_input():
label = fluid.layers.data(name="click", shape=[1], dtype="int64")
return label
self.sparse_inputs = sparse_inputs()
self.dense_input = dense_input()
self.label_input = label_input()
self._data_var.append(self.dense_input)
for input in self.sparse_inputs:
self._data_var.append(input)
self._data_var.append(self.label_input)
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
self.sparse_inputs = self._sparse_data_var[1:]
self.dense_input = self._dense_data_var[0]
self.label_input = self._sparse_data_var[0]
def net(self):
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
def embedding_layer(input):
emb = fluid.layers.embedding(
......@@ -76,25 +44,27 @@ class Model(ModelBase):
size=[sparse_feature_number, sparse_feature_dim],
param_attr=fluid.ParamAttr(
name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()),
)
emb_sum = fluid.layers.sequence_pool(
input=emb, pool_type='sum')
initializer=fluid.initializer.Uniform()), )
emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
return emb_sum
def fc(input, output_size):
output = fluid.layers.fc(
input=input, size=output_size,
act='relu', param_attr=fluid.ParamAttr(
input=input,
size=output_size,
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Normal(
scale=1.0 / math.sqrt(input.shape[1]))))
return output
sparse_embed_seq = list(map(embedding_layer, self.sparse_inputs))
concated = fluid.layers.concat(sparse_embed_seq + [self.dense_input], axis=1)
concated = fluid.layers.concat(
sparse_embed_seq + [self.dense_input], axis=1)
fcs = [concated]
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
self._namespace)
for size in hidden_layers:
fcs.append(fc(fcs[-1], size))
......@@ -109,29 +79,33 @@ class Model(ModelBase):
self.predict = predict
def avg_loss(self):
cost = fluid.layers.cross_entropy(input=self.predict, label=self.label_input)
cost = fluid.layers.cross_entropy(
input=self.predict, label=self.label_input)
avg_cost = fluid.layers.reduce_mean(cost)
self._cost = avg_cost
def metrics(self):
auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
label=self.label_input,
num_thresholds=2 ** 12,
num_thresholds=2**12,
slide_steps=20)
self._metrics["AUC"] = auc
self._metrics["BATCH_AUC"] = batch_auc
def train_net(self):
self.model._init_slots()
self.input()
self.net()
self.avg_loss()
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
def infer_net(self):
self.model._init_slots()
self.input()
self.net()
......@@ -59,6 +59,13 @@
## 使用教程
### 数据处理
参考每个模型目录数据下载&预处理脚本
```
sh run.sh
```
数据读取默认使用core/reader.py
### 训练
```
python -m paddlerec.run -m paddlerec.models.rank.dnn # 以DNN为例
......
......@@ -22,8 +22,9 @@ train:
reader:
batch_size: 2
class: "{workspace}/reader.py"
train_data_path: "{workspace}/data/train_data"
train_data_path: "{workspace}/data/slot_train_data"
sparse_slots: "label"
dense_slots: "wide_input:8 deep_input:58"
model:
models: "{workspace}/model.py"
......
mkdir train_data
mkdir test_data
mkdir data
train_path="/home/yaoxuefeng/repos/models/models/PaddleRec/ctr/wide_deep/data/adult.data"
test_path="/home/yaoxuefeng/repos/models/models/PaddleRec/ctr/wide_deep/data/adult.test"
train_data_path="/home/yaoxuefeng/repos/models/models/PaddleRec/ctr/wide_deep/train_data/train_data.csv"
test_data_path="/home/yaoxuefeng/repos/models/models/PaddleRec/ctr/wide_deep/test_data/test_data.csv"
train_path="adult.data"
test_path="adult.test"
train_data_path="./train_data/train_data.csv"
test_data_path="./test_data/test_data.csv"
#pip install -r requirements.txt
pip install -r requirements.txt
#wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
#wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
python data_preparation.py --train_path ${train_path} \
--test_path ${test_path} \
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import io
import args
import pandas as pd
from sklearn import preprocessing
def _clean_file(source_path, target_path):
"""makes changes to match the CSV format."""
with io.open(source_path, 'r') as temp_eval_file:
with io.open(target_path, 'w') as eval_file:
for line in temp_eval_file:
line = line.strip()
line = line.replace(', ', ',')
if not line or ',' not in line:
continue
if line[-1] == '.':
line = line[:-1]
line += '\n'
eval_file.write(line)
def build_model_columns(train_data_path, test_data_path):
# The column names are from
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
column_names = [
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_bracket'
]
# Load the dataset in Pandas
train_df = pd.read_csv(
train_data_path,
delimiter=',',
header=None,
index_col=None,
names=column_names)
test_df = pd.read_csv(
test_data_path,
delimiter=',',
header=None,
index_col=None,
names=column_names)
# First group of tasks according to the paper
#label_columns = ['income_50k', 'marital_stat']
categorical_columns = [
'education', 'marital_status', 'relationship', 'workclass',
'occupation'
]
for col in categorical_columns:
label_train = preprocessing.LabelEncoder()
train_df[col] = label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder()
test_df[col] = label_test.fit_transform(test_df[col])
bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(),
bins,
labels=False)
test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(),
bins,
labels=False)
base_columns = [
'education', 'marital_status', 'relationship', 'workclass',
'occupation', 'age_buckets'
]
train_df['education_occupation'] = train_df['education'].astype(
str) + '_' + train_df['occupation'].astype(str)
test_df['education_occupation'] = test_df['education'].astype(
str) + '_' + test_df['occupation'].astype(str)
train_df['age_buckets_education_occupation'] = train_df[
'age_buckets'].astype(str) + '_' + train_df['education'].astype(
str) + '_' + train_df['occupation'].astype(str)
test_df['age_buckets_education_occupation'] = test_df[
'age_buckets'].astype(str) + '_' + test_df['education'].astype(
str) + '_' + test_df['occupation'].astype(str)
crossed_columns = [
'education_occupation', 'age_buckets_education_occupation'
]
for col in crossed_columns:
label_train = preprocessing.LabelEncoder()
train_df[col] = label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder()
test_df[col] = label_test.fit_transform(test_df[col])
wide_columns = base_columns + crossed_columns
train_df_temp = pd.get_dummies(
train_df[categorical_columns], columns=categorical_columns)
test_df_temp = pd.get_dummies(
test_df[categorical_columns], columns=categorical_columns)
train_df = train_df.join(train_df_temp)
test_df = test_df.join(test_df_temp)
deep_columns = list(train_df_temp.columns) + [
'age', 'education_num', 'capital_gain', 'capital_loss',
'hours_per_week'
]
train_df['label'] = train_df['income_bracket'].apply(
lambda x: 1 if x == '>50K' else 0)
test_df['label'] = test_df['income_bracket'].apply(
lambda x: 1 if x == '>50K' else 0)
with io.open('train_data/columns.txt', 'w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(
deep_columns)) + '\n'
f.write(write_str)
f.close()
with io.open('test_data/columns.txt', 'w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(
deep_columns)) + '\n'
f.write(write_str)
f.close()
train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
train_data_path, index=False)
test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
test_data_path, index=False)
def clean_file(train_path, test_path, train_data_path, test_data_path):
_clean_file(train_path, train_data_path)
_clean_file(test_path, test_data_path)
if __name__ == '__main__':
args = args.parse_args()
clean_file(args.train_path, args.test_path, args.train_data_path,
args.test_data_path)
build_model_columns(args.train_data_path, args.test_data_path)
......@@ -11,18 +11,26 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import yaml
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
import cPickle as pickle
except ImportError:
import pickle
import paddle.fluid.incubate.data_generator as dg
from paddlerec.core.reader import Reader
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
if os.path.isfile(config):
with open(config, 'r') as rb:
_config = yaml.load(rb.read(), Loader=yaml.FullLoader)
else:
raise ValueError("reader config only support yaml")
class TrainReader(Reader):
def init(self):
pass
......@@ -41,6 +49,20 @@ class TrainReader(Reader):
def data_iter():
wide_feat, deep_deat, label = self._process_line(line)
yield [('wide_input', wide_feat), ('deep_input', deep_deat), ('label', label)]
s = ""
for i in [('wide_input', wide_feat), ('deep_input', deep_deat),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
s += " " + k + ":" + str(j)
print s.strip()
yield None
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
sh create_data.sh
mkdir slot_train_data
for i in `ls ./train_data`
do
cat train_data/$i | python get_slot_data.py > slot_train_data/$i
done
mkdir slot_test_data
for i in `ls ./test_data`
do
cat test_data/$i | python get_slot_data.py > slot_test_data/$i
done
......@@ -25,27 +25,27 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def wide_part(self, data):
out = fluid.layers.fc(input=data,
size=1,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0,
scale=1.0 / math.sqrt(
data.shape[
1])),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
act=None,
name='wide')
out = fluid.layers.fc(
input=data,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0 / math.sqrt(data.shape[1])),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
act=None,
name='wide')
return out
def fc(self, data, hidden_units, active, tag):
output = fluid.layers.fc(input=data,
size=hidden_units,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0,
scale=1.0 / math.sqrt(
data.shape[
1]))),
act=active,
name=tag)
output = fluid.layers.fc(
input=data,
size=hidden_units,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))),
act=active,
name=tag)
return output
......@@ -57,52 +57,71 @@ class Model(ModelBase):
return l3
def train_net(self):
wide_input = fluid.data(name='wide_input', shape=[None, 8], dtype='float32')
deep_input = fluid.data(name='deep_input', shape=[None, 58], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='float32')
self._data_var.append(wide_input)
self._data_var.append(deep_input)
self._data_var.append(label)
hidden1_units = envs.get_global_env("hyper_parameters.hidden1_units", 75, self._namespace)
hidden2_units = envs.get_global_env("hyper_parameters.hidden2_units", 50, self._namespace)
hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units", 25, self._namespace)
self.model._init_slots()
wide_input = self._dense_data_var[0]
deep_input = self._dense_data_var[1]
label = self._sparse_data_var[0]
hidden1_units = envs.get_global_env("hyper_parameters.hidden1_units",
75, self._namespace)
hidden2_units = envs.get_global_env("hyper_parameters.hidden2_units",
50, self._namespace)
hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units",
25, self._namespace)
wide_output = self.wide_part(wide_input)
deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units, hidden3_units)
wide_model = fluid.layers.fc(input=wide_output,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
act=None,
name='w_wide')
deep_model = fluid.layers.fc(input=deep_output,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
act=None,
name='w_deep')
deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units,
hidden3_units)
wide_model = fluid.layers.fc(
input=wide_output,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0)),
act=None,
name='w_wide')
deep_model = fluid.layers.fc(
input=deep_output,
size=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0)),
act=None,
name='w_deep')
prediction = fluid.layers.elementwise_add(wide_model, deep_model)
pred = fluid.layers.sigmoid(fluid.layers.clip(prediction, min=-15.0, max=15.0), name="prediction")
pred = fluid.layers.sigmoid(
fluid.layers.clip(
prediction, min=-15.0, max=15.0),
name="prediction")
num_seqs = fluid.layers.create_tensor(dtype='int64')
acc = fluid.layers.accuracy(input=pred, label=fluid.layers.cast(x=label, dtype='int64'), total=num_seqs)
auc_var, batch_auc, auc_states = fluid.layers.auc(input=pred, label=fluid.layers.cast(x=label, dtype='int64'))
acc = fluid.layers.accuracy(
input=pred,
label=fluid.layers.cast(
x=label, dtype='int64'),
total=num_seqs)
auc_var, batch_auc, auc_states = fluid.layers.auc(
input=pred, label=fluid.layers.cast(
x=label, dtype='int64'))
self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc
self._metrics["ACC"] = acc
cost = fluid.layers.sigmoid_cross_entropy_with_logits(x=prediction, label=label)
cost = fluid.layers.sigmoid_cross_entropy_with_logits(
x=prediction, label=fluid.layers.cast(
label, dtype='float32'))
avg_cost = fluid.layers.mean(cost)
self._cost = avg_cost
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
def infer_net(self, parameter_list):
self.model._init_slots()
self.deepfm_net()
......@@ -22,8 +22,9 @@ train:
reader:
batch_size: 2
class: "{workspace}/criteo_reader.py"
train_data_path: "{workspace}/data/train_data"
train_data_path: "{workspace}/data/slot_train_data"
sparse_slots: "label feat_idx"
dense_slots: "feat_value:39"
model:
models: "{workspace}/model.py"
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import sys
......
......@@ -12,17 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import yaml
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
import cPickle as pickle
except ImportError:
import pickle
import paddle.fluid.incubate.data_generator as dg
from paddlerec.core.reader import Reader
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
if os.path.isfile(config):
with open(config, 'r') as rb:
_config = yaml.load(rb.read(), Loader=yaml.FullLoader)
else:
raise ValueError("reader config only support yaml")
class TrainReader(Reader):
def init(self):
pass
......@@ -39,7 +47,20 @@ class TrainReader(Reader):
def generate_sample(self, line):
def data_iter():
feat_idx, feat_value, label = self._process_line(line)
yield [('feat_idx', feat_idx), ('feat_value', feat_value), ('label',
label)]
s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
s += " " + k + ":" + str(j)
print s.strip()
yield None
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
python download.py
mkdir -p slot_train_data/tr
for i in `ls ./train_data/tr`
do
cat train_data/tr/$i | python get_slot_data.py > slot_train_data/tr/$i
done
mkdir slot_test_data/ev
for i in `ls ./test_data/ev`
do
cat test_data/ev/$i | python get_slot_data.py > slot_test_data/ev/$i
done
......@@ -28,17 +28,22 @@ class Model(ModelBase):
loc=0.0, scale=init_value_)
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
# ------------------------- network input --------------------------
num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace)
raw_feat_idx = fluid.data(name='feat_idx', shape=[None, num_field], dtype='int64')
raw_feat_value = fluid.data(name='feat_value', shape=[None, num_field], dtype='float32')
self.label = fluid.data(name='label', shape=[None, 1], dtype='float32') # None * 1
feat_idx = fluid.layers.reshape(raw_feat_idx, [-1, 1]) # (None * num_field) * 1
feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
num_field = envs.get_global_env("hyper_parameters.num_field", None,
self._namespace)
raw_feat_idx = self._sparse_data_var[1]
raw_feat_value = self._dense_data_var[0]
self.label = self._sparse_data_var[0]
feat_idx = raw_feat_idx
feat_value = fluid.layers.reshape(
raw_feat_value, [-1, num_field, 1]) # None * num_field * 1
feat_embeddings = fluid.embedding(
input=feat_idx,
......@@ -47,20 +52,11 @@ class Model(ModelBase):
size=[sparse_feature_number + 1, sparse_feature_dim],
padding_idx=0,
param_attr=fluid.ParamAttr(initializer=initer))
feat_embeddings = fluid.layers.reshape(
feat_embeddings,
[-1, num_field, sparse_feature_dim]) # None * num_field * embedding_size
feat_embeddings = fluid.layers.reshape(feat_embeddings, [
-1, num_field, sparse_feature_dim
]) # None * num_field * embedding_size
feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size
# ------------------------- set _data_var --------------------------
self._data_var.append(raw_feat_idx)
self._data_var.append(raw_feat_value)
self._data_var.append(self.label)
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
# -------------------- linear --------------------
weights_linear = fluid.embedding(
......@@ -81,7 +77,8 @@ class Model(ModelBase):
# -------------------- CIN --------------------
layer_sizes_cin = envs.get_global_env("hyper_parameters.layer_sizes_cin", None, self._namespace)
layer_sizes_cin = envs.get_global_env(
"hyper_parameters.layer_sizes_cin", None, self._namespace)
Xs = [feat_embeddings]
last_s = num_field
for s in layer_sizes_cin:
......@@ -92,7 +89,8 @@ class Model(ModelBase):
1]) # None, embedding_size, num_field, 1
X_k = fluid.layers.reshape(
fluid.layers.transpose(Xs[-1], [0, 2, 1]),
[-1, sparse_feature_dim, 1, last_s]) # None, embedding_size, 1, last_s
[-1, sparse_feature_dim, 1,
last_s]) # None, embedding_size, 1, last_s
Z_k_1 = fluid.layers.matmul(
X_0, X_k) # None, embedding_size, num_field, last_s
......@@ -132,16 +130,19 @@ class Model(ModelBase):
# -------------------- DNN --------------------
layer_sizes_dnn = envs.get_global_env("hyper_parameters.layer_sizes_dnn", None, self._namespace)
act = envs.get_global_env("hyper_parameters.act", None, self._namespace)
layer_sizes_dnn = envs.get_global_env(
"hyper_parameters.layer_sizes_dnn", None, self._namespace)
act = envs.get_global_env("hyper_parameters.act", None,
self._namespace)
y_dnn = fluid.layers.reshape(feat_embeddings,
[-1, num_field * sparse_feature_dim])
for s in layer_sizes_dnn:
y_dnn = fluid.layers.fc(input=y_dnn,
size=s,
act=act,
param_attr=fluid.ParamAttr(initializer=initer),
bias_attr=None)
y_dnn = fluid.layers.fc(
input=y_dnn,
size=s,
act=act,
param_attr=fluid.ParamAttr(initializer=initer),
bias_attr=None)
y_dnn = fluid.layers.fc(input=y_dnn,
size=1,
act=None,
......@@ -153,9 +154,13 @@ class Model(ModelBase):
self.predict = fluid.layers.sigmoid(y_linear + y_cin + y_dnn)
def train_net(self):
self.model._init_slots()
self.xdeepfm_net()
cost = fluid.layers.log_loss(input=self.predict, label=self.label, epsilon=0.0000001)
cost = fluid.layers.log_loss(
input=self.predict,
label=fluid.layers.cast(self.label, "float32"),
epsilon=0.0000001)
batch_cost = fluid.layers.reduce_mean(cost)
self._cost = batch_cost
......@@ -169,9 +174,11 @@ class Model(ModelBase):
self._metrics["BATCH_AUC"] = batch_auc_var
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
return optimizer
def infer_net(self, parameter_list):
self.model._init_slots()
self.xdeepfm_net()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -31,5 +31,3 @@ mv diginetica/train.txt train_data
mkdir test_data
mv diginetica/test.txt test_data
......@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
def init(self):
self.batch_size = envs.get_global_env("batch_size", None, "evaluate.reader")
self.batch_size = envs.get_global_env("batch_size", None,
"evaluate.reader")
self.input = []
self.length = None
......@@ -34,7 +35,8 @@ class EvaluateReader(Reader):
with open(f, "r") as fin:
for line in fin:
line = line.strip().split('\t')
res.append(tuple([map(int, line[0].split(',')), int(line[1])]))
res.append(
tuple([map(int, line[0].split(',')), int(line[1])]))
return res
def make_data(self, cur_batch, batch_size):
......@@ -75,10 +77,8 @@ class EvaluateReader(Reader):
u_deg_out[np.where(u_deg_out == 0)] = 1
adj_out.append(np.divide(adj.transpose(), u_deg_out).transpose())
seq_index.append(
[[id, np.where(node == i)[0][0]] for i in e[0]])
last_index.append(
[id, np.where(node == e[0][last_id[id]])[0][0]])
seq_index.append([[id, np.where(node == i)[0][0]] for i in e[0]])
last_index.append([id, np.where(node == e[0][last_id[id]])[0][0]])
label.append(e[1] - 1)
mask.append([[1] * (last_id[id] + 1) + [0] *
(max_seq_len - last_id[id] - 1)])
......@@ -101,10 +101,13 @@ class EvaluateReader(Reader):
def _reader():
random.shuffle(self.input)
group_remain = self.length % batch_group_size
for bg_id in range(0, self.length - group_remain, batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id + batch_group_size])
for bg_id in range(0, self.length - group_remain,
batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id +
batch_group_size])
if train:
cur_bg = sorted(cur_bg, key=lambda x: len(x[0]), reverse=True)
cur_bg = sorted(
cur_bg, key=lambda x: len(x[0]), reverse=True)
for i in range(0, batch_group_size, batch_size):
cur_batch = cur_bg[i:i + batch_size]
yield self.make_data(cur_batch, batch_size)
......
......@@ -30,15 +30,21 @@ class Model(ModelBase):
def init_config(self):
self._fetch_interval = 1
self.items_num, self.ins_num = self.config_read(
envs.get_global_env("hyper_parameters.config_path", None, self._namespace))
self.train_batch_size = envs.get_global_env("batch_size", None, "train.reader")
self.evaluate_batch_size = envs.get_global_env("batch_size", None, "evaluate.reader")
self.hidden_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
self.step = envs.get_global_env("hyper_parameters.gnn_propogation_steps", None, self._namespace)
envs.get_global_env("hyper_parameters.config_path", None,
self._namespace))
self.train_batch_size = envs.get_global_env("batch_size", None,
"train.reader")
self.evaluate_batch_size = envs.get_global_env("batch_size", None,
"evaluate.reader")
self.hidden_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
self.step = envs.get_global_env(
"hyper_parameters.gnn_propogation_steps", None, self._namespace)
def config_read(self, config_path=None):
if config_path is None:
raise ValueError("please set train.model.hyper_parameters.config_path at first")
raise ValueError(
"please set train.model.hyper_parameters.config_path at first")
with open(config_path, "r") as fin:
item_nums = int(fin.readline().strip())
ins_nums = int(fin.readline().strip())
......@@ -46,100 +52,108 @@ class Model(ModelBase):
def input(self, bs):
self.items = fluid.data(
name="items",
shape=[bs, -1],
name="items", shape=[bs, -1],
dtype="int64") # [batch_size, uniq_max]
self.seq_index = fluid.data(
name="seq_index",
shape=[bs, -1, 2],
name="seq_index", shape=[bs, -1, 2],
dtype="int32") # [batch_size, seq_max, 2]
self.last_index = fluid.data(
name="last_index",
shape=[bs, 2],
dtype="int32") # [batch_size, 2]
name="last_index", shape=[bs, 2], dtype="int32") # [batch_size, 2]
self.adj_in = fluid.data(
name="adj_in",
shape=[bs, -1, -1],
name="adj_in", shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max]
self.adj_out = fluid.data(
name="adj_out",
shape=[bs, -1, -1],
name="adj_out", shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max]
self.mask = fluid.data(
name="mask",
shape=[bs, -1, 1],
name="mask", shape=[bs, -1, 1],
dtype="float32") # [batch_size, seq_max, 1]
self.label = fluid.data(
name="label",
shape=[bs, 1],
dtype="int64") # [batch_size, 1]
name="label", shape=[bs, 1], dtype="int64") # [batch_size, 1]
res = [self.items, self.seq_index, self.last_index, self.adj_in, self.adj_out, self.mask, self.label]
res = [
self.items, self.seq_index, self.last_index, self.adj_in,
self.adj_out, self.mask, self.label
]
return res
def train_input(self):
res = self.input(self.train_batch_size)
self._data_var = res
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace)
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
False, self._namespace)
if self._platform != "LINUX" or use_dataloader:
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=256, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=256,
use_double_buffer=False,
iterable=False)
def net(self, items_num, hidden_size, step, bs):
stdv = 1.0 / math.sqrt(hidden_size)
def embedding_layer(input, table_name, emb_dim, initializer_instance=None):
def embedding_layer(input,
table_name,
emb_dim,
initializer_instance=None):
emb = fluid.embedding(
input=input,
size=[items_num, emb_dim],
param_attr=fluid.ParamAttr(
name=table_name,
initializer=initializer_instance),
)
name=table_name, initializer=initializer_instance), )
return emb
sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv)
items_emb = embedding_layer(self.items, "emb", hidden_size, sparse_initializer)
items_emb = embedding_layer(self.items, "emb", hidden_size,
sparse_initializer)
pre_state = items_emb
for i in range(step):
pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size])
pre_state = layers.reshape(
x=pre_state, shape=[bs, -1, hidden_size])
state_in = layers.fc(
input=pre_state,
name="state_in",
size=hidden_size,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_out = layers.fc(
input=pre_state,
name="state_out",
size=hidden_size,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_adj_in = layers.matmul(self.adj_in, state_in) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(self.adj_out, state_out) # [batch_size, uniq_max, h]
state_adj_in = layers.matmul(self.adj_in,
state_in) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(
self.adj_out, state_out) # [batch_size, uniq_max, h]
gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
gru_fc = layers.fc(
input=gru_input,
name="gru_fc",
size=3 * hidden_size,
bias_attr=False)
gru_input = layers.reshape(
x=gru_input, shape=[-1, hidden_size * 2])
gru_fc = layers.fc(input=gru_input,
name="gru_fc",
size=3 * hidden_size,
bias_attr=False)
pre_state, _, _ = fluid.layers.gru_unit(
input=gru_fc,
hidden=layers.reshape(x=pre_state, shape=[-1, hidden_size]),
hidden=layers.reshape(
x=pre_state, shape=[-1, hidden_size]),
size=3 * hidden_size)
final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
......@@ -153,24 +167,22 @@ class Model(ModelBase):
bias_attr=False,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, h]
last_fc = layers.fc(
input=last,
name="last_fc",
size=hidden_size,
bias_attr=False,
act=None,
num_flatten_dims=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [bathc_size, h]
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, h]
last_fc = layers.fc(input=last,
name="last_fc",
size=hidden_size,
bias_attr=False,
act=None,
num_flatten_dims=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [bathc_size, h]
seq_fc_t = layers.transpose(
seq_fc, perm=[1, 0, 2]) # [seq_max, batch_size, h]
add = layers.elementwise_add(
seq_fc_t, last_fc) # [seq_max, batch_size, h]
add = layers.elementwise_add(seq_fc_t,
last_fc) # [seq_max, batch_size, h]
b = layers.create_parameter(
shape=[hidden_size],
dtype='float32',
......@@ -188,12 +200,13 @@ class Model(ModelBase):
act=None,
num_flatten_dims=2,
bias_attr=False,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, 1]
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, 1]
weight *= self.mask
weight_mask = layers.elementwise_mul(seq, weight, axis=0) # [batch_size, seq_max, h]
global_attention = layers.reduce_sum(weight_mask, dim=1) # [batch_size, h]
weight_mask = layers.elementwise_mul(
seq, weight, axis=0) # [batch_size, seq_max, h]
global_attention = layers.reduce_sum(
weight_mask, dim=1) # [batch_size, h]
final_attention = layers.concat(
[global_attention, last], axis=1) # [batch_size, 2*h]
......@@ -213,7 +226,8 @@ class Model(ModelBase):
# persistable=True,
# name="all_vocab")
all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32')
all_vocab = fluid.layers.cast(x=fluid.layers.assign(all_vocab), dtype='int64')
all_vocab = fluid.layers.cast(
x=fluid.layers.assign(all_vocab), dtype='int64')
all_emb = fluid.embedding(
input=all_vocab,
......@@ -240,15 +254,19 @@ class Model(ModelBase):
def train_net(self):
self.train_input()
self.net(self.items_num, self.hidden_size, self.step, self.train_batch_size)
self.net(self.items_num, self.hidden_size, self.step,
self.train_batch_size)
self.avg_loss()
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
step_per_epoch = self.ins_num // self.train_batch_size
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None, self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None, self._namespace)
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
self._namespace)
l2 = envs.get_global_env("hyper_parameters.l2", None, self._namespace)
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
......@@ -266,10 +284,14 @@ class Model(ModelBase):
self._infer_data_var = res
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
self.net(self.items_num, self.hidden_size, self.step, self.evaluate_batch_size)
self.net(self.items_num, self.hidden_size, self.step,
self.evaluate_batch_size)
self._infer_results['acc'] = self.acc
self._infer_results['loss'] = self.loss
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import time
import pickle
......@@ -10,6 +24,7 @@ parser.add_argument(
help='dataset dir: diginetica/yoochoose1_4/yoochoose1_64/sample')
opt = parser.parse_args()
def process_data(file_type):
path = os.path.join(opt.data_dir, file_type)
output_path = os.path.splitext(path)[0] + ".txt"
......@@ -23,6 +38,7 @@ def process_data(file_type):
fout.write(str(data[i][1]))
fout.write("\n")
process_data("train")
process_data("test")
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import requests
import sys
import time
......
......@@ -23,7 +23,8 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.batch_size = envs.get_global_env("batch_size", None, "train.reader")
self.batch_size = envs.get_global_env("batch_size", None,
"train.reader")
self.input = []
self.length = None
......@@ -34,7 +35,8 @@ class TrainReader(Reader):
with open(f, "r") as fin:
for line in fin:
line = line.strip().split('\t')
res.append(tuple([map(int, line[0].split(',')), int(line[1])]))
res.append(
tuple([map(int, line[0].split(',')), int(line[1])]))
return res
def make_data(self, cur_batch, batch_size):
......@@ -75,10 +77,8 @@ class TrainReader(Reader):
u_deg_out[np.where(u_deg_out == 0)] = 1
adj_out.append(np.divide(adj.transpose(), u_deg_out).transpose())
seq_index.append(
[[id, np.where(node == i)[0][0]] for i in e[0]])
last_index.append(
[id, np.where(node == e[0][last_id[id]])[0][0]])
seq_index.append([[id, np.where(node == i)[0][0]] for i in e[0]])
last_index.append([id, np.where(node == e[0][last_id[id]])[0][0]])
label.append(e[1] - 1)
mask.append([[1] * (last_id[id] + 1) + [0] *
(max_seq_len - last_id[id] - 1)])
......@@ -101,10 +101,13 @@ class TrainReader(Reader):
def _reader():
random.shuffle(self.input)
group_remain = self.length % batch_group_size
for bg_id in range(0, self.length - group_remain, batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id + batch_group_size])
for bg_id in range(0, self.length - group_remain,
batch_group_size):
cur_bg = copy.deepcopy(self.input[bg_id:bg_id +
batch_group_size])
if train:
cur_bg = sorted(cur_bg, key=lambda x: len(x[0]), reverse=True)
cur_bg = sorted(
cur_bg, key=lambda x: len(x[0]), reverse=True)
for i in range(0, batch_group_size, batch_size):
cur_batch = cur_bg[i:i + batch_size]
yield self.make_data(cur_batch, batch_size)
......
......@@ -24,14 +24,22 @@ class Model(ModelBase):
def all_vocab_network(self, is_infer=False):
""" network definition """
recall_k = envs.get_global_env("hyper_parameters.recall_k", None, self._namespace)
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
hid_size = envs.get_global_env("hyper_parameters.hid_size", None, self._namespace)
init_low_bound = envs.get_global_env("hyper_parameters.init_low_bound", None, self._namespace)
init_high_bound = envs.get_global_env("hyper_parameters.init_high_bound", None, self._namespace)
emb_lr_x = envs.get_global_env("hyper_parameters.emb_lr_x", None, self._namespace)
gru_lr_x = envs.get_global_env("hyper_parameters.gru_lr_x", None, self._namespace)
fc_lr_x = envs.get_global_env("hyper_parameters.fc_lr_x", None, self._namespace)
recall_k = envs.get_global_env("hyper_parameters.recall_k", None,
self._namespace)
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
self._namespace)
hid_size = envs.get_global_env("hyper_parameters.hid_size", None,
self._namespace)
init_low_bound = envs.get_global_env("hyper_parameters.init_low_bound",
None, self._namespace)
init_high_bound = envs.get_global_env(
"hyper_parameters.init_high_bound", None, self._namespace)
emb_lr_x = envs.get_global_env("hyper_parameters.emb_lr_x", None,
self._namespace)
gru_lr_x = envs.get_global_env("hyper_parameters.gru_lr_x", None,
self._namespace)
fc_lr_x = envs.get_global_env("hyper_parameters.fc_lr_x", None,
self._namespace)
# Input data
src_wordseq = fluid.data(
name="src_wordseq", shape=[None, 1], dtype="int64", lod_level=1)
......@@ -41,7 +49,10 @@ class Model(ModelBase):
if is_infer:
self._infer_data_var = [src_wordseq, dst_wordseq]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
emb = fluid.embedding(
input=src_wordseq,
......@@ -56,7 +67,8 @@ class Model(ModelBase):
size=hid_size * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
low=init_low_bound,
high=init_high_bound),
learning_rate=gru_lr_x))
gru_h0 = fluid.layers.dynamic_gru(
input=fc0,
......
......@@ -25,9 +25,12 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def input_data(self, is_infer=False):
user_input = fluid.data(name="user_input", shape=[-1, 1], dtype="int64", lod_level=0)
item_input = fluid.data(name="item_input", shape=[-1, 1], dtype="int64", lod_level=0)
label = fluid.data(name="label", shape=[-1, 1], dtype="int64", lod_level=0)
user_input = fluid.data(
name="user_input", shape=[-1, 1], dtype="int64", lod_level=0)
item_input = fluid.data(
name="item_input", shape=[-1, 1], dtype="int64", lod_level=0)
label = fluid.data(
name="label", shape=[-1, 1], dtype="int64", lod_level=0)
if is_infer:
inputs = [user_input] + [item_input]
else:
......@@ -35,81 +38,104 @@ class Model(ModelBase):
self._data_var = inputs
return inputs
def net(self, inputs, is_infer=False):
num_users = envs.get_global_env("hyper_parameters.num_users", None, self._namespace)
num_items = envs.get_global_env("hyper_parameters.num_items", None, self._namespace)
latent_dim = envs.get_global_env("hyper_parameters.latent_dim", None, self._namespace)
layers = envs.get_global_env("hyper_parameters.layers", None, self._namespace)
num_layer = len(layers) #Number of layers in the MLP
MF_Embedding_User = fluid.embedding(input=inputs[0],
size=[num_users, latent_dim],
param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
is_sparse=True)
MF_Embedding_Item = fluid.embedding(input=inputs[1],
size=[num_items, latent_dim],
param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
is_sparse=True)
MLP_Embedding_User = fluid.embedding(input=inputs[0],
size=[num_users, int(layers[0] / 2)],
param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
is_sparse=True)
MLP_Embedding_Item = fluid.embedding(input=inputs[1],
size=[num_items, int(layers[0] / 2)],
param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
is_sparse=True)
num_users = envs.get_global_env("hyper_parameters.num_users", None,
self._namespace)
num_items = envs.get_global_env("hyper_parameters.num_items", None,
self._namespace)
latent_dim = envs.get_global_env("hyper_parameters.latent_dim", None,
self._namespace)
layers = envs.get_global_env("hyper_parameters.layers", None,
self._namespace)
num_layer = len(layers) #Number of layers in the MLP
MF_Embedding_User = fluid.embedding(
input=inputs[0],
size=[num_users, latent_dim],
param_attr=fluid.initializer.Normal(
loc=0.0, scale=0.01),
is_sparse=True)
MF_Embedding_Item = fluid.embedding(
input=inputs[1],
size=[num_items, latent_dim],
param_attr=fluid.initializer.Normal(
loc=0.0, scale=0.01),
is_sparse=True)
MLP_Embedding_User = fluid.embedding(
input=inputs[0],
size=[num_users, int(layers[0] / 2)],
param_attr=fluid.initializer.Normal(
loc=0.0, scale=0.01),
is_sparse=True)
MLP_Embedding_Item = fluid.embedding(
input=inputs[1],
size=[num_items, int(layers[0] / 2)],
param_attr=fluid.initializer.Normal(
loc=0.0, scale=0.01),
is_sparse=True)
# MF part
mf_user_latent = fluid.layers.flatten(x=MF_Embedding_User, axis=1)
mf_item_latent = fluid.layers.flatten(x=MF_Embedding_Item, axis=1)
mf_vector = fluid.layers.elementwise_mul(mf_user_latent, mf_item_latent)
mf_vector = fluid.layers.elementwise_mul(mf_user_latent,
mf_item_latent)
# MLP part
# The 0-th layer is the concatenation of embedding layers
mlp_user_latent = fluid.layers.flatten(x=MLP_Embedding_User, axis=1)
mlp_item_latent = fluid.layers.flatten(x=MLP_Embedding_Item, axis=1)
mlp_vector = fluid.layers.concat(input=[mlp_user_latent, mlp_item_latent], axis=-1)
mlp_vector = fluid.layers.concat(
input=[mlp_user_latent, mlp_item_latent], axis=-1)
for i in range(1, num_layer):
mlp_vector = fluid.layers.fc(input=mlp_vector,
size=layers[i],
act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(mlp_vector.shape[1])),
regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)),
name='layer_' + str(i))
mlp_vector = fluid.layers.fc(
input=mlp_vector,
size=layers[i],
act='relu',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=1.0 / math.sqrt(mlp_vector.shape[1])),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
name='layer_' + str(i))
# Concatenate MF and MLP parts
predict_vector = fluid.layers.concat(input=[mf_vector, mlp_vector], axis=-1)
predict_vector = fluid.layers.concat(
input=[mf_vector, mlp_vector], axis=-1)
# Final prediction layer
prediction = fluid.layers.fc(input=predict_vector,
size=1,
act='sigmoid',
param_attr=fluid.initializer.MSRAInitializer(uniform=True),
name='prediction')
prediction = fluid.layers.fc(
input=predict_vector,
size=1,
act='sigmoid',
param_attr=fluid.initializer.MSRAInitializer(uniform=True),
name='prediction')
if is_infer:
self._infer_results["prediction"] = prediction
return
cost = fluid.layers.log_loss(input=prediction, label=fluid.layers.cast(x=inputs[2], dtype='float32'))
cost = fluid.layers.log_loss(
input=prediction,
label=fluid.layers.cast(
x=inputs[2], dtype='float32'))
avg_cost = fluid.layers.mean(cost)
self._cost = avg_cost
self._metrics["cost"] = avg_cost
def train_net(self):
input_data = self.input_data()
self.net(input_data)
def infer_net(self):
self._infer_data_var = self.input_data(is_infer=True)
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
self.net(self._infer_data_var, is_infer=True)
......@@ -33,7 +33,9 @@ class EvaluateReader(Reader):
This function needs to be implemented by the user, based on data format
"""
features = line.strip().split(',')
feature_name = ["user_input", "item_input"]
yield zip(feature_name, [[int(features[0])]] + [[int(features[1])]])
yield zip(feature_name,
[[int(features[0])]] + [[int(features[1])]])
return reader
......@@ -33,10 +33,9 @@ class TrainReader(Reader):
This function needs to be implemented by the user, based on data format
"""
features = line.strip().split(',')
feature_name = ["user_input", "item_input", "label"]
yield zip(feature_name, [[int(features[0])]] + [[int(features[1])]] + [[int(features[2])]])
yield zip(feature_name, [[int(features[0])]] +
[[int(features[1])]] + [[int(features[2])]])
return reader
......@@ -78,4 +78,3 @@ python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
| MOVIELENS | NCF | 0.688 | -- |
| -- | Youtube | -- | -- |
| 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 |
......@@ -79,9 +79,12 @@ class Model(ModelBase):
return correct
def train(self):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace)
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
self._namespace)
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None,
self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None,
self._namespace)
emb_shape = [vocab_size, emb_dim]
self.user_encoder = GrnnEncoder()
......@@ -131,24 +134,34 @@ class Model(ModelBase):
self.train()
def infer(self):
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace)
vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
self._namespace)
emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None,
self._namespace)
hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None,
self._namespace)
user_data = fluid.data(
name="user", shape=[None, 1], dtype="int64", lod_level=1)
all_item_data = fluid.data(
name="all_item", shape=[None, vocab_size], dtype="int64")
pos_label = fluid.data(name="pos_label", shape=[None, 1], dtype="int64")
pos_label = fluid.data(
name="pos_label", shape=[None, 1], dtype="int64")
self._infer_data_var = [user_data, all_item_data, pos_label]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
user_emb = fluid.embedding(
input=user_data, size=[vocab_size, emb_dim], param_attr="emb.item")
all_item_emb = fluid.embedding(
input=all_item_data, size=[vocab_size, emb_dim], param_attr="emb.item")
all_item_emb_re = fluid.layers.reshape(x=all_item_emb, shape=[-1, emb_dim])
input=all_item_data,
size=[vocab_size, emb_dim],
param_attr="emb.item")
all_item_emb_re = fluid.layers.reshape(
x=all_item_emb, shape=[-1, emb_dim])
user_encoder = GrnnEncoder()
user_enc = user_encoder.forward(user_emb)
......@@ -156,7 +169,8 @@ class Model(ModelBase):
size=hidden_size,
param_attr='user.w',
bias_attr="user.b")
user_exp = fluid.layers.expand(x=user_hid, expand_times=[1, vocab_size])
user_exp = fluid.layers.expand(
x=user_hid, expand_times=[1, vocab_size])
user_re = fluid.layers.reshape(x=user_exp, shape=[-1, hidden_size])
all_item_hid = fluid.layers.fc(input=all_item_emb_re,
......
......@@ -22,7 +22,8 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
def init(self):
self.vocab_size = envs.get_global_env("vocab_size", 10, "train.model.hyper_parameters")
self.vocab_size = envs.get_global_env("vocab_size", 10,
"train.model.hyper_parameters")
def generate_sample(self, line):
"""
......@@ -39,6 +40,9 @@ class EvaluateReader(Reader):
src = conv_ids[:boundary]
pos_tgt = [conv_ids[boundary]]
feature_name = ["user", "all_item", "p_item"]
yield zip(feature_name, [src] + [np.arange(self.vocab_size).astype("int64").tolist()] + [pos_tgt])
yield zip(
feature_name,
[src] + [np.arange(self.vocab_size).astype("int64").tolist()] +
[pos_tgt])
return reader
......@@ -24,46 +24,57 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def input(self):
neg_num = int(envs.get_global_env(
"hyper_parameters.neg_num", None, self._namespace))
self.input_word = fluid.data(name="input_word", shape=[
None, 1], dtype='int64')
self.true_word = fluid.data(name='true_label', shape=[
None, 1], dtype='int64')
neg_num = int(
envs.get_global_env("hyper_parameters.neg_num", None,
self._namespace))
self.input_word = fluid.data(
name="input_word", shape=[None, 1], dtype='int64')
self.true_word = fluid.data(
name='true_label', shape=[None, 1], dtype='int64')
self._data_var.append(self.input_word)
self._data_var.append(self.true_word)
with_shuffle_batch = bool(int(envs.get_global_env(
"hyper_parameters.with_shuffle_batch", None, self._namespace)))
with_shuffle_batch = bool(
int(
envs.get_global_env("hyper_parameters.with_shuffle_batch",
None, self._namespace)))
if not with_shuffle_batch:
self.neg_word = fluid.data(name="neg_label", shape=[
None, neg_num], dtype='int64')
self.neg_word = fluid.data(
name="neg_label", shape=[None, neg_num], dtype='int64')
self._data_var.append(self.neg_word)
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def net(self):
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
neg_num = int(envs.get_global_env(
"hyper_parameters.neg_num", None, self._namespace))
neg_num = int(
envs.get_global_env("hyper_parameters.neg_num", None,
self._namespace))
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
with_shuffle_batch = bool(int(envs.get_global_env(
"hyper_parameters.with_shuffle_batch", None, self._namespace)))
with_shuffle_batch = bool(
int(
envs.get_global_env("hyper_parameters.with_shuffle_batch",
None, self._namespace)))
def embedding_layer(input, table_name, emb_dim, initializer_instance=None, squeeze=False):
def embedding_layer(input,
table_name,
emb_dim,
initializer_instance=None,
squeeze=False):
emb = fluid.embedding(
input=input,
is_sparse=True,
is_distributed=is_distributed,
size=[sparse_feature_number, emb_dim],
param_attr=fluid.ParamAttr(
name=table_name,
initializer=initializer_instance),
)
name=table_name, initializer=initializer_instance), )
if squeeze:
return fluid.layers.squeeze(input=emb, axes=[1])
else:
......@@ -73,35 +84,38 @@ class Model(ModelBase):
emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
emb_w_initializer = fluid.initializer.Constant(value=0.0)
input_emb = embedding_layer(
self.input_word, "emb", sparse_feature_dim, emb_initializer, True)
true_emb_w = embedding_layer(
self.true_word, "emb_w", sparse_feature_dim, emb_w_initializer, True)
true_emb_b = embedding_layer(
self.true_word, "emb_b", 1, emb_w_initializer, True)
input_emb = embedding_layer(self.input_word, "emb", sparse_feature_dim,
emb_initializer, True)
true_emb_w = embedding_layer(self.true_word, "emb_w",
sparse_feature_dim, emb_w_initializer,
True)
true_emb_b = embedding_layer(self.true_word, "emb_b", 1,
emb_w_initializer, True)
if with_shuffle_batch:
neg_emb_w_list = []
for i in range(neg_num):
neg_emb_w_list.append(fluid.contrib.layers.shuffle_batch(
true_emb_w)) # shuffle true_word
neg_emb_w_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_w)) # shuffle true_word
neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
neg_emb_w = fluid.layers.reshape(
neg_emb_w_concat, shape=[-1, neg_num, sparse_feature_dim])
neg_emb_b_list = []
for i in range(neg_num):
neg_emb_b_list.append(fluid.contrib.layers.shuffle_batch(
true_emb_b)) # shuffle true_word
neg_emb_b_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_b)) # shuffle true_word
neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0)
neg_emb_b_vec = fluid.layers.reshape(
neg_emb_b, shape=[-1, neg_num])
else:
neg_emb_w = embedding_layer(
self.neg_word, "emb_w", sparse_feature_dim, emb_w_initializer)
neg_emb_b = embedding_layer(
self.neg_word, "emb_b", 1, emb_w_initializer)
neg_emb_w = embedding_layer(self.neg_word, "emb_w",
sparse_feature_dim, emb_w_initializer)
neg_emb_b = embedding_layer(self.neg_word, "emb_b", 1,
emb_w_initializer)
neg_emb_b_vec = fluid.layers.reshape(
neg_emb_b, shape=[-1, neg_num])
......@@ -117,7 +131,8 @@ class Model(ModelBase):
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w, transpose_y=True)
neg_logits = fluid.layers.elementwise_add(
fluid.layers.reshape(neg_matmul, shape=[-1, neg_num]),
fluid.layers.reshape(
neg_matmul, shape=[-1, neg_num]),
neg_emb_b_vec)
label_ones = fluid.layers.fill_constant_batch_size_like(
......@@ -136,9 +151,17 @@ class Model(ModelBase):
neg_xent, dim=1))
self.avg_cost = fluid.layers.reduce_mean(cost)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt", persistable=True, dtype='float32', shape=[1], value=0)
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt", persistable=True, dtype='float32', shape=[1], value=0)
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
......@@ -155,12 +178,12 @@ class Model(ModelBase):
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env(
"hyper_parameters.learning_rate", None, self._namespace)
decay_steps = envs.get_global_env(
"hyper_parameters.decay_steps", None, self._namespace)
decay_rate = envs.get_global_env(
"hyper_parameters.decay_rate", None, self._namespace)
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
self._namespace)
optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=learning_rate,
......@@ -180,11 +203,15 @@ class Model(ModelBase):
name="analogy_c", shape=[None], dtype='int64')
self.analogy_d = fluid.data(
name="analogy_d", shape=[None], dtype='int64')
self._infer_data_var = [self.analogy_a,
self.analogy_b, self.analogy_c, self.analogy_d]
self._infer_data_var = [
self.analogy_a, self.analogy_b, self.analogy_c, self.analogy_d
]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
sparse_feature_dim = envs.get_global_env(
......@@ -216,18 +243,28 @@ class Model(ModelBase):
dist = fluid.layers.matmul(
x=target, y=emb_all_label_l2, transpose_y=True)
values, pred_idx = fluid.layers.topk(input=dist, k=4)
label = fluid.layers.expand(fluid.layers.unsqueeze(
self.analogy_d, axes=[1]), expand_times=[1, 4])
label = fluid.layers.expand(
fluid.layers.unsqueeze(
self.analogy_d, axes=[1]),
expand_times=[1, 4])
label_ones = fluid.layers.fill_constant_batch_size_like(
label, shape=[-1, 1], value=1.0, dtype='float32')
right_cnt = fluid.layers.reduce_sum(
input=fluid.layers.cast(fluid.layers.equal(pred_idx, label), dtype='float32'))
right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
fluid.layers.equal(pred_idx, label), dtype='float32'))
total_cnt = fluid.layers.reduce_sum(label_ones)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt", persistable=True, dtype='float32', shape=[1], value=0)
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt", persistable=True, dtype='float32', shape=[1], value=0)
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
......
......@@ -35,6 +35,3 @@ wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.ta
tar xzvf test_dir.tar -C raw_data
mv raw_data/data/test_dir test_data/
rm -rf raw_data
......@@ -49,8 +49,7 @@ def parse_args():
'--file_nums',
type=int,
default=1024,
help="re-split input corpus file nums"
)
help="re-split input corpus file nums")
parser.add_argument(
'--downsample',
type=float,
......@@ -137,9 +136,11 @@ def filter_corpus(args):
if not os.path.exists(args.output_corpus_dir):
os.makedirs(args.output_corpus_dir)
for file in os.listdir(args.input_corpus_dir):
with io.open(args.output_corpus_dir + '/convert_' + file + '.csv', "w") as wf:
with io.open(args.output_corpus_dir + '/convert_' + file + '.csv',
"w") as wf:
with io.open(
args.input_corpus_dir + '/' + file, encoding='utf-8') as rf:
args.input_corpus_dir + '/' + file,
encoding='utf-8') as rf:
print(args.input_corpus_dir + '/' + file)
for line in rf:
signal = False
......@@ -154,9 +155,9 @@ def filter_corpus(args):
count_w = id_counts[idx]
corpus_size = word_all_count
keep_prob = (
math.sqrt(count_w /
(args.downsample * corpus_size)) + 1
) * (args.downsample * corpus_size) / count_w
math.sqrt(count_w /
(args.downsample * corpus_size)) + 1
) * (args.downsample * corpus_size) / count_w
r_value = random.random()
if r_value > keep_prob:
continue
......@@ -182,7 +183,8 @@ def build_dict(args):
for file in os.listdir(args.build_dict_corpus_dir):
with io.open(
args.build_dict_corpus_dir + "/" + file, encoding='utf-8') as f:
args.build_dict_corpus_dir + "/" + file,
encoding='utf-8') as f:
print("build dict : ", args.build_dict_corpus_dir + "/" + file)
for line in f:
line = text_strip(line)
......@@ -232,7 +234,8 @@ def data_split(args):
for i in range(1, num + 1):
with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout:
data = contents[(i - 1) * lines_per_file:min(i * lines_per_file, len(contents))]
data = contents[(i - 1) * lines_per_file:min(i * lines_per_file,
len(contents))]
for line in data:
fout.write(line)
......
......@@ -22,7 +22,8 @@ from paddlerec.core.utils import envs
class EvaluateReader(Reader):
def init(self):
dict_path = envs.get_global_env("word_id_dict_path", None, "evaluate.reader")
dict_path = envs.get_global_env("word_id_dict_path", None,
"evaluate.reader")
self.word_to_id = dict()
self.id_to_word = dict()
with io.open(dict_path, 'r', encoding='utf-8') as f:
......@@ -68,14 +69,17 @@ class EvaluateReader(Reader):
a unicode string - a space-delimited sequence of words.
"""
return u" ".join([
word if word in original_vocab else u"<UNK>" for word in line.split()
word if word in original_vocab else u"<UNK>"
for word in line.split()
])
def generate_sample(self, line):
def reader():
features = self.strip_lines(line.lower(), self.word_to_id)
features = features.split()
yield [('analogy_a', [self.word_to_id[features[0]]]), ('analogy_b', [self.word_to_id[features[1]]]),
('analogy_c', [self.word_to_id[features[2]]]), ('analogy_d', [self.word_to_id[features[3]]])]
yield [('analogy_a', [self.word_to_id[features[0]]]),
('analogy_b', [self.word_to_id[features[1]]]),
('analogy_c', [self.word_to_id[features[2]]]),
('analogy_d', [self.word_to_id[features[3]]])]
return reader
......@@ -40,10 +40,14 @@ class NumpyRandomInt(object):
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env("word_count_dict_path", None, "train.reader")
self.window_size = envs.get_global_env("hyper_parameters.window_size", None, "train.model")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None, "train.model")
self.with_shuffle_batch = envs.get_global_env("hyper_parameters.with_shuffle_batch", None, "train.model")
dict_path = envs.get_global_env("word_count_dict_path", None,
"train.reader")
self.window_size = envs.get_global_env("hyper_parameters.window_size",
None, "train.model")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None,
"train.model")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch", None, "train.model")
self.random_generator = NumpyRandomInt(1, self.window_size + 1)
self.cs = None
......@@ -81,13 +85,15 @@ class TrainReader(Reader):
def reader():
word_ids = [w for w in line.split()]
for idx, target_id in enumerate(word_ids):
context_word_ids = self.get_context_words(
word_ids, idx)
context_word_ids = self.get_context_words(word_ids, idx)
for context_id in context_word_ids:
output = [('input_word', [int(target_id)]), ('true_label', [int(context_id)])]
output = [('input_word', [int(target_id)]),
('true_label', [int(context_id)])]
if not self.with_shuffle_batch:
neg_array = self.cs.searchsorted(np.random.sample(self.neg_num))
output += [('neg_label', [int(str(i)) for i in neg_array])]
neg_array = self.cs.searchsorted(
np.random.sample(self.neg_num))
output += [('neg_label',
[int(str(i)) for i in neg_array])]
yield output
return reader
......@@ -25,14 +25,20 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
def input_data(self, is_infer=False):
watch_vec_size = envs.get_global_env("hyper_parameters.watch_vec_size", None, self._namespace)
search_vec_size = envs.get_global_env("hyper_parameters.search_vec_size", None, self._namespace)
other_feat_size = envs.get_global_env("hyper_parameters.other_feat_size", None, self._namespace)
watch_vec = fluid.data(name="watch_vec", shape=[None, watch_vec_size], dtype="float32")
search_vec = fluid.data(name="search_vec", shape=[None, search_vec_size], dtype="float32")
other_feat = fluid.data(name="other_feat", shape=[None, other_feat_size], dtype="float32")
watch_vec_size = envs.get_global_env("hyper_parameters.watch_vec_size",
None, self._namespace)
search_vec_size = envs.get_global_env(
"hyper_parameters.search_vec_size", None, self._namespace)
other_feat_size = envs.get_global_env(
"hyper_parameters.other_feat_size", None, self._namespace)
watch_vec = fluid.data(
name="watch_vec", shape=[None, watch_vec_size], dtype="float32")
search_vec = fluid.data(
name="search_vec", shape=[None, search_vec_size], dtype="float32")
other_feat = fluid.data(
name="other_feat", shape=[None, other_feat_size], dtype="float32")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
inputs = [watch_vec] + [search_vec] + [other_feat] + [label]
self._data_var = inputs
......@@ -41,27 +47,32 @@ class Model(ModelBase):
def fc(self, tag, data, out_dim, active='relu'):
init_stddev = 1.0
scales = 1.0 / np.sqrt(data.shape[1])
scales = 1.0 / np.sqrt(data.shape[1])
if tag == 'l4':
p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag,
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=init_stddev * scales))
p_attr = fluid.param_attr.ParamAttr(
name='%s_weight' % tag,
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=init_stddev * scales))
else:
p_attr = None
b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
b_attr = fluid.ParamAttr(
name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
out = fluid.layers.fc(input=data,
size=out_dim,
act=active,
param_attr=p_attr,
bias_attr =b_attr,
name=tag)
size=out_dim,
act=active,
param_attr=p_attr,
bias_attr=b_attr,
name=tag)
return out
def net(self, inputs):
output_size = envs.get_global_env("hyper_parameters.output_size", None, self._namespace)
layers = envs.get_global_env("hyper_parameters.layers", None, self._namespace)
output_size = envs.get_global_env("hyper_parameters.output_size", None,
self._namespace)
layers = envs.get_global_env("hyper_parameters.layers", None,
self._namespace)
concat_feats = fluid.layers.concat(input=inputs[:-1], axis=-1)
l1 = self.fc('l1', concat_feats, layers[0], 'relu')
......
......@@ -21,10 +21,14 @@ import numpy as np
class TrainReader(Reader):
def init(self):
self.watch_vec_size = envs.get_global_env("hyper_parameters.watch_vec_size", None, "train.model")
self.search_vec_size = envs.get_global_env("hyper_parameters.search_vec_size", None, "train.model")
self.other_feat_size = envs.get_global_env("hyper_parameters.other_feat_size", None, "train.model")
self.output_size = envs.get_global_env("hyper_parameters.output_size", None, "train.model")
self.watch_vec_size = envs.get_global_env(
"hyper_parameters.watch_vec_size", None, "train.model")
self.search_vec_size = envs.get_global_env(
"hyper_parameters.search_vec_size", None, "train.model")
self.other_feat_size = envs.get_global_env(
"hyper_parameters.other_feat_size", None, "train.model")
self.output_size = envs.get_global_env("hyper_parameters.output_size",
None, "train.model")
def generate_sample(self, line):
"""
......@@ -35,13 +39,12 @@ class TrainReader(Reader):
"""
This function needs to be implemented by the user, based on data format
"""
feature_name = ["watch_vec", "search_vec", "other_feat", "label"]
yield zip(feature_name, [np.random.rand(self.watch_vec_size).tolist()] +
[np.random.rand(self.search_vec_size).tolist()] +
[np.random.rand(self.other_feat_size).tolist()] +
[[np.random.randint(self.output_size)]] )
yield zip(feature_name,
[np.random.rand(self.watch_vec_size).tolist()] +
[np.random.rand(self.search_vec_size).tolist()] +
[np.random.rand(self.other_feat_size).tolist()] +
[[np.random.randint(self.output_size)]])
return reader
......@@ -24,4 +24,4 @@ TDM是为大规模推荐系统设计的、能承载任意先进模型来高效
- 如何组网?答:paddle封装了大量的深度学习OP,用户可以根据需求设计自己的网络结构。
- 训练数据如何组织?答:tdm的训练数据主要为:`user/query emb``item`的正样本,`item`需要映射到树的某个叶子节点。用户只需准备符合该构成的数据即可。负样本的生成,会基于用户提供的树结构,以及paddle提供的`tdm-sampler op`完成高效的负采样,并自动添加相应的label,参与tdm中深度学习模型的训练。
- 大规模的数据与模型训练如何实现?答:基于paddle优秀的大规模参数服务器分布式能力,可以实现高效的分布式训练。基于paddle-fleet api,学习门槛极低,且可以灵活的支持增量训练,流式训练等业务需求。
3. 训练好模型后,可以基于paddle,将检索与打分等流程都融入paddle的组网中,生成inference_model与参数文件,基于PaddlePaddle的预测库或者PaddleLite进行快速部署与高效检索。
\ No newline at end of file
3. 训练好模型后,可以基于paddle,将检索与打分等流程都融入paddle的组网中,生成inference_model与参数文件,基于PaddlePaddle的预测库或者PaddleLite进行快速部署与高效检索。
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -25,38 +25,38 @@ class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
# tree meta hyper parameters
self.max_layers = envs.get_global_env(
"tree_parameters.max_layers", 4, self._namespace)
self.node_nums = envs.get_global_env(
"tree_parameters.node_nums", 26, self._namespace)
self.max_layers = envs.get_global_env("tree_parameters.max_layers", 4,
self._namespace)
self.node_nums = envs.get_global_env("tree_parameters.node_nums", 26,
self._namespace)
self.leaf_node_nums = envs.get_global_env(
"tree_parameters.leaf_node_nums", 13, self._namespace)
self.output_positive = envs.get_global_env(
"tree_parameters.output_positive", True, self._namespace)
self.layer_node_num_list = envs.get_global_env(
"tree_parameters.layer_node_num_list", [
2, 4, 7, 12], self._namespace)
self.child_nums = envs.get_global_env(
"tree_parameters.child_nums", 2, self._namespace)
self.tree_layer_path = envs.get_global_env(
"tree.tree_layer_path", None, "train.startup")
"tree_parameters.layer_node_num_list", [2, 4, 7,
12], self._namespace)
self.child_nums = envs.get_global_env("tree_parameters.child_nums", 2,
self._namespace)
self.tree_layer_path = envs.get_global_env("tree.tree_layer_path",
None, "train.startup")
# model training hyper parameter
self.node_emb_size = envs.get_global_env(
"hyper_parameters.node_emb_size", 64, self._namespace)
self.input_emb_size = envs.get_global_env(
"hyper_parameters.input_emb_size", 768, self._namespace)
self.act = envs.get_global_env(
"hyper_parameters.act", "tanh", self._namespace)
self.act = envs.get_global_env("hyper_parameters.act", "tanh",
self._namespace)
self.neg_sampling_list = envs.get_global_env(
"hyper_parameters.neg_sampling_list", [
1, 2, 3, 4], self._namespace)
"hyper_parameters.neg_sampling_list", [1, 2, 3,
4], self._namespace)
# model infer hyper parameter
self.topK = envs.get_global_env(
"hyper_parameters.node_nums", 1, self._namespace)
self.batch_size = envs.get_global_env(
"batch_size", 1, "evaluate.reader")
self.topK = envs.get_global_env("hyper_parameters.node_nums", 1,
self._namespace)
self.batch_size = envs.get_global_env("batch_size", 1,
"evaluate.reader")
def train_net(self):
self.train_input()
......@@ -76,21 +76,22 @@ class Model(ModelBase):
input_emb = fluid.data(
name="input_emb",
shape=[None, self.input_emb_size],
dtype="float32",
)
dtype="float32", )
self._data_var.append(input_emb)
item_label = fluid.data(
name="item_label",
shape=[None, 1],
dtype="int64",
)
dtype="int64", )
self._data_var.append(item_label)
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def tdm_net(self):
"""
......@@ -116,8 +117,7 @@ class Model(ModelBase):
output_list=True,
seed=0,
tree_dtype='int64',
dtype='int64'
)
dtype='int64')
# 查表得到每个节点的Embedding
sample_nodes_emb = [
......@@ -125,35 +125,34 @@ class Model(ModelBase):
input=sample_nodes[i],
is_sparse=True,
size=[self.node_nums, self.node_emb_size],
param_attr=fluid.ParamAttr(
name="TDM_Tree_Emb")
) for i in range(self.max_layers)
param_attr=fluid.ParamAttr(name="TDM_Tree_Emb"))
for i in range(self.max_layers)
]
# 此处进行Reshape是为了之后层次化的分类器训练
sample_nodes_emb = [
fluid.layers.reshape(sample_nodes_emb[i],
[-1, self.neg_sampling_list[i] +
self.output_positive, self.node_emb_size]
) for i in range(self.max_layers)
fluid.layers.reshape(sample_nodes_emb[i], [
-1, self.neg_sampling_list[i] + self.output_positive,
self.node_emb_size
]) for i in range(self.max_layers)
]
# 对输入的input_emb进行转换,使其维度与node_emb维度一致
input_trans_emb = self.input_trans_layer(input_emb)
# 分类器的主体网络,分别训练不同层次的分类器
layer_classifier_res = self.classifier_layer(
input_trans_emb, sample_nodes_emb)
layer_classifier_res = self.classifier_layer(input_trans_emb,
sample_nodes_emb)
# 最后的概率判别FC,将所有层次的node分类结果放到一起以相同的标准进行判别
# 考虑到树极大可能不平衡,有些item不在最后一层,所以需要这样的机制保证每个item都有机会被召回
tdm_fc = fluid.layers.fc(input=layer_classifier_res,
size=2,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="tdm.cls_fc.weight"),
bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
tdm_fc = fluid.layers.fc(
input=layer_classifier_res,
size=2,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name="tdm.cls_fc.weight"),
bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
# 将loss打平,放到一起计算整体网络的loss
tdm_fc_re = fluid.layers.reshape(tdm_fc, [-1, 2])
......@@ -202,7 +201,7 @@ class Model(ModelBase):
def metrics(self):
auc, batch_auc, _ = fluid.layers.auc(input=self._predict,
label=self.mask_label,
num_thresholds=2 ** 12,
num_thresholds=2**12,
slide_steps=20)
self._metrics["AUC"] = auc
self._metrics["BATCH_AUC"] = batch_auc
......@@ -218,8 +217,7 @@ class Model(ModelBase):
size=self.node_emb_size,
act=None,
param_attr=fluid.ParamAttr(name="trans.input_fc.weight"),
bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"),
)
bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), )
# 将input_emb映射到各个不同层次的向量表示空间
input_layer_fc_out = [
......@@ -229,8 +227,9 @@ class Model(ModelBase):
act=self.act,
param_attr=fluid.ParamAttr(
name="trans.layer_fc.weight." + str(i)),
bias_attr=fluid.ParamAttr(name="trans.layer_fc.bias." + str(i)),
) for i in range(self.max_layers)
bias_attr=fluid.ParamAttr(
name="trans.layer_fc.bias." + str(i)), )
for i in range(self.max_layers)
]
return input_layer_fc_out
......@@ -246,20 +245,22 @@ class Model(ModelBase):
input_layer_unsequeeze, expand_times=[1, node.shape[1], 1])
else:
input_layer_expand = fluid.layers.expand(
input_layer_unsequeeze, expand_times=[1, node[layer_idx].shape[1], 1])
input_layer_unsequeeze,
expand_times=[1, node[layer_idx].shape[1], 1])
return input_layer_expand
def classifier_layer(self, input, node):
# 扩展input,使维度与node匹配
input_expand = [
self._expand_layer(input[i], node, i) for i in range(self.max_layers)
self._expand_layer(input[i], node, i)
for i in range(self.max_layers)
]
# 将input_emb与node_emb concat到一起过分类器FC
input_node_concat = [
fluid.layers.concat(
input=[input_expand[i], node[i]],
axis=2) for i in range(self.max_layers)
input=[input_expand[i], node[i]], axis=2)
for i in range(self.max_layers)
]
hidden_states_fc = [
fluid.layers.fc(
......@@ -269,8 +270,8 @@ class Model(ModelBase):
act=self.act,
param_attr=fluid.ParamAttr(
name="cls.concat_fc.weight." + str(i)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i))
) for i in range(self.max_layers)
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i)))
for i in range(self.max_layers)
]
# 如果将所有层次的node放到一起计算loss,则需要在此处concat
......@@ -285,12 +286,14 @@ class Model(ModelBase):
input_emb = fluid.layers.data(
name="input_emb",
shape=[self.input_emb_size],
dtype="float32",
)
dtype="float32", )
self._infer_data_var.append(input_emb)
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def get_layer_list(self):
"""get layer list from layer_list.txt"""
......@@ -318,10 +321,12 @@ class Model(ModelBase):
node_list = []
mask_list = []
for id in first_layer_node:
node_list.append(fluid.layers.fill_constant(
[self.batch_size, 1], value=int(id), dtype='int64'))
mask_list.append(fluid.layers.fill_constant(
[self.batch_size, 1], value=0, dtype='int64'))
node_list.append(
fluid.layers.fill_constant(
[self.batch_size, 1], value=int(id), dtype='int64'))
mask_list.append(
fluid.layers.fill_constant(
[self.batch_size, 1], value=0, dtype='int64'))
self.first_layer_node = fluid.layers.concat(node_list, axis=1)
self.first_layer_node_mask = fluid.layers.concat(mask_list, axis=1)
......@@ -359,28 +364,26 @@ class Model(ModelBase):
size=[self.node_nums, self.node_emb_size],
param_attr=fluid.ParamAttr(name="TDM_Tree_Emb"))
input_fc_out = self.layer_fc_infer(
input_trans_emb, layer_idx)
input_fc_out = self.layer_fc_infer(input_trans_emb, layer_idx)
# 过每一层的分类器
layer_classifier_res = self.classifier_layer_infer(input_fc_out,
node_emb,
layer_idx)
layer_classifier_res = self.classifier_layer_infer(
input_fc_out, node_emb, layer_idx)
# 过最终的判别分类器
tdm_fc = fluid.layers.fc(input=layer_classifier_res,
size=2,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="tdm.cls_fc.weight"),
bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
tdm_fc = fluid.layers.fc(
input=layer_classifier_res,
size=2,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name="tdm.cls_fc.weight"),
bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
prob = fluid.layers.softmax(tdm_fc)
positive_prob = fluid.layers.slice(
prob, axes=[2], starts=[1], ends=[2])
prob_re = fluid.layers.reshape(
positive_prob, [-1, current_layer_node_num])
prob_re = fluid.layers.reshape(positive_prob,
[-1, current_layer_node_num])
# 过滤掉padding产生的无效节点(node_id=0)
node_zero_mask = fluid.layers.cast(current_layer_node, 'bool')
......@@ -395,11 +398,11 @@ class Model(ModelBase):
# index_sample op根据下标索引tensor对应位置的值
# 若paddle版本>2.0,调用方式为paddle.index_sample
top_node = fluid.contrib.layers.index_sample(
current_layer_node, topk_i)
top_node = fluid.contrib.layers.index_sample(current_layer_node,
topk_i)
prob_re_mask = prob_re * current_layer_node_mask # 过滤掉非叶子节点
topk_value = fluid.contrib.layers.index_sample(
prob_re_mask, topk_i)
topk_value = fluid.contrib.layers.index_sample(prob_re_mask,
topk_i)
node_score.append(topk_value)
node_list.append(top_node)
......@@ -424,7 +427,8 @@ class Model(ModelBase):
res_node = fluid.layers.reshape(res_layer_node, [-1, self.topK, 1])
# 利用Tree_info信息,将node_id转换为item_id
tree_info = fluid.default_main_program().global_block().var("TDM_Tree_Info")
tree_info = fluid.default_main_program().global_block().var(
"TDM_Tree_Info")
res_node_emb = fluid.layers.gather_nd(tree_info, res_node)
res_item = fluid.layers.slice(
......@@ -442,8 +446,7 @@ class Model(ModelBase):
size=self.node_emb_size,
act=None,
param_attr=fluid.ParamAttr(name="trans.input_fc.weight"),
bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"),
)
bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), )
return input_fc_out
def layer_fc_infer(self, input_fc_out, layer_idx):
......@@ -458,8 +461,7 @@ class Model(ModelBase):
param_attr=fluid.ParamAttr(
name="trans.layer_fc.weight." + str(layer_idx)),
bias_attr=fluid.ParamAttr(
name="trans.layer_fc.bias." + str(layer_idx)),
)
name="trans.layer_fc.bias." + str(layer_idx)), )
return input_layer_fc_out
def classifier_layer_infer(self, input, node, layer_idx):
......@@ -480,5 +482,6 @@ class Model(ModelBase):
act=self.act,
param_attr=fluid.ParamAttr(
name="cls.concat_fc.weight." + str(layer_idx)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(layer_idx)))
bias_attr=fluid.ParamAttr(
name="cls.concat_fc.bias." + str(layer_idx)))
return hidden_states_fc
1,2
3,4,5,6
7,8,9,10,11,12,13
14,15,16,17,18,19,20,21,22,23,24,25
\ No newline at end of file
14,15,16,17,18,19,20,21,22,23,24,25
......@@ -26,8 +26,10 @@ from paddlerec.core.utils import util
engines = {}
device = ["CPU", "GPU"]
clusters = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER"]
engine_choices = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER",
"TDM_SINGLE", "TDM_LOCAL_CLUSTER", "TDM_CLUSTER"]
engine_choices = [
"SINGLE", "LOCAL_CLUSTER", "CLUSTER", "TDM_SINGLE", "TDM_LOCAL_CLUSTER",
"TDM_CLUSTER"
]
custom_model = ['TDM']
model_name = ""
......@@ -66,7 +68,8 @@ def get_engine(args):
engine = engine.upper()
if engine not in engine_choices:
raise ValueError("train.engin can not be chosen in {}".format(engine_choices))
raise ValueError("train.engin can not be chosen in {}".format(
engine_choices))
print("engines: \n{}".format(engines))
......@@ -77,8 +80,10 @@ def get_engine(args):
def get_transpiler():
FNULL = open(os.devnull, 'w')
cmd = ["python", "-c",
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"]
cmd = [
"python", "-c",
"import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"
]
proc = subprocess.Popen(cmd, stdout=FNULL, stderr=FNULL, cwd=os.getcwd())
ret = proc.wait()
if ret == -11:
......@@ -152,7 +157,8 @@ def cluster_engine(args):
update_workspace(flattens)
envs.set_runtime_environs(flattens)
print(envs.pretty_print_envs(flattens, ("Submit Runtime Envs", "Value")))
print(envs.pretty_print_envs(flattens, ("Submit Runtime Envs", "Value"
)))
launch = ClusterEngine(None, args.model)
return launch
......@@ -163,7 +169,8 @@ def cluster_engine(args):
cluster_envs = {}
cluster_envs["train.trainer.trainer"] = trainer
cluster_envs["train.trainer.engine"] = "cluster"
cluster_envs["train.trainer.threads"] = envs.get_runtime_environ("CPU_NUM")
cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
"CPU_NUM")
cluster_envs["train.trainer.platform"] = envs.get_platform()
print("launch {} engine with cluster to with model: {}".format(
trainer, args.model))
......@@ -181,7 +188,8 @@ def cluster_engine(args):
def cluster_mpi_engine(args):
print("launch cluster engine with cluster to run model: {}".format(args.model))
print("launch cluster engine with cluster to run model: {}".format(
args.model))
cluster_envs = {}
cluster_envs["train.trainer.trainer"] = "CtrCodingTrainer"
......@@ -209,7 +217,8 @@ def local_cluster_engine(args):
cluster_envs["train.trainer.platform"] = envs.get_platform()
cluster_envs["CPU_NUM"] = "2"
print("launch {} engine with cluster to run model: {}".format(trainer, args.model))
print("launch {} engine with cluster to run model: {}".format(trainer,
args.model))
set_runtime_envs(cluster_envs, args.model)
launch = LocalClusterEngine(cluster_envs, args.model)
......@@ -217,10 +226,12 @@ def local_cluster_engine(args):
def local_mpi_engine(args):
print("launch cluster engine with cluster to run model: {}".format(args.model))
print("launch cluster engine with cluster to run model: {}".format(
args.model))
from paddlerec.core.engine.local_mpi import LocalMPIEngine
print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(args.model))
print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(
args.model))
mpi = util.run_which("mpirun")
if not mpi:
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
setup for paddle-rec.
"""
......@@ -22,11 +21,7 @@ from setuptools import setup, find_packages
import shutil
import tempfile
requires = [
"paddlepaddle == 1.7.2",
"pyyaml >= 5.1.1"
]
requires = ["paddlepaddle == 1.7.2", "pyyaml >= 5.1.1"]
about = {}
about["__title__"] = "paddle-rec"
......@@ -48,18 +43,27 @@ def build(dirname):
package_dir = os.path.dirname(os.path.abspath(__file__))
run_cmd("cp -r {}/* {}".format(package_dir, dirname))
run_cmd("mkdir {}".format(os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "core"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "doc"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "models"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "tests"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "tools"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(os.path.join(dirname, "*.py"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "core"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "doc"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "models"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "tests"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "tools"), os.path.join(dirname, "paddlerec")))
run_cmd("mv {} {}".format(
os.path.join(dirname, "*.py"), os.path.join(dirname, "paddlerec")))
packages = find_packages(dirname, include=('paddlerec.*'))
package_dir = {'': dirname}
package_data = {}
models_copy = ['data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy', 'tree/*.txt']
models_copy = [
'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy',
'tree/*.txt'
]
engine_copy = ['*/*.sh']
for package in packages:
if package.startswith("paddlerec.models."):
......@@ -80,8 +84,7 @@ def build(dirname):
package_data=package_data,
python_requires=">=2.7",
install_requires=requires,
zip_safe=False
)
zip_safe=False)
dirname = tempfile.mkdtemp()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/env bash
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#=================================================
# Utils
#=================================================
set -ex
function init() {
RED='\033[0;31m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NONE='\033[0m'
ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
}
function check_style() {
set -e
export PATH=/usr/bin:$PATH
pre-commit install
if ! pre-commit run -a; then
git diff
exit 1
fi
exit 0
}
function main() {
local CMD=$1
init
case $CMD in
check_style)
check_style
;;
*)
echo "build failed"
exit 1
;;
esac
echo "check_style finished as expected"
}
main $@
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import io, re
import sys, os
import subprocess
import platform
COPYRIGHT = '''
Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
LANG_COMMENT_MARK = None
NEW_LINE_MARK = None
COPYRIGHT_HEADER = None
if platform.system() == "Windows":
NEW_LINE_MARK = "\r\n"
else:
NEW_LINE_MARK = '\n'
COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
date, err = process.communicate()
date = date.decode("utf-8").rstrip("\n")
COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
def generate_copyright(template, lang='C'):
if lang == 'Python':
LANG_COMMENT_MARK = '#'
else:
LANG_COMMENT_MARK = "//"
lines = template.split(NEW_LINE_MARK)
BLANK = " "
ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
for lino, line in enumerate(lines):
if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
if len(line) == 0:
BLANK = ""
else:
BLANK = " "
ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
return ans + "\n"
def lang_type(filename):
if filename.endswith(".py"):
return "Python"
elif filename.endswith(".h"):
return "C"
elif filename.endswith(".c"):
return "C"
elif filename.endswith(".hpp"):
return "C"
elif filename.endswith(".cc"):
return "C"
elif filename.endswith(".cpp"):
return "C"
elif filename.endswith(".cu"):
return "C"
elif filename.endswith(".cuh"):
return "C"
elif filename.endswith(".go"):
return "C"
elif filename.endswith(".proto"):
return "C"
else:
print("Unsupported filetype %s", filename)
exit(0)
PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
def main(argv=None):
parser = argparse.ArgumentParser(
description='Checker for copyright declaration.')
parser.add_argument('filenames', nargs='*', help='Filenames to check')
args = parser.parse_args(argv)
retv = 0
for filename in args.filenames:
fd = io.open(filename, encoding="utf-8")
first_line = fd.readline()
second_line = fd.readline()
if "COPYRIGHT (C)" in first_line.upper(): continue
if first_line.startswith("#!") or PYTHON_ENCODE.match(
second_line) != None or PYTHON_ENCODE.match(first_line) != None:
continue
original_contents = io.open(filename, encoding="utf-8").read()
new_contents = generate_copyright(
COPYRIGHT, lang_type(filename)) + original_contents
print('Auto Insert Copyright Header {}'.format(filename))
retv = 1
with io.open(filename, 'w') as output_file:
output_file.write(new_contents)
return retv
if __name__ == '__main__':
exit(main())
#!/bin/bash
TOTAL_ERRORS=0
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
export PYTHONPATH=$DIR:$PYTHONPATH
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do
pylint --disable=all --load-plugins=docstring_checker \
--enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
done
exit $TOTAL_ERRORS
#For now, just warning:
#exit 0
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册