diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6d715f5a61674c72082313517cc40f13b1ea85ed --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,28 @@ +repos: +- repo: https://github.com/Lucas-C/pre-commit-hooks.git + sha: v1.0.1 + hooks: + - id: remove-crlf + files: (?!.*third_party)^.*$ | (?!.*book)^.*$ +- repo: https://github.com/PaddlePaddle/mirrors-yapf.git + sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 + hooks: + - id: yapf + files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ +- repo: https://github.com/pre-commit/pre-commit-hooks + sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0 + hooks: + - id: check-added-large-files + - id: check-merge-conflict + - id: check-symlinks + - id: detect-private-key + files: (?!.*third_party)^.*$ | (?!.*book)^.*$ + - id: end-of-file-fixer +- repo: local + hooks: + - id: copyright_checker + name: copyright_checker + entry: python ./tools/codestyle/copyright.hook + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ + exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000000000000000000000000000000000..5b00ebbf73523eb310c16dcef60f78df9ab48156 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,30 @@ +language: generic +sudo: required +dist: trusty + +services: + - docker + +os: + - linux + +env: + - JOB=check_style + +before_install: + # For pylint docstring checker + - sudo pip install pylint pytest astroid isort pre-commit + - | + function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } + +script: + - "travis_wait 30 sleep 1800 &" + - | + # 43min timeout + tools/build_script.sh ${JOB} + if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi; + +notifications: + email: + on_success: change + on_failure: always diff --git a/core/__init__.py b/core/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755 --- a/core/__init__.py +++ b/core/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/core/engine/__init__.py b/core/engine/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755 --- a/core/engine/__init__.py +++ b/core/engine/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
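Two details in the config files above are worth spelling out. In `.travis.yml`, the check is allowed to pass on exit status 142 because the perl `alarm`-based `timeout` helper kills an overrunning command with SIGALRM (signal 14), and shells report death-by-signal as 128 + signal number, i.e. 128 + 14 = 142. In `.pre-commit-config.yaml`, the `files`/`exclude` values are Python regular expressions, and OR-ing two negative lookaheads cannot express "skip `third_party` and `book`": any given path fails at most one branch, so the union matches every path. A standalone sketch of that pitfall, with hypothetical paths and the literal spaces around `|` dropped for clarity (they would otherwise be significant in the pattern):

```python
import re

# Hypothetical paths, purely for illustration.
paths = ["core/model.py", "third_party/lib/foo.py", "doc/book/ch01.py"]

# The union of two negative lookaheads matches every path: the third_party
# path fails the first branch but passes the (?!.*book) branch, and the
# book path passes the (?!.*third_party) branch.
union = re.compile(r"(?!.*third_party)^.*$|(?!.*book)^.*$")
print([p for p in paths if union.search(p)])  # all three paths

# A single lookahead over both names expresses the intended filter.
combined = re.compile(r"(?!.*(third_party|book))^.*$")
print([p for p in paths if combined.search(p)])  # ['core/model.py']
```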
diff --git a/core/engine/cluster/__init__.py b/core/engine/cluster/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100644 --- a/core/engine/cluster/__init__.py +++ b/core/engine/cluster/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/core/engine/cluster/cloud/__init__.py b/core/engine/cluster/cloud/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100644 --- a/core/engine/cluster/cloud/__init__.py +++ b/core/engine/cluster/cloud/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/core/engine/cluster/cluster.py b/core/engine/cluster/cluster.py index 6dfcec3a929ad8124192014f48270ebd1862dc2c..8c45335799afb165b66c133bd217caf3320f703f 100644 --- a/core/engine/cluster/cluster.py +++ b/core/engine/cluster/cluster.py @@ -27,6 +27,7 @@ from paddlerec.core.utils import envs class ClusterEngine(Engine): def __init_impl__(self): abs_dir = os.path.dirname(os.path.abspath(__file__)) + backend = envs.get_runtime_environ("engine_backend") if backend == "PaddleCloud": self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh") @@ -57,4 +58,5 @@ class ClusterEngine(Engine): self.start_worker_procs() else: - raise ValueError("role {} error, must in MASTER/WORKER".format(role)) + raise ValueError("role {} error, must in MASTER/WORKER".format( + role)) diff --git a/core/engine/local_cluster.py b/core/engine/local_cluster.py index 4cf614f02315acbff2a3c21126d8c061c10ba8ad..89ceafa973c9488a727aecb2e01a74f2574a81f9 100755 --- a/core/engine/local_cluster.py +++ b/core/engine/local_cluster.py @@ -46,10 +46,13 @@ class LocalClusterEngine(Engine): ports.append(new_port) break user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports]) - user_endpoints_ips = [x.split(":")[0] - for x in user_endpoints.split(",")] - user_endpoints_port = [x.split(":")[1] - for x in user_endpoints.split(",")] + + user_endpoints_ips = [ + x.split(":")[0] for x in user_endpoints.split(",") + ] + user_endpoints_port = [ + x.split(":")[1] for x in user_endpoints.split(",") + ] factory = "paddlerec.core.factory" cmd = [sys.executable, "-u", "-m", factory, self.trainer] @@ -97,8 +100,10 @@ class LocalClusterEngine(Engine): if len(log_fns) > 0: log_fns[i].close() procs[i].terminate() - print("all workers already completed, you can view logs under the `{}` directory".format(logs_dir), - file=sys.stderr) + print( + "all workers already completed, you can view logs under the `{}` directory". 
+ format(logs_dir), + file=sys.stderr) def run(self): self.start_procs() diff --git a/core/engine/local_mpi.py b/core/engine/local_mpi.py index 49db821fe5764ae9ef7f42cbd3ca2fe77b83a1d1..830bf28c4957e342d317070ab2060cde1de6d6a6 100755 --- a/core/engine/local_mpi.py +++ b/core/engine/local_mpi.py @@ -26,7 +26,6 @@ from paddlerec.core.engine.engine import Engine class LocalMPIEngine(Engine): def start_procs(self): logs_dir = self.envs["log_dir"] - default_env = os.environ.copy() current_env = copy.copy(default_env) current_env.pop("http_proxy", None) @@ -42,7 +41,8 @@ class LocalMPIEngine(Engine): os.system("mkdir -p {}".format(logs_dir)) fn = open("%s/job.log" % logs_dir, "w") log_fns.append(fn) - proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd()) + proc = subprocess.Popen( + cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd()) else: proc = subprocess.Popen(cmd, env=current_env, cwd=os.getcwd()) procs.append(proc) @@ -51,7 +51,9 @@ class LocalMPIEngine(Engine): if len(log_fns) > 0: log_fns[i].close() procs[i].wait() - print("all workers and parameter servers already completed", file=sys.stderr) + print( + "all workers and parameter servers already completed", + file=sys.stderr) def run(self): self.start_procs() diff --git a/core/factory.py b/core/factory.py index 4c08f1f6bbd70cc65011e8430e3acf039d7b6c8f..470b3a025e51d8c9fd6b2b3bcbb118fb8a619d77 100755 --- a/core/factory.py +++ b/core/factory.py @@ -19,24 +19,23 @@ import yaml from paddlerec.core.utils import envs -trainer_abs = os.path.join(os.path.dirname( - os.path.abspath(__file__)), "trainers") +trainer_abs = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "trainers") trainers = {} def trainer_registry(): - trainers["SingleTrainer"] = os.path.join( - trainer_abs, "single_trainer.py") - trainers["ClusterTrainer"] = os.path.join( - trainer_abs, "cluster_trainer.py") - trainers["CtrCodingTrainer"] = os.path.join( - trainer_abs, "ctr_coding_trainer.py") - trainers["CtrModulTrainer"] = os.path.join( - trainer_abs, "ctr_modul_trainer.py") - trainers["TDMSingleTrainer"] = os.path.join( - trainer_abs, "tdm_single_trainer.py") - trainers["TDMClusterTrainer"] = os.path.join( - trainer_abs, "tdm_cluster_trainer.py") + trainers["SingleTrainer"] = os.path.join(trainer_abs, "single_trainer.py") + trainers["ClusterTrainer"] = os.path.join(trainer_abs, + "cluster_trainer.py") + trainers["CtrCodingTrainer"] = os.path.join(trainer_abs, + "ctr_coding_trainer.py") + trainers["CtrModulTrainer"] = os.path.join(trainer_abs, + "ctr_modul_trainer.py") + trainers["TDMSingleTrainer"] = os.path.join(trainer_abs, + "tdm_single_trainer.py") + trainers["TDMClusterTrainer"] = os.path.join(trainer_abs, + "tdm_cluster_trainer.py") trainer_registry() @@ -55,8 +54,8 @@ class TrainerFactory(object): if trainer_abs is None: if not os.path.isfile(train_mode): - raise IOError( - "trainer {} can not be recognized".format(train_mode)) + raise IOError("trainer {} can not be recognized".format( + train_mode)) trainer_abs = train_mode train_mode = "UserDefineTrainer" diff --git a/core/metrics/auc_metrics.py b/core/metrics/auc_metrics.py index 5dd16cc078aa43d8fb07a50a4b006d4fdae3b2e9..085c84990e4a0a3a3e606ef707fef5d90387e8b0 100755 --- a/core/metrics/auc_metrics.py +++ b/core/metrics/auc_metrics.py @@ -22,7 +22,7 @@ from paddlerec.core.metric import Metric class AUCMetric(Metric): """ - Metric For Paddle Model + Metric For Fluid Model """ def __init__(self, config, fleet): @@ -83,7 +83,8 @@ class AUCMetric(Metric): if 
scope.find_var(metric_item['var'].name) is None: result[metric_name] = None continue - result[metric_name] = self.get_metric(scope, metric_item['var'].name) + result[metric_name] = self.get_metric(scope, + metric_item['var'].name) return result def calculate_auc(self, global_pos, global_neg): @@ -178,14 +179,18 @@ class AUCMetric(Metric): self._result['mean_q'] = 0 return self._result if 'stat_pos' in result and 'stat_neg' in result: - result['auc'] = self.calculate_auc(result['stat_pos'], result['stat_neg']) - result['bucket_error'] = self.calculate_auc(result['stat_pos'], result['stat_neg']) + result['auc'] = self.calculate_auc(result['stat_pos'], + result['stat_neg']) + result['bucket_error'] = self.calculate_auc(result['stat_pos'], + result['stat_neg']) if 'pos_ins_num' in result: - result['actual_ctr'] = result['pos_ins_num'] / result['total_ins_num'] + result['actual_ctr'] = result['pos_ins_num'] / result[ + 'total_ins_num'] if 'abserr' in result: result['mae'] = result['abserr'] / result['total_ins_num'] if 'sqrerr' in result: - result['rmse'] = math.sqrt(result['sqrerr'] / result['total_ins_num']) + result['rmse'] = math.sqrt(result['sqrerr'] / + result['total_ins_num']) if 'prob' in result: result['predict_ctr'] = result['prob'] / result['total_ins_num'] if abs(result['predict_ctr']) > 1e-6: diff --git a/core/model.py b/core/model.py index 212db44c8dc60a20f6e5ed3f7c338b5336f41e2a..82b41ebc4b7ea752e708b9d7246b6bf7d5025db4 100755 --- a/core/model.py +++ b/core/model.py @@ -20,7 +20,7 @@ from paddlerec.core.utils import envs class Model(object): - """R + """Base Model """ __metaclass__ = abc.ABCMeta @@ -39,32 +39,43 @@ class Model(object): self._platform = envs.get_platform() def _init_slots(self): - sparse_slots = envs.get_global_env("sparse_slots", None, "train.reader") + sparse_slots = envs.get_global_env("sparse_slots", None, + "train.reader") dense_slots = envs.get_global_env("dense_slots", None, "train.reader") if sparse_slots is not None or dense_slots is not None: sparse_slots = sparse_slots.strip().split(" ") dense_slots = dense_slots.strip().split(" ") - dense_slots_shape = [[int(j) for j in i.split(":")[1].strip("[]").split(",")] for i in dense_slots] + dense_slots_shape = [[ + int(j) for j in i.split(":")[1].strip("[]").split(",") + ] for i in dense_slots] dense_slots = [i.split(":")[0] for i in dense_slots] self._dense_data_var = [] for i in range(len(dense_slots)): - l = fluid.layers.data(name=dense_slots[i], shape=dense_slots_shape[i], dtype="float32") + l = fluid.layers.data( + name=dense_slots[i], + shape=dense_slots_shape[i], + dtype="float32") self._data_var.append(l) self._dense_data_var.append(l) self._sparse_data_var = [] for name in sparse_slots: - l = fluid.layers.data(name=name, shape=[1], lod_level=1, dtype="int64") + l = fluid.layers.data( + name=name, shape=[1], lod_level=1, dtype="int64") self._data_var.append(l) self._sparse_data_var.append(l) - dataset_class = envs.get_global_env("dataset_class", None, "train.reader") + dataset_class = envs.get_global_env("dataset_class", None, + "train.reader") if dataset_class == "DataLoader": self._init_dataloader() def _init_dataloader(self): self._data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._data_var, + capacity=64, + use_double_buffer=False, + iterable=False) def get_inputs(self): return self._data_var @@ -96,8 +107,8 @@ class Model(object): "configured optimizer can only supported SGD/Adam/Adagrad") if name == 
"SGD": - reg = envs.get_global_env( - "hyper_parameters.reg", 0.0001, self._namespace) + reg = envs.get_global_env("hyper_parameters.reg", 0.0001, + self._namespace) optimizer_i = fluid.optimizer.SGD( lr, regularization=fluid.regularizer.L2DecayRegularizer(reg)) elif name == "ADAM": @@ -111,10 +122,10 @@ class Model(object): return optimizer_i def optimizer(self): - learning_rate = envs.get_global_env( - "hyper_parameters.learning_rate", None, self._namespace) - optimizer = envs.get_global_env( - "hyper_parameters.optimizer", None, self._namespace) + learning_rate = envs.get_global_env("hyper_parameters.learning_rate", + None, self._namespace) + optimizer = envs.get_global_env("hyper_parameters.optimizer", None, + self._namespace) print(">>>>>>>>>>>.learnig rate: %s" % learning_rate) return self._build_optimizer(optimizer, learning_rate) diff --git a/core/modules/__init__.py b/core/modules/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755 --- a/core/modules/__init__.py +++ b/core/modules/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/core/modules/coding/__init__.py b/core/modules/coding/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755 --- a/core/modules/coding/__init__.py +++ b/core/modules/coding/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/core/modules/coding/layers.py b/core/modules/coding/layers.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755 --- a/core/modules/coding/layers.py +++ b/core/modules/coding/layers.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/core/modules/modul/__init__.py b/core/modules/modul/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755 --- a/core/modules/modul/__init__.py +++ b/core/modules/modul/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/core/modules/modul/build.py b/core/modules/modul/build.py index 0263cbf60e3b1647a05cbc471b7bbff1840f88ba..dae777176e49831bde4f6f9938637a3289a0a218 100755 --- a/core/modules/modul/build.py +++ b/core/modules/modul/build.py @@ -31,6 +31,7 @@ def create(config): Model Instance """ model = None + if config['mode'] == 'fluid': model = YamlModel(config) model.train_net() @@ -50,7 +51,12 @@ class YamlModel(Model): f = open(config['layer_file'], 'r') self._build_nodes = yaml.safe_load(f.read()) self._build_phase = ['input', 'param', 'summary', 'layer'] - self._build_param = {'layer': {}, 'inner_layer': {}, 'layer_extend': {}, 'model': {}} + self._build_param = { + 'layer': {}, + 'inner_layer': {}, + 'layer_extend': {}, + 'model': {} + } self._inference_meta = {'dependency': {}, 'params': {}} def train_net(self): @@ -76,10 +82,12 @@ class YamlModel(Model): if self._build_nodes[phase] is None: continue for node in self._build_nodes[phase]: - exec("""layer=layer.{}(node)""".format(node['class'])) - layer_output, extend_output = layer.generate(self._config['mode'], self._build_param) + exec ("""layer=layer.{}(node)""".format(node['class'])) + layer_output, extend_output = layer.generate( + self._config['mode'], self._build_param) self._build_param['layer'][node['name']] = layer_output - self._build_param['layer_extend'][node['name']] = extend_output + self._build_param['layer_extend'][node[ + 'name']] = extend_output if extend_output is None: continue if 'loss' in extend_output: @@ -89,17 +97,24 @@ class YamlModel(Model): self._cost += extend_output['loss'] if 'data_var' in extend_output: self._data_var += extend_output['data_var'] - if 'metric_label' in extend_output and extend_output['metric_label'] is not None: - self._metrics[extend_output['metric_label']] = extend_output['metric_dict'] + if 'metric_label' in extend_output and extend_output[ + 'metric_label'] is not None: + self._metrics[extend_output[ + 'metric_label']] = extend_output['metric_dict'] if 'inference_param' in extend_output: inference_param = extend_output['inference_param'] param_name = inference_param['name'] if param_name not in self._build_param['table']: - self._build_param['table'][param_name] = {'params': []} - table_meta = table.TableMeta.alloc_new_table(inference_param['table_id']) - self._build_param['table'][param_name]['_meta'] = table_meta - self._build_param['table'][param_name]['params'] += inference_param['params'] + self._build_param['table'][param_name] = { + 'params': [] + } + table_meta = table.TableMeta.alloc_new_table( + inference_param['table_id']) + self._build_param['table'][param_name][ + '_meta'] = table_meta + 
self._build_param['table'][param_name][ + 'params'] += inference_param['params'] pass @classmethod @@ -114,20 +129,25 @@ class YamlModel(Model): metrics = params['metrics'] for name in metrics: model_metrics = metrics[name] - stat_var_names += [model_metrics[metric]['var'].name for metric in model_metrics] + stat_var_names += [ + model_metrics[metric]['var'].name + for metric in model_metrics + ] strategy['stat_var_names'] = list(set(stat_var_names)) optimizer_generator = 'optimizer = fluid.optimizer.' + optimizer_conf['class'] + \ '(learning_rate=' + str(optimizer_conf['learning_rate']) + ')' - exec(optimizer_generator) + exec (optimizer_generator) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) return optimizer def dump_model_program(self, path): """R """ - with open(path + '/' + self._name + '_main_program.pbtxt', "w") as fout: + with open(path + '/' + self._name + '_main_program.pbtxt', + "w") as fout: print >> fout, self._build_param['model']['train_program'] - with open(path + '/' + self._name + '_startup_program.pbtxt', "w") as fout: + with open(path + '/' + self._name + '_startup_program.pbtxt', + "w") as fout: print >> fout, self._build_param['model']['startup_program'] pass @@ -137,7 +157,8 @@ class YamlModel(Model): scope = params['scope'] decay = params['decay'] for param_table in self._build_param['table']: - table_id = self._build_param['table'][param_table]['_meta']._table_id + table_id = self._build_param['table'][param_table][ + '_meta']._table_id fleet.shrink_dense_table(decay, scope=scope, table_id=table_id) def dump_inference_program(self, inference_layer, path): @@ -152,17 +173,25 @@ class YamlModel(Model): executor = params['executor'] program = self._build_param['model']['train_program'] for table_name, table in self._build_param['table'].items(): - fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id, table['params']) + fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id, + table['params']) for infernce_item in params['inference_list']: - params_name_list = self.inference_params(infernce_item['layer_name']) - params_var_list = [program.global_block().var(i) for i in params_name_list] + params_name_list = self.inference_params(infernce_item[ + 'layer_name']) + params_var_list = [ + program.global_block().var(i) for i in params_name_list + ] params_file_name = infernce_item['save_file_name'] with fluid.scope_guard(scope): if params['save_combine']: fluid.io.save_vars(executor, "./", \ program, vars=params_var_list, filename=params_file_name) else: - fluid.io.save_vars(executor, params_file_name, program, vars=params_var_list) + fluid.io.save_vars( + executor, + params_file_name, + program, + vars=params_var_list) def inference_params(self, inference_layer): """ @@ -177,11 +206,13 @@ class YamlModel(Model): return self._inference_meta['params'][layer] self._inference_meta['params'][layer] = [] - self._inference_meta['dependency'][layer] = self.get_dependency(self._build_param['inner_layer'], layer) + self._inference_meta['dependency'][layer] = self.get_dependency( + self._build_param['inner_layer'], layer) for node in self._build_nodes['layer']: if node['name'] not in self._inference_meta['dependency'][layer]: continue - if 'inference_param' in self._build_param['layer_extend'][node['name']]: + if 'inference_param' in self._build_param['layer_extend'][node[ + 'name']]: self._inference_meta['params'][layer] += \ self._build_param['layer_extend'][node['name']]['inference_param']['params'] return self._inference_meta['params'][layer] 
@@ -199,5 +230,6 @@ class YamlModel(Model): dependencys = copy.deepcopy(layer_graph[dest_layer]['input']) dependency_list = copy.deepcopy(dependencys) for dependency in dependencys: - dependency_list = dependency_list + self.get_dependency(layer_graph, dependency) + dependency_list = dependency_list + self.get_dependency( + layer_graph, dependency) return list(set(dependency_list)) diff --git a/core/modules/modul/layers.py b/core/modules/modul/layers.py index 060c023ff5c29c1823f5a47dac6233f3a34a1f09..008ce6e40987a6a3adf6605590aa2b8fe53f034a 100755 --- a/core/modules/modul/layers.py +++ b/core/modules/modul/layers.py @@ -18,7 +18,7 @@ from paddlerec.core.layer import Layer class EmbeddingFuseLayer(Layer): - """R + """embedding + sequence + concat """ def __init__(self, config): @@ -40,7 +40,8 @@ class EmbeddingFuseLayer(Layer): show_clk.stop_gradient = True data_var = [] for slot in self._slots: - l = fluid.layers.data(name=slot, shape=[1], dtype="int64", lod_level=1) + l = fluid.layers.data( + name=slot, shape=[1], dtype="int64", lod_level=1) data_var.append(l) emb = fluid.layers.embedding(input=l, size=[10, self._emb_dim], \ is_sparse=True, is_distributed=True, @@ -48,7 +49,8 @@ class EmbeddingFuseLayer(Layer): emb = fluid.layers.sequence_pool(input=emb, pool_type='sum') emb = fluid.layers.continuous_value_model(emb, show_clk, self._cvm) self._emb_layers.append(emb) - output = fluid.layers.concat(input=self._emb_layers, axis=1, name=self._name) + output = fluid.layers.concat( + input=self._emb_layers, axis=1, name=self._name) return output, {'data_var': data_var} @@ -111,7 +113,13 @@ class ParamLayer(Layer): def generate(self, param): """R """ - return self._config, {'inference_param': {'name': 'param', 'params': [], 'table_id': self._table_id}} + return self._config, { + 'inference_param': { + 'name': 'param', + 'params': [], + 'table_id': self._table_id + } + } class SummaryLayer(Layer): @@ -129,7 +137,13 @@ class SummaryLayer(Layer): def generate(self, param): """R """ - return self._config, {'inference_param': {'name': 'summary', 'params': [], 'table_id': self._table_id}} + return self._config, { + 'inference_param': { + 'name': 'summary', + 'params': [], + 'table_id': self._table_id + } + } class NormalizationLayer(Layer): @@ -152,9 +166,19 @@ class NormalizationLayer(Layer): if len(self._input) > 0: input_list = [param['layer'][i] for i in self._input] input_layer = fluid.layers.concat(input=input_list, axis=1) - bn = fluid.layers.data_norm(input=input_layer, name=self._name, epsilon=1e-4, param_attr={ - "batch_size": 1e4, "batch_sum_default": 0.0, "batch_square": 1e4}) - inference_param = [self._name + '.batch_size', self._name + '.batch_sum', self._name + '.batch_square_sum'] + bn = fluid.layers.data_norm( + input=input_layer, + name=self._name, + epsilon=1e-4, + param_attr={ + "batch_size": 1e4, + "batch_sum_default": 0.0, + "batch_square": 1e4 + }) + inference_param = [ + self._name + '.batch_size', self._name + '.batch_sum', + self._name + '.batch_square_sum' + ] return bn, {'inference_param': {'name': 'summary', \ 'params': inference_param, 'table_id': summary_layer.get('table_id', -1)}} @@ -181,11 +205,13 @@ class FCLayer(Layer): input_list = [param['layer'][i] for i in self._input] input_layer = fluid.layers.concat(input=input_list, axis=1) input_coln = input_layer.shape[1] - scale = param_layer['init_range'] / (input_coln ** 0.5) + scale = param_layer['init_range'] / (input_coln**0.5) bias = None if self._bias: - bias = fluid.ParamAttr(learning_rate=1.0, - 
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=scale)) + bias = fluid.ParamAttr( + learning_rate=1.0, + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale)) fc = fluid.layers.fc( name=self._name, input=input_layer, @@ -216,18 +242,46 @@ class LogLossLayer(Layer): self._extend_output = { 'metric_label': self._metric_label, 'metric_dict': { - 'auc': {'var': None}, - 'batch_auc': {'var': None}, - 'stat_pos': {'var': None, 'data_type': 'int64'}, - 'stat_neg': {'var': None, 'data_type': 'int64'}, - 'batch_stat_pos': {'var': None, 'data_type': 'int64'}, - 'batch_stat_neg': {'var': None, 'data_type': 'int64'}, - 'pos_ins_num': {'var': None}, - 'abserr': {'var': None}, - 'sqrerr': {'var': None}, - 'prob': {'var': None}, - 'total_ins_num': {'var': None}, - 'q': {'var': None} + 'auc': { + 'var': None + }, + 'batch_auc': { + 'var': None + }, + 'stat_pos': { + 'var': None, + 'data_type': 'int64' + }, + 'stat_neg': { + 'var': None, + 'data_type': 'int64' + }, + 'batch_stat_pos': { + 'var': None, + 'data_type': 'int64' + }, + 'batch_stat_neg': { + 'var': None, + 'data_type': 'int64' + }, + 'pos_ins_num': { + 'var': None + }, + 'abserr': { + 'var': None + }, + 'sqrerr': { + 'var': None + }, + 'prob': { + 'var': None + }, + 'total_ins_num': { + 'var': None + }, + 'q': { + 'var': None + } } } @@ -236,9 +290,12 @@ class LogLossLayer(Layer): """ input_layer = param['layer'][self._input[0]] label_layer = param['layer'][self._label] - output = fluid.layers.clip(input_layer, self._bound[0], self._bound[1], name=self._name) + output = fluid.layers.clip( + input_layer, self._bound[0], self._bound[1], name=self._name) norm = fluid.layers.sigmoid(output, name=self._name) - output = fluid.layers.log_loss(norm, fluid.layers.cast(x=label_layer, dtype='float32')) + output = fluid.layers.log_loss( + norm, fluid.layers.cast( + x=label_layer, dtype='float32')) if self._weight: weight_layer = param['layer'][self._weight] output = fluid.layers.elementwise_mul(output, weight_layer) @@ -248,7 +305,11 @@ class LogLossLayer(Layer): # For AUC Metric metric = self._extend_output['metric_dict'] binary_predict = fluid.layers.concat( - input=[fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm), norm], axis=1) + input=[ + fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm), + norm + ], + axis=1) metric['auc']['var'], metric['batch_auc']['var'], [metric['batch_stat_pos']['var'], \ metric['batch_stat_neg']['var'], metric['stat_pos']['var'], metric['stat_neg']['var']] = \ diff --git a/core/reader.py b/core/reader.py index 01502761e30a7215c0c916dcde1825a4836280db..85c0c4f9a57eea194343a6e1af6bfad2d07dd5a0 100755 --- a/core/reader.py +++ b/core/reader.py @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ from __future__ import print_function -import sys import abc import os @@ -64,7 +64,11 @@ class SlotReader(dg.MultiSlotDataGenerator): from operator import mul self.sparse_slots = sparse_slots.strip().split(" ") self.dense_slots = dense_slots.strip().split(" ") - self.dense_slots_shape = [reduce(mul, [int(j) for j in i.split(":")[1].strip("[]").split(",")]) for i in self.dense_slots] + self.dense_slots_shape = [ + reduce(mul, + [int(j) for j in i.split(":")[1].strip("[]").split(",")]) + for i in self.dense_slots + ] self.dense_slots = [i.split(":")[0] for i in self.dense_slots] self.slots = self.dense_slots + self.sparse_slots self.slot2index = {} @@ -93,10 +97,13 @@ class SlotReader(dg.MultiSlotDataGenerator): slot = i if not self.visit[slot]: if i in self.dense_slots: - output[self.slot2index[i]][1].extend([self.padding] * self.dense_slots_shape[self.slot2index[i]]) + output[self.slot2index[i]][1].extend( + [self.padding] * + self.dense_slots_shape[self.slot2index[i]]) else: output[self.slot2index[i]][1].extend([self.padding]) else: self.visit[slot] = False yield output + return reader diff --git a/core/trainer.py b/core/trainer.py index 40fc35de973ce7841bfdf28dfc6c6a3751484be7..b7c22ea89bd279a2e1e233edeb4d8cf11b8aa5c0 100755 --- a/core/trainer.py +++ b/core/trainer.py @@ -30,8 +30,10 @@ class Trainer(object): def __init__(self, config=None): self._status_processor = {} + self._place = fluid.CPUPlace() self._exe = fluid.Executor(self._place) + self._exector_context = {} self._context = {'status': 'uninit', 'is_exit': False} self._config_yaml = config @@ -95,6 +97,6 @@ def user_define_engine(engine_yaml): train_dirname = os.path.dirname(train_location) base_name = os.path.splitext(os.path.basename(train_location))[0] sys.path.append(train_dirname) - trainer_class = envs.lazy_instance_by_fliename( - base_name, "UserDefineTraining") + trainer_class = envs.lazy_instance_by_fliename(base_name, + "UserDefineTraining") return trainer_class diff --git a/core/trainers/__init__.py b/core/trainers/__init__.py index cd9c9db5e6b93fd6171bca0a5b0f97f69306aedc..f14704cad8f3859746f95353ba68753f857ff78d 100755 --- a/core/trainers/__init__.py +++ b/core/trainers/__init__.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ trainer implement. @@ -22,5 +21,3 @@ Trainer ↘ (for online learning training) OnlineLearningTrainer """ - - diff --git a/core/trainers/cluster_trainer.py b/core/trainers/cluster_trainer.py index faa960359bc82d6130302002a99fb664c7374249..792b897f779b82a0989d6c25dd79663d52d05abd 100755 --- a/core/trainers/cluster_trainer.py +++ b/core/trainers/cluster_trainer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Training use fluid with one node only. 
""" @@ -43,11 +42,14 @@ class ClusterTrainer(TranspileTrainer): self.regist_context_processor('uninit', self.instance) self.regist_context_processor('init_pass', self.init) self.regist_context_processor('startup_pass', self.startup) - if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader": + + if envs.get_platform() == "LINUX" and envs.get_global_env( + "dataset_class", None, "train.reader") != "DataLoader": self.regist_context_processor('train_pass', self.dataset_train) else: - self.regist_context_processor( - 'train_pass', self.dataloader_train) + self.regist_context_processor('train_pass', + self.dataloader_train) + self.regist_context_processor('infer_pass', self.infer) self.regist_context_processor('terminal_pass', self.terminal) @@ -75,8 +77,8 @@ class ClusterTrainer(TranspileTrainer): def init(self, context): self.model.train_net() optimizer = self.model.optimizer() - optimizer_name = envs.get_global_env( - "hyper_parameters.optimizer", None, "train.model") + optimizer_name = envs.get_global_env("hyper_parameters.optimizer", + None, "train.model") if optimizer_name not in ["", "sgd", "SGD", "Sgd"]: os.environ["FLAGS_communicator_is_sgd_optimizer"] = '0' @@ -114,9 +116,9 @@ class ClusterTrainer(TranspileTrainer): program = fluid.compiler.CompiledProgram( fleet.main_program).with_data_parallel( - loss_name=self.model.get_avg_cost().name, - build_strategy=self.strategy.get_build_strategy(), - exec_strategy=self.strategy.get_execute_strategy()) + loss_name=self.model.get_avg_cost().name, + build_strategy=self.strategy.get_build_strategy(), + exec_strategy=self.strategy.get_execute_strategy()) metrics_varnames = [] metrics_format = [] @@ -135,9 +137,8 @@ class ClusterTrainer(TranspileTrainer): batch_id = 0 try: while True: - metrics_rets = self._exe.run( - program=program, - fetch_list=metrics_varnames) + metrics_rets = self._exe.run(program=program, + fetch_list=metrics_varnames) metrics = [epoch, batch_id] metrics.extend(metrics_rets) @@ -162,14 +163,16 @@ class ClusterTrainer(TranspileTrainer): for i in range(epochs): begin_time = time.time() - self._exe.train_from_dataset(program=fluid.default_main_program(), - dataset=dataset, - fetch_list=self.fetch_vars, - fetch_info=self.fetch_alias, - print_period=self.fetch_period) + self._exe.train_from_dataset( + program=fluid.default_main_program(), + dataset=dataset, + fetch_list=self.fetch_vars, + fetch_info=self.fetch_alias, + print_period=self.fetch_period) end_time = time.time() - times = end_time-begin_time - print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times)) + times = end_time - begin_time + print("epoch {} using time {}, speed {:.2f} lines/s".format( + i, times, ins / times)) self.save(i, "train", is_fleet=True) fleet.stop_worker() diff --git a/core/trainers/ctr_coding_trainer.py b/core/trainers/ctr_coding_trainer.py new file mode 100755 index 0000000000000000000000000000000000000000..7dc51f340147aec933ce8bffd0be080b7be984c6 --- /dev/null +++ b/core/trainers/ctr_coding_trainer.py @@ -0,0 +1,142 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import paddle.fluid as fluid +from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet +from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker + +from paddlerec.core.utils import envs +from paddlerec.core.trainer import Trainer + + +class CtrTrainer(Trainer): + """R + """ + + def __init__(self, config): + """R + """ + Trainer.__init__(self, config) + + self.global_config = config + self._metrics = {} + self.processor_register() + + def processor_register(self): + role = MPISymetricRoleMaker() + fleet.init(role) + + if fleet.is_server(): + self.regist_context_processor('uninit', self.instance) + self.regist_context_processor('init_pass', self.init) + self.regist_context_processor('server_pass', self.server) + else: + self.regist_context_processor('uninit', self.instance) + self.regist_context_processor('init_pass', self.init) + self.regist_context_processor('train_pass', self.train) + self.regist_context_processor('terminal_pass', self.terminal) + + def _get_dataset(self): + namespace = "train.reader" + + inputs = self.model.get_inputs() + threads = envs.get_global_env("train.threads", None) + batch_size = envs.get_global_env("batch_size", None, namespace) + reader_class = envs.get_global_env("class", None, namespace) + abs_dir = os.path.dirname(os.path.abspath(__file__)) + reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') + pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN", + self._config_yaml) + train_data_path = envs.get_global_env("train_data_path", None, + namespace) + + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_use_var(inputs) + dataset.set_pipe_command(pipe_cmd) + dataset.set_batch_size(batch_size) + dataset.set_thread(threads) + file_list = [ + os.path.join(train_data_path, x) + for x in os.listdir(train_data_path) + ] + + dataset.set_filelist(file_list) + return dataset + + def instance(self, context): + models = envs.get_global_env("train.model.models") + model_class = envs.lazy_instance_by_fliename(models, "Model") + self.model = model_class(None) + context['status'] = 'init_pass' + + def init(self, context): + """R + """ + self.model.train_net() + optimizer = self.model.optimizer() + + optimizer = fleet.distributed_optimizer( + optimizer, strategy={"use_cvm": False}) + optimizer.minimize(self.model.get_avg_cost()) + + if fleet.is_server(): + context['status'] = 'server_pass' + else: + self.fetch_vars = [] + self.fetch_alias = [] + self.fetch_period = self.model.get_fetch_period() + + metrics = self.model.get_metrics() + if metrics: + self.fetch_vars = metrics.values() + self.fetch_alias = metrics.keys() + context['status'] = 'train_pass' + + def server(self, context): + fleet.run_server() + fleet.stop_worker() + context['is_exit'] = True + + def train(self, context): + self._exe.run(fluid.default_startup_program()) + fleet.init_worker() + + dataset = self._get_dataset() + + shuf = np.array([fleet.worker_index()]) + gs = shuf * 0 + fleet._role_maker._node_type_comm.Allreduce(shuf, gs) + + print("trainer id: {}, trainers: {}, gs: 
{}".format(fleet.worker_index( + ), fleet.worker_num(), gs)) + + epochs = envs.get_global_env("train.epochs") + + for i in range(epochs): + self._exe.train_from_dataset( + program=fluid.default_main_program(), + dataset=dataset, + fetch_list=self.fetch_vars, + fetch_info=self.fetch_alias, + print_period=self.fetch_period) + + context['status'] = 'terminal_pass' + fleet.stop_worker() + + def terminal(self, context): + print("terminal ended.") + context['is_exit'] = True diff --git a/core/trainers/ctr_modul_trainer.py b/core/trainers/ctr_modul_trainer.py new file mode 100755 index 0000000000000000000000000000000000000000..af8f3f3a2c3fb59fc6db60e3e4cd050ca3d8ad8a --- /dev/null +++ b/core/trainers/ctr_modul_trainer.py @@ -0,0 +1,534 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import json +import sys +import time + +import numpy as np +import paddle.fluid as fluid +from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet +from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker + +from paddlerec.core.utils import fs as fs +from paddlerec.core.utils import util as util +from paddlerec.core.metrics.auc_metrics import AUCMetric +from paddlerec.core.modules.modul import build as model_basic +from paddlerec.core.utils import dataset +from paddlerec.core.trainer import Trainer + + +def wroker_numric_opt(value, env, opt): + """ + numric count opt for workers + Args: + value: value for count + env: mpi/gloo + opt: count operator, SUM/MAX/MIN/AVG + Return: + count result + """ + local_value = np.array([value]) + global_value = np.copy(local_value) * 0 + fleet._role_maker.all_reduce_worker(local_value, global_value, opt) + return global_value[0] + + +def worker_numric_sum(value, env="mpi"): + """R + """ + return wroker_numric_opt(value, env, "sum") + + +def worker_numric_avg(value, env="mpi"): + """R + """ + return worker_numric_sum(value, env) / fleet.worker_num() + + +def worker_numric_min(value, env="mpi"): + """R + """ + return wroker_numric_opt(value, env, "min") + + +def worker_numric_max(value, env="mpi"): + """R + """ + return wroker_numric_opt(value, env, "max") + + +class CtrTrainer(Trainer): + """R + """ + + def __init__(self, config): + """R + """ + Trainer.__init__(self, config) + config['output_path'] = util.get_absolute_path(config['output_path'], + config['io']['afs']) + + self.global_config = config + self._metrics = {} + + self._path_generator = util.PathGenerator({ + 'templates': [{ + 'name': 'xbox_base_done', + 'template': config['output_path'] + '/xbox_base_done.txt' + }, { + 'name': 'xbox_delta_done', + 'template': config['output_path'] + '/xbox_patch_done.txt' + }, { + 'name': 'xbox_base', + 'template': config['output_path'] + '/xbox/{day}/base/' + }, { + 'name': 'xbox_delta', + 'template': + config['output_path'] + '/xbox/{day}/delta-{pass_id}/' + }, { + 'name': 'batch_model', + 'template': + config['output_path'] + '/batch_model/{day}/{pass_id}/' + }] + }) + if 
'path_generator' in config: + self._path_generator.add_path_template(config['path_generator']) + + self.regist_context_processor('uninit', self.init) + self.regist_context_processor('startup', self.startup) + self.regist_context_processor('begin_day', self.begin_day) + self.regist_context_processor('train_pass', self.train_pass) + self.regist_context_processor('end_day', self.end_day) + + def init(self, context): + """R + """ + role_maker = None + if self.global_config.get('process_mode', 'mpi') == 'brilliant_cpu': + afs_config = self.global_config['io']['afs'] + role_maker = GeneralRoleMaker( + hdfs_name=afs_config['fs_name'], + hdfs_ugi=afs_config['fs_ugi'], + path=self.global_config['output_path'] + "/gloo", + init_timeout_seconds=1200, + run_timeout_seconds=1200) + fleet.init(role_maker) + data_var_list = [] + data_var_name_dict = {} + runnnable_scope = [] + runnnable_cost_op = [] + context['status'] = 'startup' + + for executor in self.global_config['executor']: + scope = fluid.Scope() + self._exector_context[executor['name']] = {} + self._exector_context[executor['name']]['scope'] = scope + self._exector_context[executor['name']][ + 'model'] = model_basic.create(executor) + model = self._exector_context[executor['name']]['model'] + self._metrics.update(model.get_metrics()) + runnnable_scope.append(scope) + runnnable_cost_op.append(model.get_avg_cost()) + for var in model._data_var: + if var.name in data_var_name_dict: + continue + data_var_list.append(var) + data_var_name_dict[var.name] = var + + optimizer = model_basic.YamlModel.build_optimizer({ + 'metrics': self._metrics, + 'optimizer_conf': self.global_config['optimizer'] + }) + optimizer.minimize(runnnable_cost_op, runnnable_scope) + for executor in self.global_config['executor']: + scope = self._exector_context[executor['name']]['scope'] + model = self._exector_context[executor['name']]['model'] + program = model._build_param['model']['train_program'] + if not executor['is_update_sparse']: + program._fleet_opt["program_configs"][str( + id(model.get_avg_cost().block.program))][ + "push_sparse"] = [] + if 'train_thread_num' not in executor: + executor['train_thread_num'] = self.global_config[ + 'train_thread_num'] + with fluid.scope_guard(scope): + self._exe.run(model._build_param['model']['startup_program']) + model.dump_model_program('./') + + # server init done + if fleet.is_server(): + return 0 + + self._dataset = {} + for dataset_item in self.global_config['dataset']['data_list']: + dataset_item['data_vars'] = data_var_list + dataset_item.update(self.global_config['io']['afs']) + dataset_item["batch_size"] = self.global_config['batch_size'] + self._dataset[dataset_item[ + 'name']] = dataset.FluidTimeSplitDataset(dataset_item) + # if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= last_day and config.reqi_dnn_plugin_pass >= last_pass: + # util.reqi_changeslot(config.hdfs_dnn_plugin_path, join_save_params, common_save_params, update_save_params, scope2, scope3) + fleet.init_worker() + pass + + def print_log(self, log_str, params): + """R + """ + params['index'] = fleet.worker_index() + if params['master']: + if fleet.worker_index() == 0: + print(log_str) + sys.stdout.flush() + else: + print(log_str) + if 'stdout' in params: + params['stdout'] += str(datetime.datetime.now()) + log_str + + def print_global_metrics(self, scope, model, monitor_data, stdout_str): + """R + """ + metrics = model.get_metrics() + metric_calculator = AUCMetric(None) + for metric in metrics: + metric_param = {'label': metric, 'metric_dict': 
metrics[metric]} + metric_calculator.calculate(scope, metric_param) + metric_result = metric_calculator.get_result_to_string() + self.print_log(metric_result, + {'master': True, + 'stdout': stdout_str}) + monitor_data += metric_result + metric_calculator.clear(scope, metric_param) + + def save_model(self, day, pass_index, base_key): + """R + """ + cost_printer = util.CostPrinter(util.print_cost, { + 'master': True, + 'log_format': 'save model cost %s sec' + }) + model_path = self._path_generator.generate_path( + 'batch_model', {'day': day, + 'pass_id': pass_index}) + save_mode = 0 # just save all + if pass_index < 1: # batch_model + save_mode = 3 # unseen_day++, save all + util.rank0_print("going to save_model %s" % model_path) + fleet.save_persistables(None, model_path, mode=save_mode) + if fleet._role_maker.is_first_worker(): + self._train_pass.save_train_progress( + day, pass_index, base_key, model_path, is_checkpoint=True) + cost_printer.done() + return model_path + + def save_xbox_model(self, day, pass_index, xbox_base_key, monitor_data): + """R + """ + stdout_str = "" + xbox_patch_id = str(int(time.time())) + util.rank0_print("begin save delta model") + + model_path = "" + xbox_model_donefile = "" + cost_printer = util.CostPrinter(util.print_cost, {'master': True, \ + 'log_format': 'save xbox model cost %s sec', + 'stdout': stdout_str}) + if pass_index < 1: + save_mode = 2 + xbox_patch_id = xbox_base_key + model_path = self._path_generator.generate_path('xbox_base', + {'day': day}) + xbox_model_donefile = self._path_generator.generate_path( + 'xbox_base_done', {'day': day}) + else: + save_mode = 1 + model_path = self._path_generator.generate_path( + 'xbox_delta', {'day': day, + 'pass_id': pass_index}) + xbox_model_donefile = self._path_generator.generate_path( + 'xbox_delta_done', {'day': day}) + total_save_num = fleet.save_persistables( + None, model_path, mode=save_mode) + cost_printer.done() + + cost_printer = util.CostPrinter(util.print_cost, { + 'master': True, + 'log_format': 'save cache model cost %s sec', + 'stdout': stdout_str + }) + model_file_handler = fs.FileHandler(self.global_config['io']['afs']) + if self.global_config['save_cache_model']: + cache_save_num = fleet.save_cache_model( + None, model_path, mode=save_mode) + model_file_handler.write( + "file_prefix:part\npart_num:16\nkey_num:%d\n" % cache_save_num, + model_path + '/000_cache/sparse_cache.meta', 'w') + cost_printer.done() + util.rank0_print("save xbox cache model done, key_num=%s" % + cache_save_num) + + save_env_param = {'executor': self._exe, 'save_combine': True} + cost_printer = util.CostPrinter(util.print_cost, { + 'master': True, + 'log_format': 'save dense model cost %s sec', + 'stdout': stdout_str + }) + if fleet._role_maker.is_first_worker(): + for executor in self.global_config['executor']: + if 'layer_for_inference' not in executor: + continue + executor_name = executor['name'] + model = self._exector_context[executor_name]['model'] + save_env_param['inference_list'] = executor[ + 'layer_for_inference'] + save_env_param['scope'] = self._exector_context[executor_name][ + 'scope'] + model.dump_inference_param(save_env_param) + for dnn_layer in executor['layer_for_inference']: + model_file_handler.cp(dnn_layer['save_file_name'], + model_path + '/dnn_plugin/' + + dnn_layer['save_file_name']) + fleet._role_maker._barrier_worker() + cost_printer.done() + + xbox_done_info = { + "id": xbox_patch_id, + "key": xbox_base_key, + "ins_path": "", + "ins_tag": "feasign", + "partition_type": "2", + "record_count": 
"111111", + "monitor_data": monitor_data, + "mpi_size": str(fleet.worker_num()), + "input": model_path.rstrip("/") + "/000", + "job_id": util.get_env_value("JOB_ID"), + "job_name": util.get_env_value("JOB_NAME") + } + if fleet._role_maker.is_first_worker(): + model_file_handler.write( + json.dumps(xbox_done_info) + "\n", xbox_model_donefile, 'a') + if pass_index > 0: + self._train_pass.save_train_progress( + day, + pass_index, + xbox_base_key, + model_path, + is_checkpoint=False) + fleet._role_maker._barrier_worker() + return stdout_str + + def run_executor(self, executor_config, dataset, stdout_str): + """R + """ + day = self._train_pass.date() + pass_id = self._train_pass._pass_id + xbox_base_key = self._train_pass._base_key + executor_name = executor_config['name'] + scope = self._exector_context[executor_name]['scope'] + model = self._exector_context[executor_name]['model'] + with fluid.scope_guard(scope): + util.rank0_print("Begin " + executor_name + " pass") + begin = time.time() + program = model._build_param['model']['train_program'] + self._exe.train_from_dataset( + program, + dataset, + scope, + thread=executor_config['train_thread_num'], + debug=self.global_config['debug']) + end = time.time() + local_cost = (end - begin) / 60.0 + avg_cost = worker_numric_avg(local_cost) + min_cost = worker_numric_min(local_cost) + max_cost = worker_numric_max(local_cost) + util.rank0_print("avg train time %s mins, min %s mins, max %s mins" + % (avg_cost, min_cost, max_cost)) + self._exector_context[executor_name]['cost'] = max_cost + + monitor_data = "" + self.print_global_metrics(scope, model, monitor_data, stdout_str) + util.rank0_print("End " + executor_name + " pass") + if self._train_pass.need_dump_inference( + pass_id) and executor_config['dump_inference_model']: + stdout_str += self.save_xbox_model(day, pass_id, xbox_base_key, + monitor_data) + fleet._role_maker._barrier_worker() + + def startup(self, context): + """R + """ + if fleet.is_server(): + fleet.run_server() + context['status'] = 'wait' + return + stdout_str = "" + self._train_pass = util.TimeTrainPass(self.global_config) + if not self.global_config['cold_start']: + cost_printer = util.CostPrinter(util.print_cost, { + 'master': True, + 'log_format': 'load model cost %s sec', + 'stdout': stdout_str + }) + self.print_log("going to load model %s" % + self._train_pass._checkpoint_model_path, + {'master': True}) + # if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= self._train_pass.date() + # and config.reqi_dnn_plugin_pass >= self._pass_id: + # fleet.load_one_table(0, self._train_pass._checkpoint_model_path) + # else: + fleet.init_server(self._train_pass._checkpoint_model_path, mode=0) + cost_printer.done() + if self.global_config['save_first_base']: + self.print_log("save_first_base=True", {'master': True}) + self.print_log("going to save xbox base model", + {'master': True, + 'stdout': stdout_str}) + self._train_pass._base_key = int(time.time()) + stdout_str += self.save_xbox_model(self._train_pass.date(), 0, + self._train_pass._base_key, "") + context['status'] = 'begin_day' + + def begin_day(self, context): + """R + """ + stdout_str = "" + if not self._train_pass.next(): + context['is_exit'] = True + day = self._train_pass.date() + pass_id = self._train_pass._pass_id + self.print_log("======== BEGIN DAY:%s ========" % day, + {'master': True, + 'stdout': stdout_str}) + if pass_id == self._train_pass.max_pass_num_day(): + context['status'] = 'end_day' + else: + context['status'] = 'train_pass' + + def end_day(self, 
context): + """R + """ + day = self._train_pass.date() + pass_id = self._train_pass._pass_id + xbox_base_key = int(time.time()) + context['status'] = 'begin_day' + + util.rank0_print("shrink table") + cost_printer = util.CostPrinter(util.print_cost, { + 'master': True, + 'log_format': 'shrink table done, cost %s sec' + }) + fleet.shrink_sparse_table() + for executor in self._exector_context: + self._exector_context[executor]['model'].shrink({ + 'scope': self._exector_context[executor]['scope'], + 'decay': self.global_config['optimizer']['dense_decay_rate'] + }) + cost_printer.done() + + next_date = self._train_pass.date(delta_day=1) + util.rank0_print("going to save xbox base model") + self.save_xbox_model(next_date, 0, xbox_base_key, "") + util.rank0_print("going to save batch model") + self.save_model(next_date, 0, xbox_base_key) + self._train_pass._base_key = xbox_base_key + fleet._role_maker._barrier_worker() + + def train_pass(self, context): + """R + """ + stdout_str = "" + day = self._train_pass.date() + pass_id = self._train_pass._pass_id + base_key = self._train_pass._base_key + pass_time = self._train_pass._current_train_time.strftime("%Y%m%d%H%M") + self.print_log(" ==== begin delta:%s ========" % pass_id, + {'master': True, + 'stdout': stdout_str}) + train_begin_time = time.time() + + cost_printer = util.CostPrinter(util.print_cost, \ + {'master': True, 'log_format': 'load into memory done, cost %s sec', + 'stdout': stdout_str}) + current_dataset = {} + for name in self._dataset: + current_dataset[name] = self._dataset[name].load_dataset({ + 'node_num': fleet.worker_num(), + 'node_idx': fleet.worker_index(), + 'begin_time': pass_time, + 'time_window_min': self._train_pass._interval_per_pass + }) + fleet._role_maker._barrier_worker() + cost_printer.done() + + util.rank0_print("going to global shuffle") + cost_printer = util.CostPrinter(util.print_cost, { + 'master': True, + 'stdout': stdout_str, + 'log_format': 'global shuffle done, cost %s sec' + }) + for name in current_dataset: + current_dataset[name].global_shuffle( + fleet, self.global_config['dataset']['shuffle_thread']) + cost_printer.done() + # str(dataset.get_shuffle_data_size(fleet)) + fleet._role_maker._barrier_worker() + + if self.global_config['prefetch_data']: + next_pass_time = ( + self._train_pass._current_train_time + datetime.timedelta( + minutes=self._train_pass._interval_per_pass) + ).strftime("%Y%m%d%H%M") + for name in self._dataset: + self._dataset[name].preload_dataset({ + 'node_num': fleet.worker_num(), + 'node_idx': fleet.worker_index(), + 'begin_time': next_pass_time, + 'time_window_min': self._train_pass._interval_per_pass + }) + + fleet._role_maker._barrier_worker() + pure_train_begin = time.time() + for executor in self.global_config['executor']: + self.run_executor(executor, + current_dataset[executor['dataset_name']], + stdout_str) + cost_printer = util.CostPrinter(util.print_cost, \ + {'master': True, 'log_format': 'release_memory cost %s sec'}) + for name in current_dataset: + current_dataset[name].release_memory() + pure_train_cost = time.time() - pure_train_begin + + if self._train_pass.is_checkpoint_pass(pass_id): + self.save_model(day, pass_id, base_key) + + train_end_time = time.time() + train_cost = train_end_time - train_begin_time + other_cost = train_cost - pure_train_cost + log_str = "finished train day %s pass %s time cost:%s sec job time cost:" % ( + day, pass_id, train_cost) + for executor in self._exector_context: + log_str += '[' + executor + ':' + str(self._exector_context[ + 
executor]['cost']) + ']' + log_str += '[other_cost:' + str(other_cost) + ']' + util.rank0_print(log_str) + stdout_str += util.now_time_str() + log_str + sys.stdout.write(stdout_str) + fleet._role_maker._barrier_worker() + stdout_str = "" + if pass_id == self._train_pass.max_pass_num_day(): + context['status'] = 'end_day' + return + elif not self._train_pass.next(): + context['is_exit'] = True diff --git a/core/trainers/online_learning_trainer.py b/core/trainers/online_learning_trainer.py index 0303e96ac0bb20b1f46cdc9f5836d18fa73b9a8e..b285684464ed2cd1d8bfd7710d6f28d30de3f936 100755 --- a/core/trainers/online_learning_trainer.py +++ b/core/trainers/online_learning_trainer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Training use fluid with one node only. """ @@ -44,11 +43,14 @@ class OnlineLearningTrainer(TranspileTrainer): self.regist_context_processor('uninit', self.instance) self.regist_context_processor('init_pass', self.init) self.regist_context_processor('startup_pass', self.startup) - if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader": + + if envs.get_platform() == "LINUX" and envs.get_global_env( + "dataset_class", None, "train.reader") != "DataLoader": self.regist_context_processor('train_pass', self.dataset_train) else: - self.regist_context_processor( - 'train_pass', self.dataloader_train) + self.regist_context_processor('train_pass', + self.dataloader_train) + self.regist_context_processor('infer_pass', self.infer) self.regist_context_processor('terminal_pass', self.terminal) @@ -110,27 +112,27 @@ class OnlineLearningTrainer(TranspileTrainer): if state == "TRAIN": inputs = self.model.get_inputs() namespace = "train.reader" - train_data_path = envs.get_global_env( - "train_data_path", None, namespace) + train_data_path = envs.get_global_env("train_data_path", None, + namespace) else: inputs = self.model.get_infer_inputs() namespace = "evaluate.reader" - train_data_path = envs.get_global_env( - "test_data_path", None, namespace) + train_data_path = envs.get_global_env("test_data_path", None, + namespace) threads = int(envs.get_runtime_environ("train.trainer.threads")) batch_size = envs.get_global_env("batch_size", None, namespace) reader_class = envs.get_global_env("class", None, namespace) abs_dir = os.path.dirname(os.path.abspath(__file__)) reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') - pipe_cmd = "python {} {} {} {}".format( - reader, reader_class, state, self._config_yaml) + pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state, + self._config_yaml) if train_data_path.startswith("paddlerec::"): package_base = envs.get_runtime_environ("PACKAGE_BASE") assert package_base is not None - train_data_path = os.path.join( - package_base, train_data_path.split("::")[1]) + train_data_path = os.path.join(package_base, + train_data_path.split("::")[1]) dataset = fluid.DatasetFactory().create_dataset() dataset.set_use_var(inputs) @@ -166,14 +168,16 @@ class OnlineLearningTrainer(TranspileTrainer): ins = self._get_dataset_ins() begin_time = time.time() - self._exe.train_from_dataset(program=fluid.default_main_program(), - dataset=dataset, - fetch_list=self.fetch_vars, - fetch_info=self.fetch_alias, - print_period=self.fetch_period) + self._exe.train_from_dataset( + program=fluid.default_main_program(), + dataset=dataset, + 
fetch_list=self.fetch_vars, + fetch_info=self.fetch_alias, + print_period=self.fetch_period) end_time = time.time() - times = end_time-begin_time - print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times)) + times = end_time - begin_time + print("epoch {} using time {}, speed {:.2f} lines/s".format( + i, times, ins / times)) self.save(i, "train", is_fleet=True) fleet.stop_worker() diff --git a/core/trainers/single_trainer.py b/core/trainers/single_trainer.py index 8079377ba257041e4946d6e452cacaa388ca36ce..a564ba5585c313a163542f028fa158f8c50c8d2a 100755 --- a/core/trainers/single_trainer.py +++ b/core/trainers/single_trainer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Training use fluid with one node only. """ @@ -36,8 +35,9 @@ class SingleTrainer(TranspileTrainer): self.regist_context_processor('uninit', self.instance) self.regist_context_processor('init_pass', self.init) self.regist_context_processor('startup_pass', self.startup) - if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, - "train.reader") != "DataLoader": + + if envs.get_platform() == "LINUX" and envs.get_global_env( + "dataset_class", None, "train.reader") != "DataLoader": self.regist_context_processor('train_pass', self.dataset_train) else: self.regist_context_processor('train_pass', self.dataloader_train) @@ -73,9 +73,8 @@ class SingleTrainer(TranspileTrainer): reader = self._get_dataloader("TRAIN") epochs = envs.get_global_env("train.epochs") - program = fluid.compiler.CompiledProgram( - fluid.default_main_program()).with_data_parallel( - loss_name=self.model.get_avg_cost().name) + program = fluid.compiler.CompiledProgram(fluid.default_main_program( + )).with_data_parallel(loss_name=self.model.get_avg_cost().name) metrics_varnames = [] metrics_format = [] @@ -94,9 +93,8 @@ class SingleTrainer(TranspileTrainer): batch_id = 0 try: while True: - metrics_rets = self._exe.run( - program=program, - fetch_list=metrics_varnames) + metrics_rets = self._exe.run(program=program, + fetch_list=metrics_varnames) metrics = [epoch, batch_id] metrics.extend(metrics_rets) @@ -117,14 +115,16 @@ class SingleTrainer(TranspileTrainer): epochs = envs.get_global_env("train.epochs") for i in range(epochs): begin_time = time.time() - self._exe.train_from_dataset(program=fluid.default_main_program(), - dataset=dataset, - fetch_list=self.fetch_vars, - fetch_info=self.fetch_alias, - print_period=self.fetch_period) + self._exe.train_from_dataset( + program=fluid.default_main_program(), + dataset=dataset, + fetch_list=self.fetch_vars, + fetch_info=self.fetch_alias, + print_period=self.fetch_period) end_time = time.time() times = end_time - begin_time - print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times)) + print("epoch {} using time {}, speed {:.2f} lines/s".format( + i, times, ins / times)) self.save(i, "train", is_fleet=False) context['status'] = 'infer_pass' diff --git a/core/trainers/tdm_cluster_trainer.py b/core/trainers/tdm_cluster_trainer.py index 3bd1ad3367f340019333e8f83cf5abdd3b36b25f..a7e8f97e446bc266a733fc12a798c505ee4d9ec5 100755 --- a/core/trainers/tdm_cluster_trainer.py +++ b/core/trainers/tdm_cluster_trainer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - """ Training use fluid with one node only. """ @@ -36,8 +35,8 @@ special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info"] class TDMClusterTrainer(ClusterTrainer): def server(self, context): namespace = "train.startup" - init_model_path = envs.get_global_env( - "cluster.init_model_path", "", namespace) + init_model_path = envs.get_global_env("cluster.init_model_path", "", + namespace) assert init_model_path != "", "Cluster train must has init_model for TDM" fleet.init_server(init_model_path) logger.info("TDM: load model from {}".format(init_model_path)) @@ -48,24 +47,27 @@ class TDMClusterTrainer(ClusterTrainer): self._exe.run(fleet.startup_program) namespace = "train.startup" - load_tree = envs.get_global_env( - "tree.load_tree", True, namespace) - self.tree_layer_path = envs.get_global_env( - "tree.tree_layer_path", "", namespace) - self.tree_travel_path = envs.get_global_env( - "tree.tree_travel_path", "", namespace) - self.tree_info_path = envs.get_global_env( - "tree.tree_info_path", "", namespace) - - save_init_model = envs.get_global_env( - "cluster.save_init_model", False, namespace) - init_model_path = envs.get_global_env( - "cluster.init_model_path", "", namespace) + load_tree = envs.get_global_env("tree.load_tree", True, namespace) + + self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "", + namespace) + + self.tree_travel_path = envs.get_global_env("tree.tree_travel_path", + "", namespace) + + self.tree_info_path = envs.get_global_env("tree.tree_info_path", "", + namespace) + + save_init_model = envs.get_global_env("cluster.save_init_model", False, + namespace) + init_model_path = envs.get_global_env("cluster.init_model_path", "", + namespace) if load_tree: # covert tree to tensor, set it into Fluid's variable. for param_name in special_param: - param_t = fluid.global_scope().find_var(param_name).get_tensor() + param_t = fluid.global_scope().find_var(param_name).get_tensor( + ) param_array = self._tdm_prepare(param_name) param_t.set(param_array.astype('int32'), self._place) @@ -93,8 +95,8 @@ class TDMClusterTrainer(ClusterTrainer): def _tdm_travel_prepare(self): """load tdm tree param from npy/list file""" travel_array = np.load(self.tree_travel_path) - logger.info("TDM Tree leaf node nums: {}".format( - travel_array.shape[0])) + logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[ + 0])) return travel_array def _tdm_layer_prepare(self): diff --git a/core/trainers/tdm_single_trainer.py b/core/trainers/tdm_single_trainer.py index 21be66a677750f6e817b63794819b14ed72d9fa2..c0f23fc361e907ca5732a3531fc7c460ddc5aad3 100755 --- a/core/trainers/tdm_single_trainer.py +++ b/core/trainers/tdm_single_trainer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Training use fluid with one node only. 
""" @@ -27,33 +26,38 @@ from paddlerec.core.utils import envs logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger("fluid") logger.setLevel(logging.INFO) -special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer", - "TDM_Tree_Info", "TDM_Tree_Emb"] +special_param = [ + "TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info", "TDM_Tree_Emb" +] class TDMSingleTrainer(SingleTrainer): def startup(self, context): namespace = "train.startup" - load_persistables = envs.get_global_env( - "single.load_persistables", False, namespace) + load_persistables = envs.get_global_env("single.load_persistables", + False, namespace) + persistables_model_path = envs.get_global_env( "single.persistables_model_path", "", namespace) - load_tree = envs.get_global_env( - "tree.load_tree", False, namespace) - self.tree_layer_path = envs.get_global_env( - "tree.tree_layer_path", "", namespace) - self.tree_travel_path = envs.get_global_env( - "tree.tree_travel_path", "", namespace) - self.tree_info_path = envs.get_global_env( - "tree.tree_info_path", "", namespace) - self.tree_emb_path = envs.get_global_env( - "tree.tree_emb_path", "", namespace) - - save_init_model = envs.get_global_env( - "single.save_init_model", False, namespace) - init_model_path = envs.get_global_env( - "single.init_model_path", "", namespace) + load_tree = envs.get_global_env("tree.load_tree", False, namespace) + + self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "", + namespace) + + self.tree_travel_path = envs.get_global_env("tree.tree_travel_path", + "", namespace) + + self.tree_info_path = envs.get_global_env("tree.tree_info_path", "", + namespace) + + self.tree_emb_path = envs.get_global_env("tree.tree_emb_path", "", + namespace) + + save_init_model = envs.get_global_env("single.save_init_model", False, + namespace) + init_model_path = envs.get_global_env("single.init_model_path", "", + namespace) self._exe.run(fluid.default_startup_program()) if load_persistables: @@ -68,7 +72,8 @@ class TDMSingleTrainer(SingleTrainer): if load_tree: # covert tree to tensor, set it into Fluid's variable. for param_name in special_param: - param_t = fluid.global_scope().find_var(param_name).get_tensor() + param_t = fluid.global_scope().find_var(param_name).get_tensor( + ) param_array = self._tdm_prepare(param_name) if param_name == 'TDM_Tree_Emb': param_t.set(param_array.astype('float32'), self._place) @@ -102,15 +107,15 @@ class TDMSingleTrainer(SingleTrainer): def _tdm_travel_prepare(self): """load tdm tree param from npy/list file""" travel_array = np.load(self.tree_travel_path) - logger.info("TDM Tree leaf node nums: {}".format( - travel_array.shape[0])) + logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[ + 0])) return travel_array def _tdm_emb_prepare(self): """load tdm tree param from npy/list file""" emb_array = np.load(self.tree_emb_path) - logger.info("TDM Tree node nums from emb: {}".format( - emb_array.shape[0])) + logger.info("TDM Tree node nums from emb: {}".format(emb_array.shape[ + 0])) return emb_array def _tdm_layer_prepare(self): diff --git a/core/trainers/transpiler_trainer.py b/core/trainers/transpiler_trainer.py index a67d4759be7ae27c4a8c57eb43409102a8400c53..c121b4abb624503936faca8e77902a97e3f0cf82 100755 --- a/core/trainers/transpiler_trainer.py +++ b/core/trainers/transpiler_trainer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - """ Training use fluid with DistributeTranspiler """ @@ -39,9 +38,12 @@ class TranspileTrainer(Trainer): self.increment_models = [] def processor_register(self): - print("Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first") + print( + "Must be implemented by the trainer; `self.regist_context_processor('uninit', self.instance)` must be registered first" + ) def _get_dataloader(self, state="TRAIN"): + if state == "TRAIN": dataloader = self.model._data_loader namespace = "train.reader" @@ -59,12 +61,14 @@ class TranspileTrainer(Trainer): if sparse_slots is None and dense_slots is None: reader_class = envs.get_global_env("class", None, namespace) - reader = dataloader_instance.dataloader( - reader_class, state, self._config_yaml) - reader_class = envs.lazy_instance_by_fliename(reader_class, class_name) + reader = dataloader_instance.dataloader(reader_class, state, + self._config_yaml) + reader_class = envs.lazy_instance_by_fliename(reader_class, + class_name) reader_ins = reader_class(self._config_yaml) else: - reader = dataloader_instance.slotdataloader("", state, self._config_yaml) + reader = dataloader_instance.slotdataloader("", state, + self._config_yaml) reader_ins = SlotReader(self._config_yaml) if hasattr(reader_ins, 'generate_batch_from_trainfiles'): @@ -94,13 +98,13 @@ if state == "TRAIN": inputs = self.model.get_inputs() namespace = "train.reader" - train_data_path = envs.get_global_env( - "train_data_path", None, namespace) + train_data_path = envs.get_global_env("train_data_path", None, + namespace) else: inputs = self.model.get_infer_inputs() namespace = "evaluate.reader" - train_data_path = envs.get_global_env( - "test_data_path", None, namespace) + train_data_path = envs.get_global_env("test_data_path", None, + namespace) sparse_slots = envs.get_global_env("sparse_slots", None, namespace) dense_slots = envs.get_global_env("dense_slots", None, namespace) @@ -112,8 +116,8 @@ reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') if sparse_slots is None and dense_slots is None: - pipe_cmd = "python {} {} {} {}".format( - reader, reader_class, state, self._config_yaml) + pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state, + self._config_yaml) else: padding = envs.get_global_env("padding", 0, namespace) pipe_cmd = "python {} {} {} {} {} {} {} {}".format( @@ -123,8 +127,8 @@ if train_data_path.startswith("paddlerec::"): package_base = envs.get_runtime_environ("PACKAGE_BASE") assert package_base is not None - train_data_path = os.path.join( - package_base, train_data_path.split("::")[1]) + train_data_path = os.path.join(package_base, + train_data_path.split("::")[1]) dataset = fluid.DatasetFactory().create_dataset() dataset.set_use_var(inputs) @@ -140,11 +144,11 @@ debug_mode = envs.get_global_env("reader_debug_mode", False, namespace) if debug_mode: - print( - "--- Dataset Debug Mode Begin , show pre 10 data of {}---".format(file_list[0])) + print("--- Dataset Debug Mode Begin, show first 10 lines of {} ---". + format(file_list[0])) os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd)) - print( - "--- Dataset Debug Mode End , show pre 10 data of {}---".format(file_list[0])) + print("--- Dataset Debug Mode End, show first 10 lines of {} ---".
+ format(file_list[0])) exit(0) return dataset @@ -166,27 +170,29 @@ class TranspileTrainer(Trainer): if not need_save(epoch_id, save_interval, False): return - feed_varnames = envs.get_global_env( - "save.inference.feed_varnames", None, namespace) + feed_varnames = envs.get_global_env("save.inference.feed_varnames", + None, namespace) fetch_varnames = envs.get_global_env( "save.inference.fetch_varnames", None, namespace) if feed_varnames is None or fetch_varnames is None: return - fetch_vars = [fluid.default_main_program().global_block().vars[varname] - for varname in fetch_varnames] - dirname = envs.get_global_env( - "save.inference.dirname", None, namespace) + fetch_vars = [ + fluid.default_main_program().global_block().vars[varname] + for varname in fetch_varnames + ] + dirname = envs.get_global_env("save.inference.dirname", None, + namespace) assert dirname is not None dirname = os.path.join(dirname, str(epoch_id)) if is_fleet: - fleet.save_inference_model( - self._exe, dirname, feed_varnames, fetch_vars) + fleet.save_inference_model(self._exe, dirname, feed_varnames, + fetch_vars) else: - fluid.io.save_inference_model( - dirname, feed_varnames, fetch_vars, self._exe) + fluid.io.save_inference_model(dirname, feed_varnames, + fetch_vars, self._exe) self.inference_models.append((epoch_id, dirname)) def save_persistables(): @@ -196,8 +202,8 @@ class TranspileTrainer(Trainer): if not need_save(epoch_id, save_interval, False): return - dirname = envs.get_global_env( - "save.increment.dirname", None, namespace) + dirname = envs.get_global_env("save.increment.dirname", None, + namespace) assert dirname is not None dirname = os.path.join(dirname, str(epoch_id)) @@ -275,10 +281,9 @@ class TranspileTrainer(Trainer): batch_id = 0 try: while True: - metrics_rets = self._exe.run( - program=program, - fetch_list=metrics_varnames, - return_numpy=is_return_numpy) + metrics_rets = self._exe.run(program=program, + fetch_list=metrics_varnames, + return_numpy=is_return_numpy) metrics = [epoch, batch_id] metrics.extend(metrics_rets) diff --git a/core/utils/dataset_holder.py b/core/utils/dataset_holder.py index cd195450336cac0265f76670ca0e3fa24c45a7ba..a75d52b60440f924acbb45ff7ff9125eaa121e36 100755 --- a/core/utils/dataset_holder.py +++ b/core/utils/dataset_holder.py @@ -24,7 +24,7 @@ from paddlerec.core.utils import util as util class DatasetHolder(object): """ - Dataset Base + Dataset Holder """ __metaclass__ = abc.ABCMeta @@ -74,11 +74,17 @@ class TimeSplitDatasetHolder(DatasetHolder): Dataset.__init__(self, config) if 'data_donefile' not in config or config['data_donefile'] is None: config['data_donefile'] = config['data_path'] + "/to.hadoop.done" - self._path_generator = util.PathGenerator({'templates': [ - {'name': 'data_path', 'template': config['data_path']}, - {'name': 'donefile_path', 'template': config['data_donefile']} - ]}) - self._split_interval = config['split_interval'] # data split N mins per dir + self._path_generator = util.PathGenerator({ + 'templates': [{ + 'name': 'data_path', + 'template': config['data_path'] + }, { + 'name': 'donefile_path', + 'template': config['data_donefile'] + }] + }) + self._split_interval = config[ + 'split_interval'] # data split N mins per dir self._data_file_handler = fs.FileHandler(config) def _format_data_time(self, daytime_str, time_window_mins): @@ -91,7 +97,8 @@ class TimeSplitDatasetHolder(DatasetHolder): return None, 0 if mins_of_day % self._split_interval != 0: - skip_mins = self._split_interval - (mins_of_day % self._split_interval) + skip_mins = 
self._split_interval - (mins_of_day % + self._split_interval) data_time = data_time + datetime.timedelta(minutes=skip_mins) time_window_mins = time_window_mins - skip_mins return data_time, time_window_mins @@ -106,17 +113,24 @@ class TimeSplitDatasetHolder(DatasetHolder): True/False """ is_ready = True - data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins) + data_time, windows_mins = self._format_data_time(daytime_str, + time_window_mins) while time_window_mins > 0: - file_path = self._path_generator.generate_path('donefile_path', {'time_format': data_time}) + file_path = self._path_generator.generate_path( + 'donefile_path', {'time_format': data_time}) if not self._data_file_handler.is_exist(file_path): is_ready = False break time_window_mins = time_window_mins - self._split_interval - data_time = data_time + datetime.timedelta(minutes=self._split_interval) + data_time = data_time + datetime.timedelta( + minutes=self._split_interval) return is_ready - def get_file_list(self, daytime_str, time_window_mins, node_num=1, node_idx=0): + def get_file_list(self, + daytime_str, + time_window_mins, + node_num=1, + node_idx=0): """ data in [daytime_str, daytime_str + time_window_mins], random shard to node_num, return shard[node_idx] Args: @@ -128,26 +142,32 @@ class TimeSplitDatasetHolder(DatasetHolder): list, data_shard[node_idx] """ data_file_list = [] - data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins) + data_time, windows_mins = self._format_data_time(daytime_str, + time_window_mins) while time_window_mins > 0: - file_path = self._path_generator.generate_path('data_path', {'time_format': data_time}) + file_path = self._path_generator.generate_path( + 'data_path', {'time_format': data_time}) sub_file_list = self._data_file_handler.ls(file_path) for sub_file in sub_file_list: sub_file_name = self._data_file_handler.get_file_name(sub_file) - if not sub_file_name.startswith(self._config['filename_prefix']): + if not sub_file_name.startswith(self._config[ + 'filename_prefix']): continue if hash(sub_file_name) % node_num == node_idx: data_file_list.append(sub_file) time_window_mins = time_window_mins - self._split_interval - data_time = data_time + datetime.timedelta(minutes=self._split_interval) + data_time = data_time + datetime.timedelta( + minutes=self._split_interval) return data_file_list def _alloc_dataset(self, file_list): """ """ - dataset = fluid.DatasetFactory().create_dataset(self._config['dataset_type']) + dataset = fluid.DatasetFactory().create_dataset(self._config[ + 'dataset_type']) dataset.set_batch_size(self._config['batch_size']) dataset.set_thread(self._config['load_thread']) - dataset.set_hdfs_config(self._config['fs_name'], self._config['fs_ugi']) + dataset.set_hdfs_config(self._config['fs_name'], + self._config['fs_ugi']) dataset.set_pipe_command(self._config['data_converter']) dataset.set_filelist(file_list) dataset.set_use_var(self._config['data_vars']) @@ -163,7 +183,9 @@ class TimeSplitDatasetHolder(DatasetHolder): while self.check_ready(begin_time, windown_min) == False: print("dataset not ready, time:" + begin_time) time.sleep(30) - file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx']) + file_list = self.get_file_list(begin_time, windown_min, + params['node_num'], + params['node_idx']) self._datasets[begin_time] = self._alloc_dataset(file_list) self._datasets[begin_time].load_into_memory() else: @@ -176,9 +198,12 @@ class TimeSplitDatasetHolder(DatasetHolder): windown_min 
= params['time_window_min'] if begin_time not in self._datasets: if self.check_ready(begin_time, windown_min): - file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx']) + file_list = self.get_file_list(begin_time, windown_min, + params['node_num'], + params['node_idx']) self._datasets[begin_time] = self._alloc_dataset(file_list) - self._datasets[begin_time].preload_into_memory(self._config['preload_thread']) + self._datasets[begin_time].preload_into_memory(self._config[ + 'preload_thread']) return True return False diff --git a/core/utils/dataset_instance.py b/core/utils/dataset_instance.py index f5175c48df978919c51519d027561011bd3ceb44..2e6082dc5e381b6ac2fc46f7fb6fbe73d4214b69 100755 --- a/core/utils/dataset_instance.py +++ b/core/utils/dataset_instance.py @@ -17,10 +17,11 @@ import sys from paddlerec.core.utils.envs import lazy_instance_by_fliename from paddlerec.core.reader import SlotReader -from paddlerec.core.utils import envs if len(sys.argv) < 4: - raise ValueError("reader only accept 3 argument: 1. reader_class 2.train/evaluate/slotreader 3.yaml_abs_path") + raise ValueError( + "reader only accepts 3 arguments: 1. reader_class 2. train/evaluate/slotreader 3. yaml_abs_path" + ) reader_package = sys.argv[1] diff --git a/core/utils/envs.py b/core/utils/envs.py index 7093d897e780c525e91516a0058bc90319d4e918..bc222e906448435031024281a0a80298073d3979 100755 --- a/core/utils/envs.py +++ b/core/utils/envs.py @@ -95,7 +95,7 @@ def path_adapter(path): l_p = path.split("paddlerec.")[1].replace(".", "/") return os.path.join(package, l_p) else: - return path + return path def windows_path_converter(path): @@ -159,8 +159,8 @@ def pretty_print_envs(envs, header=None): def lazy_instance_by_package(package, class_name): models = get_global_env("train.model.models") - model_package = __import__( - package, globals(), locals(), package.split(".")) + model_package = __import__(package, + globals(), locals(), package.split(".")) instance = getattr(model_package, class_name) return instance @@ -170,8 +170,8 @@ def lazy_instance_by_fliename(abs, class_name): sys.path.append(dirname) package = os.path.splitext(os.path.basename(abs))[0] - model_package = __import__( - package, globals(), locals(), package.split(".")) + model_package = __import__(package, + globals(), locals(), package.split(".")) instance = getattr(model_package, class_name) return instance @@ -189,8 +189,7 @@ def get_platform(): def find_free_port(): def __free_port(): - with closing(socket.socket(socket.AF_INET, - socket.SOCK_STREAM)) as s: + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: s.bind(('', 0)) return s.getsockname()[1] diff --git a/core/utils/fs.py b/core/utils/fs.py index 836c6f598b9c423b0922e30f536a669c55e83098..fab84496c5761e4214f4e5bb3666960408abf68c 100755 --- a/core/utils/fs.py +++ b/core/utils/fs.py @@ -18,7 +18,7 @@ from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient def is_afs_path(path): - """R + """Return True if path is an afs/hdfs path. """ if path.startswith("afs") or path.startswith("hdfs"): return True @@ -133,8 +133,9 @@ class FileHandler(object): if mode.find('a') >= 0: org_content = self._hdfs_client.cat(dest_path) content = content + org_content - self._local_fs_client.write(content, temp_local_file, - mode) # fleet hdfs_client only support upload, so write tmp file + self._local_fs_client.write( + content, temp_local_file, mode + ) # fleet hdfs_client only supports upload, so write a tmp file self._hdfs_client.delete(dest_path + ".tmp")
self._hdfs_client.upload(dest_path + ".tmp", temp_local_file) self._hdfs_client.delete(dest_path + ".bak") @@ -158,7 +159,8 @@ class FileHandler(object): files = [] if is_afs_path(path): files = self._hdfs_client.ls(path) - files = [path + '/' + self.get_file_name(fi) for fi in files] # absulte path + files = [path + '/' + self.get_file_name(fi) + for fi in files] # absulte path else: files = self._local_fs_client.ls(path) files = [path + '/' + fi for fi in files] # absulte path diff --git a/core/utils/util.py b/core/utils/util.py index bd63284873b6c6be80c9849f40535cebe1b7fb14..34f26c6d113faf4739ff621d1087da475414c46f 100755 --- a/core/utils/util.py +++ b/core/utils/util.py @@ -22,6 +22,7 @@ from paddlerec.core.utils import fs as fs def save_program_proto(path, program=None): + if program is None: _program = fluid.default_main_program() else: @@ -175,7 +176,8 @@ class PathGenerator(object): """ if template_name in self._templates: if 'time_format' in param: - str = param['time_format'].strftime(self._templates[template_name]) + str = param['time_format'].strftime(self._templates[ + template_name]) return str.format(**param) return self._templates[template_name].format(**param) else: @@ -198,31 +200,39 @@ class TimeTrainPass(object): self._begin_day = make_datetime(day_fields[0].strip()) if len(day_fields) == 1 or len(day_fields[1]) == 0: # 100 years, meaning to continuous running - self._end_day = self._begin_day + datetime.timedelta(days=36500) + self._end_day = self._begin_day + datetime.timedelta( + days=36500) else: # example: 2020212+10 run_day = int(day_fields[1].strip()) - self._end_day = self._begin_day + datetime.timedelta(days=run_day) + self._end_day = self._begin_day + datetime.timedelta( + days=run_day) else: # example: {20191001..20191031} - days = os.popen("echo -n " + self._config['days']).read().split(" ") + days = os.popen("echo -n " + self._config['days']).read().split( + " ") self._begin_day = make_datetime(days[0]) self._end_day = make_datetime(days[len(days) - 1]) self._checkpoint_interval = self._config['checkpoint_interval'] self._dump_inference_interval = self._config['dump_inference_interval'] - self._interval_per_pass = self._config['train_time_interval'] # train N min data per pass + self._interval_per_pass = self._config[ + 'train_time_interval'] # train N min data per pass self._pass_id = 0 self._inference_pass_id = 0 self._pass_donefile_handler = None if 'pass_donefile_name' in self._config: - self._train_pass_donefile = global_config['output_path'] + '/' + self._config['pass_donefile_name'] + self._train_pass_donefile = global_config[ + 'output_path'] + '/' + self._config['pass_donefile_name'] if fs.is_afs_path(self._train_pass_donefile): - self._pass_donefile_handler = fs.FileHandler(global_config['io']['afs']) + self._pass_donefile_handler = fs.FileHandler(global_config[ + 'io']['afs']) else: - self._pass_donefile_handler = fs.FileHandler(global_config['io']['local_fs']) + self._pass_donefile_handler = fs.FileHandler(global_config[ + 'io']['local_fs']) - last_done = self._pass_donefile_handler.cat(self._train_pass_donefile).strip().split('\n')[-1] + last_done = self._pass_donefile_handler.cat( + self._train_pass_donefile).strip().split('\n')[-1] done_fileds = last_done.split('\t') if len(done_fileds) > 4: self._base_key = done_fileds[1] @@ -236,15 +246,18 @@ class TimeTrainPass(object): """ return 24 * 60 / self._interval_per_pass - def save_train_progress(self, day, pass_id, base_key, model_path, is_checkpoint): + def save_train_progress(self, day, 
pass_id, base_key, model_path, + is_checkpoint): """R """ if is_checkpoint: self._checkpoint_pass_id = pass_id self._checkpoint_model_path = model_path - done_content = "%s\t%s\t%s\t%s\t%d\n" % (day, base_key, - self._checkpoint_model_path, self._checkpoint_pass_id, pass_id) - self._pass_donefile_handler.write(done_content, self._train_pass_donefile, 'a') + done_content = "%s\t%s\t%s\t%s\t%d\n" % ( + day, base_key, self._checkpoint_model_path, + self._checkpoint_pass_id, pass_id) + self._pass_donefile_handler.write(done_content, + self._train_pass_donefile, 'a') pass def init_pass_by_id(self, date_str, pass_id): @@ -286,12 +299,14 @@ class TimeTrainPass(object): if self._pass_id < 1: self.init_pass_by_time(self._begin_day.strftime("%Y%m%d%H%M")) else: - next_time = self._current_train_time + datetime.timedelta(minutes=self._interval_per_pass) + next_time = self._current_train_time + datetime.timedelta( + minutes=self._interval_per_pass) if (next_time - self._end_day).total_seconds() > 0: has_next = False else: self.init_pass_by_time(next_time.strftime("%Y%m%d%H%M")) - if has_next and (self._inference_pass_id < self._pass_id or self._pass_id < old_pass_id): + if has_next and (self._inference_pass_id < self._pass_id or + self._pass_id < old_pass_id): self._inference_pass_id = self._pass_id - 1 return has_next @@ -319,9 +334,11 @@ class TimeTrainPass(object): Return: date(current_train_time + delta_day) """ - return (self._current_train_time + datetime.timedelta(days=delta_day)).strftime("%Y%m%d") + return (self._current_train_time + datetime.timedelta(days=delta_day) + ).strftime("%Y%m%d") def timestamp(self, delta_day=0): """R """ - return (self._current_train_time + datetime.timedelta(days=delta_day)).timestamp() + return (self._current_train_time + datetime.timedelta(days=delta_day) + ).timestamp() diff --git a/doc/__init__.py b/doc/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755 --- a/doc/__init__.py +++ b/doc/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
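The core/utils/util.py hunks above only reflow PathGenerator.generate_path and the TimeTrainPass bookkeeping; the behavior is unchanged: generate_path first expands strftime codes in the template against the pass's time_format and then applies str.format, while TimeTrainPass slices each day into 24 * 60 / train_time_interval passes. A minimal sketch of how the two compose (the helper, template string, and values below are illustrative, not part of the patch):

```python
import datetime

def generate_path(template, time_format, **params):
    # Same order as PathGenerator above: strftime first, then str.format().
    return time_format.strftime(template).format(**params)

interval_per_pass = 10  # hypothetical train_time_interval: 10 minutes of data per pass
max_pass_num_day = 24 * 60 // interval_per_pass  # passes per day

now = datetime.datetime(2020, 5, 10, 0, 0)
print(generate_path("/output/{name}/%Y%m%d/delta-%H%M", now, name="join"))
# /output/join/20200510/delta-0000
print(max_pass_num_day)  # 144
```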
diff --git a/doc/benchmark.md b/doc/benchmark.md index b16e26c71888d590f00f13782449acf840c4b6ee..2aaea25d6941043d24fcee31c7117a3b34c4f525 100644 --- a/doc/benchmark.md +++ b/doc/benchmark.md @@ -1,2 +1,2 @@ # PaddleRec Benchmark -> Placeholder \ No newline at end of file +> Placeholder diff --git a/doc/contribute.md b/doc/contribute.md index a9bd1910021e78573f1d9fd99f66404b77927737..26770d8ac0b64e9835f7398768d0f81de9383132 100644 --- a/doc/contribute.md +++ b/doc/contribute.md @@ -1,2 +1,2 @@ # Contributing Code to PaddleRec -> Placeholder \ No newline at end of file +> Placeholder diff --git a/doc/design.md b/doc/design.md index 2975d77f14e461547921f74b9ced5cf73703e2e7..a442bd16a25301178538f482cd537a4ca23bc395 100644 --- a/doc/design.md +++ b/doc/design.md @@ -279,4 +279,4 @@ class Metric(object): pass ``` -To compute and output a global metric, inherit from this class and implement the four member functions above; for a concrete example, see [auc_metric.py](../core/metrics/auc_metrics.py) \ No newline at end of file +To compute and output a global metric, inherit from this class and implement the four member functions above; for a concrete example, see [auc_metric.py](../core/metrics/auc_metrics.py) diff --git a/doc/distributed_train.md b/doc/distributed_train.md index 425f141ab76e173a9484dff90cd5cfb55acaf853..339c5a83ffd26f9416a67a02390a11ba4c87c29d 100644 --- a/doc/distributed_train.md +++ b/doc/distributed_train.md @@ -7,5 +7,3 @@ ### Distributed training on a K8S cluster > Placeholder - - diff --git a/doc/faq.md b/doc/faq.md index f7ca7cc4a7c366a2a828496eae3f12d1dea17b7e..60790140877b6b11add29552e02c0a435da75f87 100644 --- a/doc/faq.md +++ b/doc/faq.md @@ -1,2 +1,2 @@ # FAQ -> Placeholder \ No newline at end of file +> Placeholder diff --git a/doc/local_train.md b/doc/local_train.md index 4a43fa5520ca745badc4d2a49710763eac6e7a0a..e65255ebf7e14933f52f9977b2ecec48dabbb76e 100644 --- a/doc/local_train.md +++ b/doc/local_train.md @@ -1,2 +1,2 @@ # PaddleRec Single-machine Training -> Placeholder \ No newline at end of file +> Placeholder diff --git a/doc/model_list.md b/doc/model_list.md index 9e68d9f6d2f8e9361cc13b9e76f28426062943bc..b46687a60475fbd309f01050194510b21b060f17 100644 --- a/doc/model_list.md +++ b/doc/model_list.md @@ -12,4 +12,3 @@ | Multi-task | [ESMM]() | ✓ | x | ✓ | x | ✓ | ✓ | | Matching | [DSSM]() | ✓ | x | ✓ | x | ✓ | ✓ | | Matching | [Multiview-Simnet]() | ✓ | x | ✓ | x | ✓ | ✓ | - diff --git a/doc/optimization_model.md b/doc/optimization_model.md index b516f8958053b1b2bd6982b71c699e8baf69f8d9..e63f45b62b50db55f1c6c0d48c7ca23b016b74d3 100644 --- a/doc/optimization_model.md +++ b/doc/optimization_model.md @@ -1,2 +1,2 @@ # PaddleRec Model Tuning -> Placeholder \ No newline at end of file +> Placeholder diff --git a/doc/predict.md b/doc/predict.md index a33eda43ec6aed8ebe628f0540327b707055970d..07160e1f0e7563276c33e514d006dd3747492f90 100644 --- a/doc/predict.md +++ b/doc/predict.md @@ -1 +1 @@ -# PaddleRec Offline Inference \ No newline at end of file +# PaddleRec Offline Inference diff --git a/doc/ps_background.md b/doc/ps_background.md index 984e1b00c96242843cceaf68cf15bb8deb52c391..e5f2e320940763986351fefd21a5e1f1363b6104 100644 --- a/doc/ps_background.md +++ b/doc/ps_background.md @@ -5,4 +5,3 @@ ## [Parameter Server Training](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839) - diff --git a/models/contentunderstanding/__init__.py b/models/contentunderstanding/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755 --- a/models/contentunderstanding/__init__.py +++ b/models/contentunderstanding/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/contentunderstanding/classification/config.yaml b/models/contentunderstanding/classification/config.yaml index d1748137f0c4d994b3a566debf43dbdc2c3d66dc..ef55cd18e8fd45829acd2f479c661f27decfda71 100644 --- a/models/contentunderstanding/classification/config.yaml +++ b/models/contentunderstanding/classification/config.yaml @@ -37,4 +37,3 @@ train: dirname: "inference" epoch_interval: 100 save_last: True - diff --git a/models/contentunderstanding/classification/model.py b/models/contentunderstanding/classification/model.py index 9e853aa01d4a0b6bd5c7a20d8e13164bd9905ad0..23c51d44d7d839d9db30f8129c3e42449a6a80d4 100644 --- a/models/contentunderstanding/classification/model.py +++ b/models/contentunderstanding/classification/model.py @@ -31,7 +31,8 @@ class Model(ModelBase): def train_net(self): """ network definition """ - data = fluid.data(name="input", shape=[None, self.max_len], dtype='int64') + data = fluid.data( + name="input", shape=[None, self.max_len], dtype='int64') label = fluid.data(name="label", shape=[None, 1], dtype='int64') seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64') @@ -51,7 +52,9 @@ class Model(ModelBase): # full connect layer fc_1 = fluid.layers.fc(input=[conv], size=self.hid_dim) # softmax layer - prediction = fluid.layers.fc(input=[fc_1], size=self.class_dim, act="softmax") + prediction = fluid.layers.fc(input=[fc_1], + size=self.class_dim, + act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) avg_cost = fluid.layers.mean(x=cost) acc = fluid.layers.accuracy(input=prediction, label=label) diff --git a/models/contentunderstanding/classification/reader.py b/models/contentunderstanding/classification/reader.py index 136a5668856c0fb558a016a3bc3a0b8a56651d3b..1c8e86cdb49f1cc89c9c4f413cbd7b117b55aa55 100644 --- a/models/contentunderstanding/classification/reader.py +++ b/models/contentunderstanding/classification/reader.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import sys from paddlerec.core.reader import Reader @@ -38,7 +37,8 @@ class TrainReader(Reader): data = [int(i) for i in data] label = [int(i) for i in label] seq_len = [int(i) for i in seq_len] - print >> sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)]) + print >> sys.stderr, str( + [('data', data), ('label', label), ('seq_len', seq_len)]) yield [('data', data), ('label', label), ('seq_len', seq_len)] return data_iter diff --git a/models/contentunderstanding/readme.md b/models/contentunderstanding/readme.md index 07ec96f2414881998617f048c1e82a3f0d9cda75..deefbd2eb02f08d7fac810eb40ae78ff1a173baf 100644 --- a/models/contentunderstanding/readme.md +++ b/models/contentunderstanding/readme.md @@ -87,4 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification | :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: | | ag news dataset | TagSpace | -- | -- | -- | -- | | -- | Classification | -- | -- | -- | -- | - diff --git a/models/contentunderstanding/tagspace/config.yaml b/models/contentunderstanding/tagspace/config.yaml index 70333fcbf7edf4b6b5f54145e29cb122ed3ae9c6..19fbf277d66445c44287856512cb0b13777dc251 100644 --- a/models/contentunderstanding/tagspace/config.yaml +++ b/models/contentunderstanding/tagspace/config.yaml @@ -47,4 +47,3 @@ train: dirname: "inference" epoch_interval: 100 save_last: True - diff --git a/models/contentunderstanding/tagspace/model.py b/models/contentunderstanding/tagspace/model.py index 033d51b8f5d50ddcb1199f566b679eff61acfccb..2948d2e3d5f4a5d5afdbb9744f235b5db59e6bae 100644 --- a/models/contentunderstanding/tagspace/model.py +++ b/models/contentunderstanding/tagspace/model.py @@ -26,8 +26,10 @@ class Model(ModelBase): ModelBase.__init__(self, config) self.cost = None self.metrics = {} - self.vocab_text_size = envs.get_global_env("vocab_text_size", None, self._namespace) - self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, self._namespace) + self.vocab_text_size = envs.get_global_env("vocab_text_size", None, + self._namespace) + self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, + self._namespace) self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace) self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace) self.win_size = envs.get_global_env("win_size", None, self._namespace) @@ -35,8 +37,9 @@ class Model(ModelBase): self.neg_size = envs.get_global_env("neg_size", None, self._namespace) def train_net(self): - """ network definition """ - text = fluid.data(name="text", shape=[None, 1], lod_level=1, dtype='int64') + """ network""" + text = fluid.data( + name="text", shape=[None, 1], lod_level=1, dtype='int64') pos_tag = fluid.data( name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64') neg_tag = fluid.data( @@ -45,13 +48,19 @@ class Model(ModelBase): self._data_var = [text, pos_tag, neg_tag] text_emb = fluid.embedding( - input=text, size=[self.vocab_text_size, self.emb_dim], param_attr="text_emb") + input=text, + size=[self.vocab_text_size, self.emb_dim], + param_attr="text_emb") text_emb = fluid.layers.squeeze(input=text_emb, axes=[1]) pos_tag_emb = fluid.embedding( - input=pos_tag, size=[self.vocab_tag_size, self.emb_dim], param_attr="tag_emb") + input=pos_tag, + size=[self.vocab_tag_size, self.emb_dim], + param_attr="tag_emb") pos_tag_emb = fluid.layers.squeeze(input=pos_tag_emb, axes=[1]) neg_tag_emb = fluid.embedding( - input=neg_tag, size=[self.vocab_tag_size, self.emb_dim], param_attr="tag_emb") 
+ input=neg_tag, + size=[self.vocab_tag_size, self.emb_dim], + param_attr="tag_emb") neg_tag_emb = fluid.layers.squeeze(input=neg_tag_emb, axes=[1]) conv_1d = fluid.nets.sequence_conv_pool( @@ -65,7 +74,8 @@ class Model(ModelBase): size=self.emb_dim, param_attr="text_hid") cos_pos = nn.cos_sim(pos_tag_emb, text_hid) - mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb) + mul_text_hid = fluid.layers.sequence_expand_as( + x=text_hid, y=neg_tag_emb) mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid) cos_neg_all = fluid.layers.sequence_reshape( input=mul_cos_neg, new_dim=self.neg_size) @@ -74,7 +84,10 @@ class Model(ModelBase): #calculate hinge loss loss_part1 = nn.elementwise_sub( tensor.fill_constant_batch_size_like( - input=cos_pos, shape=[-1, 1], value=self.margin, dtype='float32'), + input=cos_pos, + shape=[-1, 1], + value=self.margin, + dtype='float32'), cos_pos) loss_part2 = nn.elementwise_add(loss_part1, cos_neg) loss_part3 = nn.elementwise_max( @@ -85,7 +98,7 @@ class Model(ModelBase): less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32') correct = nn.reduce_sum(less) self.cost = avg_cost - + self.metrics["correct"] = correct self.metrics["cos_pos"] = cos_pos @@ -96,7 +109,8 @@ class Model(ModelBase): return self.metrics def optimizer(self): - learning_rate = envs.get_global_env("hyper_parameters.base_lr", None, self._namespace) + learning_rate = envs.get_global_env("hyper_parameters.base_lr", None, + self._namespace) sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate) return sgd_optimizer diff --git a/models/contentunderstanding/tagspace/reader.py b/models/contentunderstanding/tagspace/reader.py index 0f63b85fd1a322b55c6d0e451fe61ff90c82eaa5..3bf704f17adbafc28302ec0b64180ec3fddf6d01 100644 --- a/models/contentunderstanding/tagspace/reader.py +++ b/models/contentunderstanding/tagspace/reader.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import sys import numpy as np diff --git a/models/match/__init__.py b/models/match/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755 --- a/models/match/__init__.py +++ b/models/match/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
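The tagspace/model.py hunks above re-wrap the hinge-loss construction without changing the math: the fill_constant_batch_size_like / elementwise_sub / elementwise_add / elementwise_max chain computes max(0, margin - cos_pos + cos_neg) per example, and `correct` counts the examples where the positive tag still outranks the negative. A NumPy sketch of that loss under toy scores (one negative score per example; illustrative only, not part of the patch):

```python
import numpy as np

margin = 0.1
cos_pos = np.array([[0.8], [0.3]])  # cos_sim(text_hid, pos_tag_emb)
cos_neg = np.array([[0.2], [0.5]])  # cos_sim against the negative tag

loss = np.maximum(0.0, margin - cos_pos + cos_neg)  # hinge loss per example
avg_cost = loss.mean()
correct = (cos_neg < cos_pos).astype("float32").sum()

print(avg_cost)  # 0.15 = (0.0 + 0.3) / 2
print(correct)   # 1.0: only the first example ranks the positive tag higher
```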
diff --git a/models/match/dssm/model.py b/models/match/dssm/model.py index 630fb3eeef062bdfda7720c2c54dd884ec033a71..05d6f762cb266b4cbe40c9a972aafe1885af5b86 100755 --- a/models/match/dssm/model.py +++ b/models/match/dssm/model.py @@ -23,13 +23,26 @@ class Model(ModelBase): ModelBase.__init__(self, config) def input(self): - TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace) - Neg = envs.get_global_env("hyper_parameters.NEG", None, self._namespace) - - self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) - self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) - self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i - in range(Neg)] + TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, + self._namespace) + + Neg = envs.get_global_env("hyper_parameters.NEG", None, + self._namespace) + + self.query = fluid.data( + name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) + self.doc_pos = fluid.data( + name="doc_pos", + shape=[-1, TRIGRAM_D], + dtype='float32', + lod_level=0) + self.doc_negs = [ + fluid.data( + name="doc_neg_" + str(i), + shape=[-1, TRIGRAM_D], + dtype="float32", + lod_level=0) for i in range(Neg) + ] self._data_var.append(self.query) self._data_var.append(self.doc_pos) for input in self.doc_negs: @@ -37,16 +50,24 @@ class Model(ModelBase): if self._platform != "LINUX": self._data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._data_var, + capacity=64, + use_double_buffer=False, + iterable=False) def net(self, is_infer=False): - hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace) - hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None, self._namespace) + hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, + self._namespace) + hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None, + self._namespace) def fc(data, hidden_layers, hidden_acts, names): fc_inputs = [data] for i in range(len(hidden_layers)): - xavier = fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i]) + xavier = fluid.initializer.Xavier( + uniform=True, + fan_in=fc_inputs[-1].shape[1], + fan_out=hidden_layers[i]) out = fluid.layers.fc(input=fc_inputs[-1], size=hidden_layers[i], act=hidden_acts[i], @@ -56,8 +77,10 @@ class Model(ModelBase): fc_inputs.append(out) return fc_inputs[-1] - query_fc = fc(self.query, hidden_layers, hidden_acts, ['query_l1', 'query_l2', 'query_l3']) - doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts, ['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3']) + query_fc = fc(self.query, hidden_layers, hidden_acts, + ['query_l1', 'query_l2', 'query_l3']) + doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts, + ['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3']) self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc) if is_infer: @@ -65,13 +88,17 @@ class Model(ModelBase): R_Q_D_ns = [] for i, doc_neg in enumerate(self.doc_negs): - doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, - ['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)]) + doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, [ + 'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), + 'doc_neg_l3_' + str(i) + ]) R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i)) - concat_Rs = 
fluid.layers.concat(input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1) + concat_Rs = fluid.layers.concat( + input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1) prob = fluid.layers.softmax(concat_Rs, axis=1) - hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[4, 1]) + hit_prob = fluid.layers.slice( + prob, axes=[0, 1], starts=[0, 0], ends=[4, 1]) loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob)) self.avg_cost = fluid.layers.mean(x=loss) @@ -91,18 +118,28 @@ class Model(ModelBase): self.metrics() def optimizer(self): - learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) + learning_rate = envs.get_global_env("hyper_parameters.learning_rate", + None, self._namespace) optimizer = fluid.optimizer.SGD(learning_rate) return optimizer def infer_input(self): - TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace) - self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) - self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) + TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, + self._namespace) + self.query = fluid.data( + name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) + self.doc_pos = fluid.data( + name="doc_pos", + shape=[-1, TRIGRAM_D], + dtype='float32', + lod_level=0) self._infer_data_var = [self.query, self.doc_pos] self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._infer_data_var, + capacity=64, + use_double_buffer=False, + iterable=False) def infer_net(self): self.infer_input() diff --git a/models/match/multiview-simnet/data_process.sh b/models/match/multiview-simnet/data_process.sh index 91548c5863063f7a23cb2f713a3754b121b235b5..c8633cc7a41f62a29eee1778251b72a6f3b601eb 100755 --- a/models/match/multiview-simnet/data_process.sh +++ b/models/match/multiview-simnet/data_process.sh @@ -22,4 +22,3 @@ mkdir -p data/train mkdir -p data/test python generate_synthetic_data.py - diff --git a/models/match/multiview-simnet/evaluate_reader.py b/models/match/multiview-simnet/evaluate_reader.py index e0f8f9e43de80d003834056ea417914f1d10e898..d77032f3ca4e07cbbf20874f79023dc4a6fed8b4 100755 --- a/models/match/multiview-simnet/evaluate_reader.py +++ b/models/match/multiview-simnet/evaluate_reader.py @@ -18,8 +18,10 @@ from paddlerec.core.utils import envs class EvaluateReader(Reader): def init(self): - self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model") - self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model") + self.query_slots = envs.get_global_env("hyper_parameters.query_slots", + None, "train.model") + self.title_slots = envs.get_global_env("hyper_parameters.title_slots", + None, "train.model") self.all_slots = [] for i in range(self.query_slots): diff --git a/models/match/multiview-simnet/generate_synthetic_data.py b/models/match/multiview-simnet/generate_synthetic_data.py index d453e031cdca9be29892b913ea5f2636a6c05f5e..eb60e5c82f9decc2cfcd87da7bc6832ca98ee9d4 100755 --- a/models/match/multiview-simnet/generate_synthetic_data.py +++ b/models/match/multiview-simnet/generate_synthetic_data.py @@ -21,7 +21,11 @@ class Dataset: class SyntheticDataset(Dataset): - def __init__(self, sparse_feature_dim, query_slot_num, title_slot_num, dataset_size=10000): + def __init__(self, + sparse_feature_dim, + query_slot_num, + 
                 title_slot_num,
+                dataset_size=10000):
         # ids are randomly generated
         self.ids_per_slot = 10
         self.sparse_feature_dim = sparse_feature_dim
@@ -46,14 +50,20 @@ class SyntheticDataset(Dataset):
             for i in range(self.title_slot_num):
                 pt_slot = generate_ids(self.ids_per_slot, self.sparse_feature_dim)
-                pt_slot = [str(fea) + ':' + str(i + self.query_slot_num) for fea in pt_slot]
+                pt_slot = [
+                    str(fea) + ':' + str(i + self.query_slot_num)
+                    for fea in pt_slot
+                ]
                 pos_title_slots += pt_slot
             if is_train:
                 for i in range(self.title_slot_num):
                     nt_slot = generate_ids(self.ids_per_slot, self.sparse_feature_dim)
-                    nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in
-                               nt_slot]
+                    nt_slot = [
+                        str(fea) + ':' +
+                        str(i + self.query_slot_num + self.title_slot_num)
+                        for fea in nt_slot
+                    ]
                     neg_title_slots += nt_slot
                 yield query_slots + pos_title_slots + neg_title_slots
             else:
@@ -76,7 +86,8 @@ if __name__ == '__main__':
     query_slots = 1
     title_slots = 1
     dataset_size = 10
-    dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots, dataset_size)
+    dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots,
+                               dataset_size)
     train_reader = dataset.train()
     test_reader = dataset.test()
diff --git a/models/match/multiview-simnet/model.py b/models/match/multiview-simnet/model.py
index 5ba9fb5d05b27339d924bfe42c0e6ba0c2c68da3..f80a1cd0390f3c7aafc772ef535eb36b9657b439 100755
--- a/models/match/multiview-simnet/model.py
+++ b/models/match/multiview-simnet/model.py
@@ -103,12 +103,18 @@ class Model(ModelBase):
     def init_config(self):
         self._fetch_interval = 1
-        query_encoder = envs.get_global_env("hyper_parameters.query_encoder", None, self._namespace)
-        title_encoder = envs.get_global_env("hyper_parameters.title_encoder", None, self._namespace)
-        query_encode_dim = envs.get_global_env("hyper_parameters.query_encode_dim", None, self._namespace)
-        title_encode_dim = envs.get_global_env("hyper_parameters.title_encode_dim", None, self._namespace)
-        query_slots = envs.get_global_env("hyper_parameters.query_slots", None, self._namespace)
-        title_slots = envs.get_global_env("hyper_parameters.title_slots", None, self._namespace)
+        query_encoder = envs.get_global_env("hyper_parameters.query_encoder",
+                                            None, self._namespace)
+        title_encoder = envs.get_global_env("hyper_parameters.title_encoder",
+                                            None, self._namespace)
+        query_encode_dim = envs.get_global_env(
+            "hyper_parameters.query_encode_dim", None, self._namespace)
+        title_encode_dim = envs.get_global_env(
+            "hyper_parameters.title_encode_dim", None, self._namespace)
+        query_slots = envs.get_global_env("hyper_parameters.query_slots", None,
+                                          self._namespace)
+        title_slots = envs.get_global_env("hyper_parameters.title_slots", None,
+                                          self._namespace)
         factory = SimpleEncoderFactory()
         self.query_encoders = [
             factory.create(query_encoder, query_encode_dim)
@@ -119,10 +125,13 @@ class Model(ModelBase):
             for i in range(title_slots)
         ]
-        self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
-        self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim", None, self._namespace)
+        self.emb_size = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim", None, self._namespace)
+        self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim",
+                                           None, self._namespace)
         self.emb_shape = [self.emb_size, self.emb_dim]
-        self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace)
+        self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size",
+                                               None, self._namespace)
         self.margin = 0.1
 
     def input(self, is_train=True):
@@ -133,8 +142,10 @@ class Model(ModelBase):
         ]
         self.pt_slots = [
             fluid.data(
-                name="%d" % (i + len(self.query_encoders)), shape=[None, 1], lod_level=1, dtype='int64')
-            for i in range(len(self.title_encoders))
+                name="%d" % (i + len(self.query_encoders)),
+                shape=[None, 1],
+                lod_level=1,
+                dtype='int64') for i in range(len(self.title_encoders))
         ]
 
         if is_train == False:
@@ -142,9 +153,11 @@
         self.nt_slots = [
             fluid.data(
-                name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1,
-                dtype='int64')
-            for i in range(len(self.title_encoders))
+                name="%d" %
+                (i + len(self.query_encoders) + len(self.title_encoders)),
+                shape=[None, 1],
+                lod_level=1,
+                dtype='int64') for i in range(len(self.title_encoders))
         ]
 
         return self.q_slots + self.pt_slots + self.nt_slots
@@ -153,11 +166,15 @@
         res = self.input()
         self._data_var = res
 
-        use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace)
+        use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
+                                             False, self._namespace)
 
         if self._platform != "LINUX" or use_dataloader:
             self._data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=self._data_var, capacity=256, use_double_buffer=False, iterable=False)
+                feed_list=self._data_var,
+                capacity=256,
+                use_double_buffer=False,
+                iterable=False)
 
     def get_acc(self, x, y):
         less = tensor.cast(cf.less_than(x, y), dtype='float32')
@@ -190,10 +207,12 @@
             self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
         ]
         pt_encodes = [
-            self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
+            self.title_encoders[i].forward(emb)
+            for i, emb in enumerate(pt_embs)
         ]
         nt_encodes = [
-            self.title_encoders[i].forward(emb) for i, emb in enumerate(nt_embs)
+            self.title_encoders[i].forward(emb)
+            for i, emb in enumerate(nt_embs)
         ]
 
         # concat multi view for query, pos_title, neg_title
@@ -252,7 +271,8 @@
         self.metrics()
 
     def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
+        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
+                                            None, self._namespace)
         optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
         return optimizer
@@ -261,7 +281,10 @@
         self._infer_data_var = res
 
         self._infer_data_loader = fluid.io.DataLoader.from_generator(
-            feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
+            feed_list=self._infer_data_var,
+            capacity=64,
+            use_double_buffer=False,
+            iterable=False)
 
     def infer_net(self):
         self.infer_input()
@@ -281,7 +304,8 @@
             self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
         ]
         pt_encodes = [
-            self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
+            self.title_encoders[i].forward(emb)
+            for i, emb in enumerate(pt_embs)
         ]
         # concat multi view for query, pos_title, neg_title
         q_concat = fluid.layers.concat(q_encodes)
diff --git a/models/match/multiview-simnet/reader.py b/models/match/multiview-simnet/reader.py
index 43cd1a629a7540e727e423a98d497964203134ac..4c0e42a44b0ea05272c832d65a6cfbc0d3f6c495 100755
--- a/models/match/multiview-simnet/reader.py
+++ b/models/match/multiview-simnet/reader.py
@@ -18,8 +18,10 @@ from paddlerec.core.utils import envs
 
 class TrainReader(Reader):
     def init(self):
-        self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model")
-        self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model")
+        self.query_slots = envs.get_global_env("hyper_parameters.query_slots",
+                                               None, "train.model")
+        self.title_slots = envs.get_global_env("hyper_parameters.title_slots",
+                                               None, "train.model")
 
         self.all_slots = []
         for i in range(self.query_slots):
diff --git a/models/match/readme.md b/models/match/readme.md
index 6bccc109ff14582e816dee64b72b786a1e90f49e..d9f91b257d81ffde820a04cad49b56edbd903f6a 100755
--- a/models/match/readme.md
+++ b/models/match/readme.md
@@ -37,4 +37,3 @@ python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
 python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
 ```
-
diff --git a/models/multitask/__init__.py b/models/multitask/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755
--- a/models/multitask/__init__.py
+++ b/models/multitask/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/models/multitask/esmm/esmm_infer_reader.py b/models/multitask/esmm/esmm_infer_reader.py
index 8ca9eca67fdbb9e11f39db34b5dd9cfae518773b..70e3e989df611419f378a8920b499e42690d1cae 100644
--- a/models/multitask/esmm/esmm_infer_reader.py
+++ b/models/multitask/esmm/esmm_infer_reader.py
@@ -20,9 +20,11 @@ from paddlerec.core.reader import Reader
 
 class EvaluateReader(Reader):
     def init(self):
-        all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128',
-                        '129',
-                        '205', '206', '207', '210', '216', '508', '509', '702', '853', '301']
+        all_field_id = [
+            '101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
+            '125', '126', '127', '128', '129', '205', '206', '207', '210',
+            '216', '508', '509', '702', '853', '301'
+        ]
         self.all_field_id_dict = defaultdict(int)
         for i, field_id in enumerate(all_field_id):
             self.all_field_id_dict[field_id] = [False, i]
diff --git a/models/multitask/esmm/esmm_reader.py b/models/multitask/esmm/esmm_reader.py
index 3d663038eefb4971b466336601ba436ff884e580..036e146ee923b6feda6398c7dcd49486eac51c50 100644
--- a/models/multitask/esmm/esmm_reader.py
+++ b/models/multitask/esmm/esmm_reader.py
@@ -21,9 +21,11 @@ from paddlerec.core.reader import Reader
 
 class TrainReader(Reader):
     def init(self):
-        all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128',
-                        '129',
-                        '205', '206', '207', '210', '216', '508', '509', '702', '853', '301']
+        all_field_id = [
+            '101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124',
+            '125', '126', '127', '128', '129', '205', '206', '207', '210',
+            '216', '508', '509', '702', '853', '301'
+        ]
         self.all_field_id_dict = defaultdict(int)
         for i, field_id in enumerate(all_field_id):
             self.all_field_id_dict[field_id] = [False, i]
diff --git a/models/multitask/esmm/model.py b/models/multitask/esmm/model.py
index 8a8a203a87504cff310c0a799df40e937e2bbde8..71c6539579504407a22f3174407b517f9d9a55b5 100644
--- a/models/multitask/esmm/model.py
+++ b/models/multitask/esmm/model.py
@@ -28,11 +28,13 @@ class Model(ModelBase):
         init_stddev = 1.0
         scales = 1.0 / np.sqrt(data.shape[1])
 
-        p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag,
-                                            initializer=fluid.initializer.NormalInitializer(loc=0.0,
-                                                                                            scale=init_stddev * scales))
+        p_attr = fluid.param_attr.ParamAttr(
+            name='%s_weight' % tag,
+            initializer=fluid.initializer.NormalInitializer(
+                loc=0.0, scale=init_stddev * scales))
 
-        b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
+        b_attr = fluid.ParamAttr(
+            name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
 
         out = fluid.layers.fc(input=data,
                               size=out_dim,
@@ -44,7 +46,11 @@ class Model(ModelBase):
 
     def input_data(self):
         sparse_input_ids = [
-            fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1) for i in range(0, 23)
+            fluid.data(
+                name="field_" + str(i),
+                shape=[-1, 1],
+                dtype="int64",
+                lod_level=1) for i in range(0, 23)
         ]
         label_ctr = fluid.data(name="ctr", shape=[-1, 1], dtype="int64")
         label_cvr = fluid.data(name="cvr", shape=[-1, 1], dtype="int64")
@@ -55,19 +61,23 @@ class Model(ModelBase):
 
     def net(self, inputs, is_infer=False):
-        vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
-        embed_size = envs.get_global_env("hyper_parameters.embed_size", None, self._namespace)
+        vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None,
+                                         self._namespace)
+        embed_size = envs.get_global_env("hyper_parameters.embed_size", None,
+                                         self._namespace)
         emb = []
         for data in inputs[0:-2]:
-            feat_emb = fluid.embedding(input=data,
-                                       size=[vocab_size, embed_size],
-                                       param_attr=fluid.ParamAttr(name='dis_emb',
-                                                                  learning_rate=5,
-                                                                  initializer=fluid.initializer.Xavier(
-                                                                      fan_in=embed_size, fan_out=embed_size)
-                                                                  ),
-                                       is_sparse=True)
-            field_emb = fluid.layers.sequence_pool(input=feat_emb, pool_type='sum')
+            feat_emb = fluid.embedding(
+                input=data,
+                size=[vocab_size, embed_size],
+                param_attr=fluid.ParamAttr(
+                    name='dis_emb',
+                    learning_rate=5,
+                    initializer=fluid.initializer.Xavier(
+                        fan_in=embed_size, fan_out=embed_size)),
+                is_sparse=True)
+            field_emb = fluid.layers.sequence_pool(
+                input=feat_emb, pool_type='sum')
             emb.append(field_emb)
         concat_emb = fluid.layers.concat(emb, axis=1)
@@ -85,14 +95,20 @@ class Model(ModelBase):
         ctr_clk = inputs[-2]
         ctcvr_buy = inputs[-1]
 
-        ctr_prop_one = fluid.layers.slice(ctr_out, axes=[1], starts=[1], ends=[2])
-        cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2])
+        ctr_prop_one = fluid.layers.slice(
+            ctr_out, axes=[1], starts=[1], ends=[2])
+        cvr_prop_one = fluid.layers.slice(
+            cvr_out, axes=[1], starts=[1], ends=[2])
 
-        ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one)
-        ctcvr_prop = fluid.layers.concat(input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)
+        ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one,
+                                                      cvr_prop_one)
+        ctcvr_prop = fluid.layers.concat(
+            input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1)
 
-        auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk)
-        auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy)
+        auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(
+            input=ctr_out, label=ctr_clk)
+        auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(
+            input=ctcvr_prop, label=ctcvr_buy)
 
         if is_infer:
             self._infer_results["AUC_ctr"] = auc_ctr
@@ -100,7 +116,8 @@ class Model(ModelBase):
             return
 
         loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk)
-        loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
+        loss_ctcvr = fluid.layers.cross_entropy(
+            input=ctcvr_prop, label=ctcvr_buy)
         cost = loss_ctr + loss_ctcvr
         avg_cost = fluid.layers.mean(cost)
@@ -117,5 +134,8 @@ class Model(ModelBase):
     def infer_net(self):
         self._infer_data_var = self.input_data()
         self._infer_data_loader = fluid.io.DataLoader.from_generator(
-            feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
+            feed_list=self._infer_data_var,
+            capacity=64,
+            use_double_buffer=False,
+            iterable=False)
         self.net(self._infer_data_var, is_infer=True)
diff --git a/models/multitask/mmoe/census_infer_reader.py b/models/multitask/mmoe/census_infer_reader.py
index c62de8e69ce6ccfbb4df1e1252d9630a84fc56b3..fada3990fdcc756a2938c5a4fd763f022dda53c4 100644
--- a/models/multitask/mmoe/census_infer_reader.py
+++ b/models/multitask/mmoe/census_infer_reader.py
@@ -19,6 +19,7 @@ from paddlerec.core.reader import Reader
 
 class EvaluateReader(Reader):
     def init(self):
+        pass
 
     def generate_sample(self, line):
diff --git a/models/multitask/mmoe/census_reader.py b/models/multitask/mmoe/census_reader.py
index 211e566882e5d8a7f50f22b0a1628307777099c8..d71133bd91692c8b17e7449aa305e5241db7777a 100644
--- a/models/multitask/mmoe/census_reader.py
+++ b/models/multitask/mmoe/census_reader.py
@@ -24,6 +24,7 @@ class TrainReader(Reader):
     def generate_sample(self, line):
         """
         Read the data line by line and process it as a dictionary
+
         """
 
         def reader():
diff --git a/models/multitask/mmoe/model.py b/models/multitask/mmoe/model.py
index 525e9d5cc0086757901262253cf0f23ee72f314c..035733690f46960906c902dbe240603acd136565 100644
--- a/models/multitask/mmoe/model.py
+++ b/models/multitask/mmoe/model.py
@@ -23,44 +23,58 @@ class Model(ModelBase):
         ModelBase.__init__(self, config)
 
     def MMOE(self, is_infer=False):
-
-        feature_size = envs.get_global_env("hyper_parameters.feature_size", None, self._namespace)
-        expert_num = envs.get_global_env("hyper_parameters.expert_num", None, self._namespace)
-        gate_num = envs.get_global_env("hyper_parameters.gate_num", None, self._namespace)
-        expert_size = envs.get_global_env("hyper_parameters.expert_size", None, self._namespace)
-        tower_size = envs.get_global_env("hyper_parameters.tower_size", None, self._namespace)
-
-        input_data = fluid.data(name="input", shape=[-1, feature_size], dtype="float32")
-        label_income = fluid.data(name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
-        label_marital = fluid.data(name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
+        feature_size = envs.get_global_env("hyper_parameters.feature_size",
+                                           None, self._namespace)
+        expert_num = envs.get_global_env("hyper_parameters.expert_num", None,
+                                         self._namespace)
+        gate_num = envs.get_global_env("hyper_parameters.gate_num", None,
+                                       self._namespace)
+        expert_size = envs.get_global_env("hyper_parameters.expert_size", None,
+                                          self._namespace)
+        tower_size = envs.get_global_env("hyper_parameters.tower_size", None,
+                                         self._namespace)
+
+        input_data = fluid.data(
+            name="input", shape=[-1, feature_size], dtype="float32")
+        label_income = fluid.data(
+            name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
+        label_marital = fluid.data(
+            name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
 
         if is_infer:
             self._infer_data_var = [input_data, label_income, label_marital]
             self._infer_data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
+                feed_list=self._infer_data_var,
+                capacity=64,
+                use_double_buffer=False,
+                iterable=False)
 
         self._data_var.extend([input_data, label_income, label_marital])
         # f_{i}(x) = activation(W_{i} * x + b), where activation is ReLU according to the paper
         expert_outputs = []
         for i in range(0, expert_num):
-            expert_output = fluid.layers.fc(input=input_data,
-                                            size=expert_size,
-                                            act='relu',
-                                            bias_attr=fluid.ParamAttr(learning_rate=1.0),
-                                            name='expert_' + str(i))
+            expert_output = fluid.layers.fc(
+                input=input_data,
+                size=expert_size,
+                act='relu',
+                bias_attr=fluid.ParamAttr(learning_rate=1.0),
+                name='expert_' + str(i))
             expert_outputs.append(expert_output)
         expert_concat = fluid.layers.concat(expert_outputs, axis=1)
-        expert_concat = fluid.layers.reshape(expert_concat, [-1, expert_num, expert_size])
+        expert_concat = fluid.layers.reshape(expert_concat,
+                                             [-1, expert_num, expert_size])
 
         # g^{k}(x) = activation(W_{gk} * x + b), where activation is softmax according to the paper
         output_layers = []
         for i in range(0, gate_num):
-            cur_gate = fluid.layers.fc(input=input_data,
-                                       size=expert_num,
-                                       act='softmax',
-                                       bias_attr=fluid.ParamAttr(learning_rate=1.0),
-                                       name='gate_' + str(i))
+            cur_gate = fluid.layers.fc(
+                input=input_data,
+                size=expert_num,
+                act='softmax',
+                bias_attr=fluid.ParamAttr(learning_rate=1.0),
+                name='gate_' + str(i))
             # f^{k}(x) = sum_{i=1}^{n}(g^{k}(x)_{i} * f_{i}(x))
-            cur_gate_expert = fluid.layers.elementwise_mul(expert_concat, cur_gate, axis=0)
+            cur_gate_expert = fluid.layers.elementwise_mul(
+                expert_concat, cur_gate, axis=0)
             cur_gate_expert = fluid.layers.reduce_sum(cur_gate_expert, dim=1)
             # Build tower layer
             cur_tower = fluid.layers.fc(input=cur_gate_expert,
@@ -74,25 +88,33 @@ class Model(ModelBase):
             output_layers.append(out)
 
-        pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
-        pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)
-
-        label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
-        label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
-
-        auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income,
-                                                                 label=fluid.layers.cast(x=label_income_1,
-                                                                                         dtype='int64'))
-        auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
-                                                                  label=fluid.layers.cast(x=label_marital_1,
-                                                                                          dtype='int64'))
+        pred_income = fluid.layers.clip(
+            output_layers[0], min=1e-15, max=1.0 - 1e-15)
+        pred_marital = fluid.layers.clip(
+            output_layers[1], min=1e-15, max=1.0 - 1e-15)
+
+        label_income_1 = fluid.layers.slice(
+            label_income, axes=[1], starts=[1], ends=[2])
+        label_marital_1 = fluid.layers.slice(
+            label_marital, axes=[1], starts=[1], ends=[2])
+
+        auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
+            input=pred_income,
+            label=fluid.layers.cast(
+                x=label_income_1, dtype='int64'))
+        auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
+            input=pred_marital,
+            label=fluid.layers.cast(
+                x=label_marital_1, dtype='int64'))
 
         if is_infer:
             self._infer_results["AUC_income"] = auc_income
             self._infer_results["AUC_marital"] = auc_marital
             return
 
-        cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
-        cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)
+        cost_income = fluid.layers.cross_entropy(
+            input=pred_income, label=label_income, soft_label=True)
+        cost_marital = fluid.layers.cross_entropy(
+            input=pred_marital, label=label_marital, soft_label=True)
 
         avg_cost_income = fluid.layers.mean(x=cost_income)
         avg_cost_marital = fluid.layers.mean(x=cost_marital)
diff --git a/models/multitask/readme.md b/models/multitask/readme.md
index d234f42f146e18bf254e518db0e78acc1e1d3e10..10e0641060f74b67b4987d14a1c4aad27a25b103 100755
--- a/models/multitask/readme.md
+++ b/models/multitask/readme.md
@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
 | Census-income Data | Share-Bottom | -- | 0.93120/0.99256 |
 | Census-income Data | MMoE | -- | 0.94465/0.99324 |
 | Ali-CCP | ESMM | -- | 0.97181/0.49967 |
-
diff --git a/models/multitask/share-bottom/model.py b/models/multitask/share-bottom/model.py
index d570ba77067985b518247c8f6bba16a6431e1f9c..f19ecbe1c43323e30cb9a44eb281f31c68b69909 100644
--- a/models/multitask/share-bottom/model.py
+++ b/models/multitask/share-bottom/model.py
@@ -24,27 +24,38 @@ class Model(ModelBase):
 
     def model(self, is_infer=False):
 
-        feature_size = envs.get_global_env("hyper_parameters.feature_size", None, self._namespace)
-        bottom_size = envs.get_global_env("hyper_parameters.bottom_size", None, self._namespace)
-        tower_size = envs.get_global_env("hyper_parameters.tower_size", None, self._namespace)
-        tower_nums = envs.get_global_env("hyper_parameters.tower_nums", None, self._namespace)
-
-        input_data = fluid.data(name="input", shape=[-1, feature_size], dtype="float32")
-        label_income = fluid.data(name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
-        label_marital = fluid.data(name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
+        feature_size = envs.get_global_env("hyper_parameters.feature_size",
+                                           None, self._namespace)
+        bottom_size = envs.get_global_env("hyper_parameters.bottom_size", None,
+                                          self._namespace)
+        tower_size = envs.get_global_env("hyper_parameters.tower_size", None,
+                                         self._namespace)
+        tower_nums = envs.get_global_env("hyper_parameters.tower_nums", None,
+                                         self._namespace)
+
+        input_data = fluid.data(
+            name="input", shape=[-1, feature_size], dtype="float32")
+        label_income = fluid.data(
+            name="label_income", shape=[-1, 2], dtype="float32", lod_level=0)
+        label_marital = fluid.data(
+            name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0)
 
         if is_infer:
             self._infer_data_var = [input_data, label_income, label_marital]
             self._infer_data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
+                feed_list=self._infer_data_var,
+                capacity=64,
+                use_double_buffer=False,
+                iterable=False)
 
         self._data_var.extend([input_data, label_income, label_marital])
 
-        bottom_output = fluid.layers.fc(input=input_data,
-                                        size=bottom_size,
-                                        act='relu',
-                                        bias_attr=fluid.ParamAttr(learning_rate=1.0),
-                                        name='bottom_output')
+        bottom_output = fluid.layers.fc(
+            input=input_data,
+            size=bottom_size,
+            act='relu',
+            bias_attr=fluid.ParamAttr(learning_rate=1.0),
+            name='bottom_output')
 
         # Build tower layer from bottom layer
         output_layers = []
@@ -59,26 +70,34 @@ class Model(ModelBase):
                                              name='output_layer_' + str(index))
             output_layers.append(output_layer)
 
-        pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15)
-        pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15)
-
-        label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2])
-        label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2])
-
-        auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income,
-                                                                 label=fluid.layers.cast(x=label_income_1,
-                                                                                         dtype='int64'))
-        auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital,
-                                                                  label=fluid.layers.cast(x=label_marital_1,
-                                                                                          dtype='int64'))
+        pred_income = fluid.layers.clip(
+            output_layers[0], min=1e-15, max=1.0 - 1e-15)
+        pred_marital = fluid.layers.clip(
+            output_layers[1], min=1e-15, max=1.0 - 1e-15)
+
+        label_income_1 = fluid.layers.slice(
+            label_income, axes=[1], starts=[1], ends=[2])
+        label_marital_1 = fluid.layers.slice(
+            label_marital, axes=[1], starts=[1], ends=[2])
+
+        auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(
+            input=pred_income,
+            label=fluid.layers.cast(
+                x=label_income_1, dtype='int64'))
+        auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(
+            input=pred_marital,
+            label=fluid.layers.cast(
+                x=label_marital_1, dtype='int64'))
 
         if is_infer:
             self._infer_results["AUC_income"] = auc_income
             self._infer_results["AUC_marital"] = auc_marital
             return
 
-        cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True)
-        cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True)
+        cost_income = fluid.layers.cross_entropy(
+            input=pred_income, label=label_income, soft_label=True)
+        cost_marital = fluid.layers.cross_entropy(
+            input=pred_marital, label=label_marital, soft_label=True)
 
         cost = fluid.layers.elementwise_add(cost_income, cost_marital, axis=1)
         avg_cost = fluid.layers.mean(x=cost)
diff --git a/models/rank/__init__.py b/models/rank/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755
--- a/models/rank/__init__.py
+++ b/models/rank/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/models/rank/dcn/data/download.py b/models/rank/dcn/data/download.py
index d9bcc6df296068cfd5cd9fc1c91165f11b580d04..4203a3868a577757930ae848736c34bb4da376c7 100755
--- a/models/rank/dcn/data/download.py
+++ b/models/rank/dcn/data/download.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
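The comments in the `mmoe/model.py` hunk above give the MMoE formulas, but the graph ops (`elementwise_mul` with `axis=0`, then `reduce_sum` over `dim=1`) can be hard to read. Below is a minimal NumPy sketch of the same gate-mixing step; the batch size, expert count, and expert size are made-up illustration values, not taken from this patch:

```python
import numpy as np

# Hypothetical sizes, for illustration only.
batch, expert_num, expert_size = 4, 3, 8

# f_i(x): stacked expert outputs, shape (batch, expert_num, expert_size).
expert_concat = np.random.rand(batch, expert_num, expert_size)

# g^k(x): softmax gate weights over experts for one task k.
gate_logits = np.random.rand(batch, expert_num)
cur_gate = np.exp(gate_logits) / np.exp(gate_logits).sum(axis=1, keepdims=True)

# f^k(x) = sum_i g^k(x)_i * f_i(x): broadcast the gate over the expert
# dimension, then sum experts away; this mirrors the elementwise_mul +
# reduce_sum pair in the patch.
cur_gate_expert = expert_concat * cur_gate[:, :, np.newaxis]
task_input = cur_gate_expert.sum(axis=1)  # shape: (batch, expert_size)
```

Each task k gets its own gate, so the same experts are reused with different per-task mixtures; the share-bottom model in the same directory is the degenerate case with one shared bottom and no gating.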
+
 import os
 import sys
 import io
diff --git a/models/rank/dcn/data/get_slot_data.py b/models/rank/dcn/data/get_slot_data.py
index 77b30296ab9ba99757039f053c6133fd175c2811..96d4448214d6a87092495326646a279657079f45 100755
--- a/models/rank/dcn/data/get_slot_data.py
+++ b/models/rank/dcn/data/get_slot_data.py
@@ -26,8 +26,8 @@ from collections import Counter
 import os
 import paddle.fluid.incubate.data_generator as dg
 
-class TrainReader(dg.MultiSlotDataGenerator):
 
+class TrainReader(dg.MultiSlotDataGenerator):
     def __init__(self, config):
         dg.MultiSlotDataGenerator.__init__(self)
 
@@ -83,11 +83,11 @@ class TrainReader(dg.MultiSlotDataGenerator):
                     if idx == 2 else math.log(1 + float(features[idx])))
             for idx in self.cat_idx_:
                 if features[idx] == '' or features[
-                    idx] not in self.cat_feat_idx_dict_list[idx - 14]:
+                        idx] not in self.cat_feat_idx_dict_list[idx - 14]:
                     label_feat_list[idx].append(0)
                 else:
                     label_feat_list[idx].append(self.cat_feat_idx_dict_list[
-                        idx - 14][features[idx]])
+                        idx - 14][features[idx]])
         label_feat_list[0].append(int(features[0]))
         return label_feat_list
 
@@ -109,6 +109,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
 
         return data_iter
 
+
 reader = TrainReader("../config.yaml")
 reader.init()
 reader.run_from_stdin()
diff --git a/models/rank/dcn/data/preprocess.py b/models/rank/dcn/data/preprocess.py
index b356607729eedd73854a77449ffda3cc3bb8050f..9a89df10ef42dcfa09faad66f409b21439f340a8 100755
--- a/models/rank/dcn/data/preprocess.py
+++ b/models/rank/dcn/data/preprocess.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
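The `get_slot_data.py` hunk above keeps the usual Criteo convention of log-scaling dense count features with `math.log(1 + x)`, which compresses their heavy-tailed range before training. A small standalone sketch of that transform follows; the raw values are invented for illustration:

```python
import math

# Hypothetical raw Criteo-style dense counts; the real reader takes them
# from tab-separated stdin lines.
dense_values = [0, 3, 47, 12000]

# log(1 + x) maps the wide range of counts into a small, comparable scale
# while keeping zero at zero.
scaled = [math.log(1 + float(v)) for v in dense_values]
print(scaled)  # roughly [0.0, 1.39, 3.87, 9.39]
```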
+
 from __future__ import print_function, absolute_import, division
 
 import os
diff --git a/models/rank/dcn/model.py b/models/rank/dcn/model.py
index bf3e3051ca3ac92d1e354c6f59313ce496ff2921..67447fedefd180649bb018a3ea23aea216c9a2b4 100755
--- a/models/rank/dcn/model.py
+++ b/models/rank/dcn/model.py
@@ -25,12 +25,18 @@ class Model(ModelBase):
         ModelBase.__init__(self, config)
 
     def init_network(self):
-        self.cross_num = envs.get_global_env("hyper_parameters.cross_num", None, self._namespace)
-        self.dnn_hidden_units = envs.get_global_env("hyper_parameters.dnn_hidden_units", None, self._namespace)
-        self.l2_reg_cross = envs.get_global_env("hyper_parameters.l2_reg_cross", None, self._namespace)
-        self.dnn_use_bn = envs.get_global_env("hyper_parameters.dnn_use_bn", None, self._namespace)
-        self.clip_by_norm = envs.get_global_env("hyper_parameters.clip_by_norm", None, self._namespace)
-        cat_feat_num = envs.get_global_env("hyper_parameters.cat_feat_num", None, self._namespace)
+        self.cross_num = envs.get_global_env("hyper_parameters.cross_num",
+                                             None, self._namespace)
+        self.dnn_hidden_units = envs.get_global_env(
+            "hyper_parameters.dnn_hidden_units", None, self._namespace)
+        self.l2_reg_cross = envs.get_global_env(
+            "hyper_parameters.l2_reg_cross", None, self._namespace)
+        self.dnn_use_bn = envs.get_global_env("hyper_parameters.dnn_use_bn",
+                                              None, self._namespace)
+        self.clip_by_norm = envs.get_global_env(
+            "hyper_parameters.clip_by_norm", None, self._namespace)
+        cat_feat_num = envs.get_global_env("hyper_parameters.cat_feat_num",
+                                           None, self._namespace)
         self.sparse_inputs = self._sparse_data_var[1:]
         self.dense_inputs = self._dense_data_var
@@ -43,7 +49,8 @@ class Model(ModelBase):
             cat_feat_dims_dict[spls[0]] = int(spls[1])
         self.cat_feat_dims_dict = cat_feat_dims_dict if cat_feat_dims_dict else OrderedDict(
         )
-        self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", None, self._namespace)
+        self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
+                                             None, self._namespace)
 
         self.dense_feat_names = [i.name for i in self.dense_inputs]
         self.sparse_feat_names = [i.name for i in self.sparse_inputs]
@@ -55,16 +62,19 @@ class Model(ModelBase):
 
         self.net_input = None
         self.loss = None
-
+
     def _create_embedding_input(self):
         # sparse embedding
         sparse_emb_dict = OrderedDict()
         for var in self.sparse_inputs:
-            sparse_emb_dict[var.name] = fluid.embedding(input=var,
-                                                        size=[self.feat_dims_dict[var.name] + 1,
-                                                              6 * int(pow(self.feat_dims_dict[var.name], 0.25))
-                                                              ],is_sparse=self.is_sparse)
-
+            sparse_emb_dict[var.name] = fluid.embedding(
+                input=var,
+                size=[
+                    self.feat_dims_dict[var.name] + 1,
+                    6 * int(pow(self.feat_dims_dict[var.name], 0.25))
+                ],
+                is_sparse=self.is_sparse)
+
         # combine dense and sparse_emb
         dense_input_list = self.dense_inputs
         sparse_emb_list = list(sparse_emb_dict.values())
@@ -114,10 +124,11 @@ class Model(ModelBase):
     def train_net(self):
         self.model._init_slots()
         self.init_network()
-
+
         self.net_input = self._create_embedding_input()
-
-        deep_out = self._deep_net(self.net_input, self.dnn_hidden_units, self.dnn_use_bn, False)
+
+        deep_out = self._deep_net(self.net_input, self.dnn_hidden_units,
+                                  self.dnn_use_bn, False)
 
         cross_out, l2_reg_cross_loss = self._cross_net(self.net_input,
                                                        self.cross_num)
@@ -134,9 +145,11 @@ class Model(ModelBase):
             input=prob_2d, label=label_int, slide_steps=0)
         self._metrics["AUC"] = auc_var
         self._metrics["BATCH_AUC"] = batch_auc_var
-
+
         # logloss
-        logloss = fluid.layers.log_loss(self.prob, fluid.layers.cast(self.target_input, dtype='float32'))
+        logloss = fluid.layers.log_loss(
+            self.prob, fluid.layers.cast(
+                self.target_input, dtype='float32'))
         self.avg_logloss = fluid.layers.reduce_mean(logloss)
 
         # reg_coeff * l2_reg_cross
@@ -145,7 +158,8 @@ class Model(ModelBase):
         self._cost = self.loss
 
     def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
+        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
+                                            None, self._namespace)
         optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
         return optimizer
diff --git a/models/rank/deepfm/data/download_preprocess.py b/models/rank/deepfm/data/download_preprocess.py
index e8c94cc64728a5e3ae38a29bf419fc90b55df597..7a504b4f88e49d8b4f242d4d6b56f6f168464e5c 100755
--- a/models/rank/deepfm/data/download_preprocess.py
+++ b/models/rank/deepfm/data/download_preprocess.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import shutil
 import sys
diff --git a/models/rank/deepfm/data/get_slot_data.py b/models/rank/deepfm/data/get_slot_data.py
index 59dc33b0d7b6aa1b2087134c9952fc160ca6cd04..6177c990d8ef0c8a1cf922dd9d50c6419cb8c1b7 100755
--- a/models/rank/deepfm/data/get_slot_data.py
+++ b/models/rank/deepfm/data/get_slot_data.py
@@ -19,8 +19,9 @@ try:
     import cPickle as pickle
 except ImportError:
     import pickle
-class TrainReader(dg.MultiSlotDataGenerator):
 
+
+class TrainReader(dg.MultiSlotDataGenerator):
     def __init__(self, config):
         dg.MultiSlotDataGenerator.__init__(self)
 
@@ -44,7 +45,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
         self.categorical_range_ = range(14, 40)
         # load preprocessed feature dict
         self.feat_dict_name = "aid_data/feat_dict_10.pkl2"
-        self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb')) 
+        self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
 
     def _process_line(self, line):
         features = line.rstrip('\n').split('\t')
@@ -77,15 +78,18 @@ class TrainReader(dg.MultiSlotDataGenerator):
         def data_iter():
             feat_idx, feat_value, label = self._process_line(line)
             s = ""
-            for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]:
+            for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
+                      ('label', label)]:
                 k = i[0]
                 v = i[1]
                 for j in v:
                     s += " " + k + ":" + str(j)
             print s.strip()
             yield None
+
         return data_iter
 
+
 reader = TrainReader("../config.yaml")
 reader.init()
 reader.run_from_stdin()
diff --git a/models/rank/deepfm/data/preprocess.py b/models/rank/deepfm/data/preprocess.py
index 1fa4a5feae17bde64463d2f05beb3d053284dcda..9da3bdc3d93bfcd0dd98fddc64c870d20feddb38 100755
--- a/models/rank/deepfm/data/preprocess.py
+++ b/models/rank/deepfm/data/preprocess.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import numpy
 from collections import Counter
diff --git a/models/rank/deepfm/model.py b/models/rank/deepfm/model.py
index bfda02a21dcc7949b487ef074a783d2f24bcd1f5..0c87b8c869db27b055038e00fe9f6a2efbeb1e29 100755
--- a/models/rank/deepfm/model.py
+++ b/models/rank/deepfm/model.py
@@ -27,21 +27,26 @@ class Model(ModelBase):
     def deepfm_net(self):
         init_value_ = 0.1
         is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
-        sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace)
-        sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
+        sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number", None, self._namespace)
+        sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim", None, self._namespace)
 
         # ------------------------- network input --------------------------
 
-        num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace)
-
+        num_field = envs.get_global_env("hyper_parameters.num_field", None,
+                                        self._namespace)
+
         raw_feat_idx = self._sparse_data_var[1]
         raw_feat_value = self._dense_data_var[0]
         self.label = self._sparse_data_var[0]
-
+
         feat_idx = raw_feat_idx
-        feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1])  # None * num_field * 1
-
-        reg = envs.get_global_env("hyper_parameters.reg", 1e-4, self._namespace)
+        feat_value = fluid.layers.reshape(
+            raw_feat_value, [-1, num_field, 1])  # None * num_field * 1
+
+        reg = envs.get_global_env("hyper_parameters.reg", 1e-4,
+                                  self._namespace)
         first_weights_re = fluid.embedding(
             input=feat_idx,
             is_sparse=True,
@@ -55,7 +60,8 @@ class Model(ModelBase):
                 regularizer=fluid.regularizer.L1DecayRegularizer(reg)))
         first_weights = fluid.layers.reshape(
             first_weights_re, shape=[-1, num_field, 1])  # None * num_field * 1
-        y_first_order = fluid.layers.reduce_sum((first_weights * feat_value), 1)
+        y_first_order = fluid.layers.reduce_sum((first_weights * feat_value),
+                                                1)
 
         # ------------------------- second order term --------------------------
 
@@ -68,7 +74,8 @@ class Model(ModelBase):
             padding_idx=0,
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.TruncatedNormalInitializer(
-                    loc=0.0, scale=init_value_ / math.sqrt(float(sparse_feature_dim)))))
+                    loc=0.0,
+                    scale=init_value_ / math.sqrt(float(sparse_feature_dim)))))
         feat_embeddings = fluid.layers.reshape(
             feat_embeddings_re,
             shape=[-1, num_field,
@@ -76,8 +83,8 @@ class Model(ModelBase):
         feat_embeddings = feat_embeddings * feat_value  # None * num_field * embedding_size
 
         # sum_square part
-        summed_features_emb = fluid.layers.reduce_sum(feat_embeddings,
-                                                      1)  # None * embedding_size
+        summed_features_emb = fluid.layers.reduce_sum(
+            feat_embeddings, 1)  # None * embedding_size
         summed_features_emb_square = fluid.layers.square(
             summed_features_emb)  # None * embedding_size
 
@@ -88,13 +95,16 @@ class Model(ModelBase):
             squared_features_emb, 1)  # None * embedding_size
 
         y_second_order = 0.5 * fluid.layers.reduce_sum(
-            summed_features_emb_square - squared_sum_features_emb, 1,
+            summed_features_emb_square - squared_sum_features_emb,
+            1,
             keep_dim=True)  # None * 1
 
         # ------------------------- DNN --------------------------
 
-        layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
-        act = envs.get_global_env("hyper_parameters.act", None, self._namespace)
+        layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None,
+                                          self._namespace)
+        act = envs.get_global_env("hyper_parameters.act", None,
+                                  self._namespace)
         y_dnn = fluid.layers.reshape(feat_embeddings,
                                      [-1, num_field * sparse_feature_dim])
         for s in layer_sizes:
@@ -121,7 +131,8 @@ class Model(ModelBase):
 
         # ------------------------- DeepFM --------------------------
 
-        self.predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn)
+        self.predict = fluid.layers.sigmoid(y_first_order + y_second_order +
+                                            y_dnn)
 
     def train_net(self):
         self.model._init_slots()
@@ -129,7 +140,8 @@ class Model(ModelBase):
 
         # ------------------------- Cost(logloss) --------------------------
 
-        cost = fluid.layers.log_loss(input=self.predict, label=fluid.layers.cast(self.label, "float32"))
+        cost = fluid.layers.log_loss(
+            input=self.predict, label=fluid.layers.cast(self.label, "float32"))
         avg_cost = fluid.layers.reduce_sum(cost)
         self._cost = avg_cost
@@ -145,7 +157,8 @@ class Model(ModelBase):
         self._metrics["BATCH_AUC"] = batch_auc_var
 
     def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
+        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
+                                            None, self._namespace)
         optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
         return optimizer
diff --git a/models/rank/din/data/build_dataset.py b/models/rank/din/data/build_dataset.py
index 34c053ccdb2686c10875740f72f1e0abf3cb4f10..b0ed187800b2f9f44d4dd0d34df204759059ac06 100755
--- a/models/rank/din/data/build_dataset.py
+++ b/models/rank/din/data/build_dataset.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function
 import random
 import pickle
diff --git a/models/rank/din/data/convert_pd.py b/models/rank/din/data/convert_pd.py
index d7927c7ef1a9da28732cad9c44be24e72095983a..a66290e1561084a10756ab98c3d70b9a5ac5a6ed 100755
--- a/models/rank/din/data/convert_pd.py
+++ b/models/rank/din/data/convert_pd.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function
 import pickle
 import pandas as pd
diff --git a/models/rank/din/data/remap_id.py b/models/rank/din/data/remap_id.py
index b110dac54de8f8d201ede7248d6a2844ac350c90..ee6983d7f0769a58352f61a0a05bbd81c6ccbc13 100755
--- a/models/rank/din/data/remap_id.py
+++ b/models/rank/din/data/remap_id.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function
 import random
 import pickle
diff --git a/models/rank/din/model.py b/models/rank/din/model.py
index 2abc658b6d5cb58aaff222e1121d2c4282bcd65f..c2acbe66b6c704655cf9a5aff86d583233672f6c 100755
--- a/models/rank/din/model.py
+++ b/models/rank/din/model.py
@@ -21,14 +21,14 @@ from paddlerec.core.model import Model as ModelBase
 class Model(ModelBase):
     def __init__(self, config):
         ModelBase.__init__(self, config)
-
+
     def config_read(self, config_path):
         with open(config_path, "r") as fin:
             user_count = int(fin.readline().strip())
             item_count = int(fin.readline().strip())
             cat_count = int(fin.readline().strip())
         return user_count, item_count, cat_count
-
+
     def din_attention(self, hist, target_expand, mask):
         """activation weight"""
 
@@ -58,56 +58,66 @@ class Model(ModelBase):
         out = fluid.layers.matmul(weight, hist)
         out = fluid.layers.reshape(x=out, shape=[0, hidden_size])
         return out
-
+
     def train_net(self):
         seq_len = -1
-        self.item_emb_size = envs.get_global_env("hyper_parameters.item_emb_size", 64, self._namespace)
-        self.cat_emb_size = envs.get_global_env("hyper_parameters.cat_emb_size", 64, self._namespace)
-        self.act = envs.get_global_env("hyper_parameters.act", "sigmoid", self._namespace)
+        self.item_emb_size = envs.get_global_env(
+            "hyper_parameters.item_emb_size", 64, self._namespace)
+        self.cat_emb_size = envs.get_global_env(
+            "hyper_parameters.cat_emb_size", 64, self._namespace)
+        self.act = envs.get_global_env("hyper_parameters.act", "sigmoid",
+                                       self._namespace)
         #item_emb_size = 64
         #cat_emb_size = 64
-        self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", False, self._namespace)
+        self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
+                                             False, self._namespace)
         #significant for speeding up the training process
-        self.config_path = envs.get_global_env("hyper_parameters.config_path", "data/config.txt", self._namespace)
-        self.use_DataLoader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace)
+        self.config_path = envs.get_global_env(
+            "hyper_parameters.config_path", "data/config.txt", self._namespace)
+        self.use_DataLoader = envs.get_global_env(
+            "hyper_parameters.use_DataLoader", False, self._namespace)
 
         user_count, item_count, cat_count = self.config_read(self.config_path)
-
         item_emb_attr = fluid.ParamAttr(name="item_emb")
         cat_emb_attr = fluid.ParamAttr(name="cat_emb")
 
         hist_item_seq = fluid.data(
             name="hist_item_seq", shape=[None, seq_len], dtype="int64")
         self._data_var.append(hist_item_seq)
-
+
         hist_cat_seq = fluid.data(
             name="hist_cat_seq", shape=[None, seq_len], dtype="int64")
         self._data_var.append(hist_cat_seq)
-
-        target_item = fluid.data(name="target_item", shape=[None], dtype="int64")
+
+        target_item = fluid.data(
+            name="target_item", shape=[None], dtype="int64")
         self._data_var.append(target_item)
-
+
         target_cat = fluid.data(name="target_cat", shape=[None], dtype="int64")
         self._data_var.append(target_cat)
-
+
         label = fluid.data(name="label", shape=[None, 1], dtype="float32")
         self._data_var.append(label)
-
-        mask = fluid.data(name="mask", shape=[None, seq_len, 1], dtype="float32")
+
+        mask = fluid.data(
+            name="mask", shape=[None, seq_len, 1], dtype="float32")
         self._data_var.append(mask)
-
+
         target_item_seq = fluid.data(
             name="target_item_seq", shape=[None, seq_len], dtype="int64")
         self._data_var.append(target_item_seq)
-
+
         target_cat_seq = fluid.data(
             name="target_cat_seq", shape=[None, seq_len], dtype="int64")
         self._data_var.append(target_cat_seq)
 
         if self.use_DataLoader:
             self._data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=self._data_var, capacity=10000, use_double_buffer=False, iterable=False)
-
+                feed_list=self._data_var,
+                capacity=10000,
+                use_double_buffer=False,
+                iterable=False)
+
         hist_item_emb = fluid.embedding(
             input=hist_item_seq,
             size=[item_count, self.item_emb_size],
@@ -149,7 +159,8 @@ class Model(ModelBase):
             size=[item_count, 1],
             param_attr=fluid.initializer.Constant(value=0.0))
 
-        hist_seq_concat = fluid.layers.concat([hist_item_emb, hist_cat_emb], axis=2)
+        hist_seq_concat = fluid.layers.concat(
+            [hist_item_emb, hist_cat_emb], axis=2)
         target_seq_concat = fluid.layers.concat(
             [target_item_seq_emb, target_cat_seq_emb], axis=2)
         target_concat = fluid.layers.concat(
@@ -157,21 +168,22 @@ class Model(ModelBase):
 
         out = self.din_attention(hist_seq_concat, target_seq_concat, mask)
         out_fc = fluid.layers.fc(name="out_fc",
-                                    input=out,
-                                    size=self.item_emb_size + self.cat_emb_size,
-                                    num_flatten_dims=1)
+                                 input=out,
+                                 size=self.item_emb_size + self.cat_emb_size,
+                                 num_flatten_dims=1)
         embedding_concat = fluid.layers.concat([out_fc, target_concat], axis=1)
 
         fc1 = fluid.layers.fc(name="fc1",
-                                 input=embedding_concat,
-                                 size=80,
-                                 act=self.act)
+                              input=embedding_concat,
+                              size=80,
+                              act=self.act)
         fc2 = fluid.layers.fc(name="fc2", input=fc1, size=40, act=self.act)
         fc3 = fluid.layers.fc(name="fc3", input=fc2, size=1)
         logit = fc3 + item_b
 
-        loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logit, label=label)
-
+        loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+            x=logit, label=label)
+
         avg_loss = fluid.layers.mean(loss)
         self._cost = avg_loss
@@ -179,14 +191,14 @@ class Model(ModelBase):
         predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
         label_int = fluid.layers.cast(label, 'int64')
         auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d,
-                                                        label=label_int,
-                                                        slide_steps=0)
+                                                     label=label_int,
+                                                     slide_steps=0)
         self._metrics["AUC"] = auc_var
         self._metrics["BATCH_AUC"] = batch_auc_var
-
     def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
+        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
+                                            None, self._namespace)
         optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
         return optimizer
diff --git a/models/rank/din/reader.py b/models/rank/din/reader.py
index 39ed690fdc6fd35d50ebdcb46b5becc5ae399b62..aba06141da6c60beb81ea446d3c7e7dc8a731df9 100755
--- a/models/rank/din/reader.py
+++ b/models/rank/din/reader.py
@@ -29,13 +29,15 @@ from paddlerec.core.utils import envs
 
 class TrainReader(Reader):
     def init(self):
-        self.train_data_path = envs.get_global_env("train_data_path", None, "train.reader")
+        self.train_data_path = envs.get_global_env("train_data_path", None,
+                                                   "train.reader")
         self.res = []
         self.max_len = 0
 
         data_file_list = os.listdir(self.train_data_path)
         for i in range(0, len(data_file_list)):
-            train_data_file = os.path.join(self.train_data_path, data_file_list[i])
+            train_data_file = os.path.join(self.train_data_path,
+                                           data_file_list[i])
             with open(train_data_file, "r") as fin:
                 for line in fin:
                     line = line.strip().split(';')
@@ -78,11 +80,13 @@ class TrainReader(Reader):
         len_array = [len(x[0]) for x in b]
         mask = np.array(
             [[0] * x + [-1e9] * (max_len - x) for x in len_array]).reshape(
-            [-1, max_len, 1])
+                [-1, max_len, 1])
         target_item_seq = np.array(
-            [[x[2]] * max_len for x in b]).astype("int64").reshape([-1, max_len])
+            [[x[2]] * max_len for x in b]).astype("int64").reshape(
+                [-1, max_len])
         target_cat_seq = np.array(
-            [[x[3]] * max_len for x in b]).astype("int64").reshape([-1, max_len])
+            [[x[3]] * max_len for x in b]).astype("int64").reshape(
+                [-1, max_len])
         res = []
         for i in range(len(b)):
             res.append([
@@ -127,4 +131,5 @@ class TrainReader(Reader):
     def generate_batch_from_trainfiles(self, files):
         data_set = self.base_read(files)
         random.shuffle(data_set)
-        return self.batch_reader(data_set, self.batch_size, self.batch_size * 20)
+        return self.batch_reader(data_set, self.batch_size,
+                                 self.batch_size * 20)
diff --git a/models/rank/dnn/data/get_slot_data.py b/models/rank/dnn/data/get_slot_data.py
index 30ad9884e5b3c4cd600e8273b9d061bfe1398c9e..f52447d06c297335685a704f688d71aa871328bc 100755
--- a/models/rank/dnn/data/get_slot_data.py
+++ b/models/rank/dnn/data/get_slot_data.py
@@ -32,6 +32,7 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
         """
         Read the data line by line and process it as a dictionary
         """
+
         def reader():
             """
             This function needs to be implemented by the user, based on data format
@@ -57,11 +58,12 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
             feature_name.append("label")
             s = "click:" + str(label[0])
             for i in dense_feature:
-                s += " dense_feature:" + str(i) 
+                s += " dense_feature:" + str(i)
             for i in range(1, 1 + len(categorical_range_)):
-                s += " " + str(i) + ":" + str(sparse_feature[i-1][0])
+                s += " " + str(i) + ":" + str(sparse_feature[i - 1][0])
             print s.strip()
             yield None
+
         return reader
diff --git a/models/rank/dnn/model.py b/models/rank/dnn/model.py
index 3a61d288b40545619a49e81df1f6160670a6a0c1..d7ab801f38fdffbdeb0ca5259abaec37136d3fc9 100755
--- a/models/rank/dnn/model.py
+++ b/models/rank/dnn/model.py
@@ -31,8 +31,10 @@ class Model(ModelBase):
 
     def net(self):
         is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
-        sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace)
-        sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace)
+        sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number", None, self._namespace)
+        sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim", None, self._namespace)
 
         def embedding_layer(input):
             emb = fluid.layers.embedding(
@@ -42,25 +44,27 @@ class Model(ModelBase):
                 size=[sparse_feature_number, sparse_feature_dim],
                 param_attr=fluid.ParamAttr(
                     name="SparseFeatFactors",
-                    initializer=fluid.initializer.Uniform()),
-            )
-            emb_sum = fluid.layers.sequence_pool(
-                input=emb, pool_type='sum')
+                    initializer=fluid.initializer.Uniform()), )
+            emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
             return emb_sum
 
         def fc(input, output_size):
             output = fluid.layers.fc(
-                input=input, size=output_size,
-                act='relu', param_attr=fluid.ParamAttr(
+                input=input,
+                size=output_size,
+                act='relu',
+                param_attr=fluid.ParamAttr(
                     initializer=fluid.initializer.Normal(
                         scale=1.0 / math.sqrt(input.shape[1]))))
             return output
 
         sparse_embed_seq = list(map(embedding_layer, self.sparse_inputs))
-        concated = fluid.layers.concat(sparse_embed_seq + [self.dense_input], axis=1)
+        concated = fluid.layers.concat(
+            sparse_embed_seq + [self.dense_input], axis=1)
 
         fcs = [concated]
-        hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace)
+        hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
+                                            self._namespace)
 
         for size in hidden_layers:
             fcs.append(fc(fcs[-1], size))
@@ -75,14 +79,15 @@ class Model(ModelBase):
         self.predict = predict
 
     def avg_loss(self):
-        cost = fluid.layers.cross_entropy(input=self.predict, label=self.label_input)
+        cost = fluid.layers.cross_entropy(
+            input=self.predict, label=self.label_input)
         avg_cost = fluid.layers.reduce_mean(cost)
         self._cost = avg_cost
 
     def metrics(self):
         auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
                                              label=self.label_input,
-                                             num_thresholds=2 ** 12,
+                                             num_thresholds=2**12,
                                              slide_steps=20)
         self._metrics["AUC"] = auc
         self._metrics["BATCH_AUC"] = batch_auc
@@ -95,7 +100,8 @@ class Model(ModelBase):
         self.metrics()
 
     def optimizer(self):
-        learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace)
+        learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
+                                            None, self._namespace)
         optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True)
         return optimizer
diff --git a/models/rank/wide_deep/data/data_preparation.py b/models/rank/wide_deep/data/data_preparation.py
index cdd8d4d7817e8312fe76f4038c6554eb557a2ff1..885070096cd3fd084e9695919121f782505b9e77 100644
--- a/models/rank/wide_deep/data/data_preparation.py
+++ b/models/rank/wide_deep/data/data_preparation.py
@@ -1,10 +1,25 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
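`embedding_layer` in the `models/rank/dnn/model.py` hunk above looks up one vector per sparse feature id and then sum-pools within each slot, so a variable-length bag of ids becomes a fixed-size input for the FC stack. A plain-NumPy sketch of that pooling, with invented ids and dimensions rather than values from the patch:

```python
import numpy as np

# Hypothetical embedding table: a vocabulary of 1000 ids, 8-dim vectors.
emb_table = np.random.rand(1000, 8)

# One slot of one sample can hold any number of feature ids; sum pooling
# (sequence_pool with pool_type='sum') collapses them to a single vector.
slot_ids = [17, 42, 999]
emb_sum = emb_table[slot_ids].sum(axis=0)  # shape: (8,)

# Each slot contributes one pooled vector; the model concatenates them
# with the dense input before the fully connected layers.
```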
+ import os import io import args import pandas as pd -from sklearn import preprocessing +from sklearn import preprocessing + -def _clean_file(source_path,target_path): +def _clean_file(source_path, target_path): """makes changes to match the CSV format.""" with io.open(source_path, 'r') as temp_eval_file: with io.open(target_path, 'w') as eval_file: @@ -17,15 +32,16 @@ def _clean_file(source_path,target_path): line = line[:-1] line += '\n' eval_file.write(line) - + + def build_model_columns(train_data_path, test_data_path): # The column names are from # https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html column_names = [ - 'age', 'workclass', 'fnlwgt', 'education', 'education_num', - 'marital_status', 'occupation', 'relationship', 'race', 'gender', - 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', - 'income_bracket' + 'age', 'workclass', 'fnlwgt', 'education', 'education_num', + 'marital_status', 'occupation', 'relationship', 'race', 'gender', + 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', + 'income_bracket' ] # Load the dataset in Pandas @@ -44,61 +60,92 @@ def build_model_columns(train_data_path, test_data_path): # First group of tasks according to the paper #label_columns = ['income_50k', 'marital_stat'] - categorical_columns = ['education','marital_status','relationship','workclass','occupation'] + categorical_columns = [ + 'education', 'marital_status', 'relationship', 'workclass', + 'occupation' + ] for col in categorical_columns: label_train = preprocessing.LabelEncoder() - train_df[col]= label_train.fit_transform(train_df[col]) + train_df[col] = label_train.fit_transform(train_df[col]) label_test = preprocessing.LabelEncoder() - test_df[col]= label_test.fit_transform(test_df[col]) - - bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65] - train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(), bins,labels=False) - test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(), bins,labels=False) - - base_columns = ['education', 'marital_status', 'relationship', 'workclass', 'occupation', 'age_buckets'] - - train_df['education_occupation'] = train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str) - test_df['education_occupation'] = test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str) - train_df['age_buckets_education_occupation'] = train_df['age_buckets'].astype(str) + '_' + train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str) - test_df['age_buckets_education_occupation'] = test_df['age_buckets'].astype(str) + '_' + test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str) - crossed_columns = ['education_occupation','age_buckets_education_occupation'] - + test_df[col] = label_test.fit_transform(test_df[col]) + + bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65] + train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(), + bins, + labels=False) + test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(), + bins, + labels=False) + + base_columns = [ + 'education', 'marital_status', 'relationship', 'workclass', + 'occupation', 'age_buckets' + ] + + train_df['education_occupation'] = train_df['education'].astype( + str) + '_' + train_df['occupation'].astype(str) + test_df['education_occupation'] = test_df['education'].astype( + str) + '_' + test_df['occupation'].astype(str) + train_df['age_buckets_education_occupation'] = train_df[ + 'age_buckets'].astype(str) + '_' + 
train_df['education'].astype( + str) + '_' + train_df['occupation'].astype(str) + test_df['age_buckets_education_occupation'] = test_df[ + 'age_buckets'].astype(str) + '_' + test_df['education'].astype( + str) + '_' + test_df['occupation'].astype(str) + crossed_columns = [ + 'education_occupation', 'age_buckets_education_occupation' + ] + for col in crossed_columns: label_train = preprocessing.LabelEncoder() - train_df[col]= label_train.fit_transform(train_df[col]) + train_df[col] = label_train.fit_transform(train_df[col]) label_test = preprocessing.LabelEncoder() - test_df[col]= label_test.fit_transform(test_df[col]) - + test_df[col] = label_test.fit_transform(test_df[col]) + wide_columns = base_columns + crossed_columns - - train_df_temp = pd.get_dummies(train_df[categorical_columns],columns=categorical_columns) - test_df_temp = pd.get_dummies(test_df[categorical_columns], columns=categorical_columns) + + train_df_temp = pd.get_dummies( + train_df[categorical_columns], columns=categorical_columns) + test_df_temp = pd.get_dummies( + test_df[categorical_columns], columns=categorical_columns) train_df = train_df.join(train_df_temp) test_df = test_df.join(test_df_temp) - - deep_columns = list(train_df_temp.columns)+ ['age','education_num','capital_gain','capital_loss','hours_per_week'] - - train_df['label'] = train_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0) - test_df['label'] = test_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0) - - with io.open('train_data/columns.txt','w') as f: - write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n' + + deep_columns = list(train_df_temp.columns) + [ + 'age', 'education_num', 'capital_gain', 'capital_loss', + 'hours_per_week' + ] + + train_df['label'] = train_df['income_bracket'].apply( + lambda x: 1 if x == '>50K' else 0) + test_df['label'] = test_df['income_bracket'].apply( + lambda x: 1 if x == '>50K' else 0) + + with io.open('train_data/columns.txt', 'w') as f: + write_str = str(len(wide_columns)) + '\n' + str(len( + deep_columns)) + '\n' f.write(write_str) f.close() - with io.open('test_data/columns.txt','w') as f: - write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n' + with io.open('test_data/columns.txt', 'w') as f: + write_str = str(len(wide_columns)) + '\n' + str(len( + deep_columns)) + '\n' f.write(write_str) f.close() - - train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(train_data_path,index=False) - test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(test_data_path,index=False) + + train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv( + train_data_path, index=False) + test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv( + test_data_path, index=False) def clean_file(train_path, test_path, train_data_path, test_data_path): _clean_file(train_path, train_data_path) _clean_file(test_path, test_data_path) + if __name__ == '__main__': args = args.parse_args() - clean_file(args.train_path, args.test_path, args.train_data_path, args.test_data_path) + clean_file(args.train_path, args.test_path, args.train_data_path, + args.test_data_path) build_model_columns(args.train_data_path, args.test_data_path) diff --git a/models/rank/wide_deep/data/get_slot_data.py b/models/rank/wide_deep/data/get_slot_data.py index b928ae1267113215aa2b71f8dccffddc0db048fb..831d05665b01649f22a3270ec949ebda2941928d 100755 --- a/models/rank/wide_deep/data/get_slot_data.py +++ b/models/rank/wide_deep/data/get_slot_data.py @@ -20,6 +20,7 @@ except 
ImportError: import pickle import paddle.fluid.incubate.data_generator as dg + class TrainReader(dg.MultiSlotDataGenerator): def __init__(self, config): dg.MultiSlotDataGenerator.__init__(self) @@ -50,7 +51,8 @@ class TrainReader(dg.MultiSlotDataGenerator): wide_feat, deep_deat, label = self._process_line(line) s = "" - for i in [('wide_input', wide_feat), ('deep_input', deep_deat), ('label', label)]: + for i in [('wide_input', wide_feat), ('deep_input', deep_deat), + ('label', label)]: k = i[0] v = i[1] for j in v: @@ -60,6 +62,7 @@ class TrainReader(dg.MultiSlotDataGenerator): return data_iter + reader = TrainReader("../config.yaml") reader.init() reader.run_from_stdin() diff --git a/models/rank/wide_deep/model.py b/models/rank/wide_deep/model.py index 27eb5e1f0c1588d7634407a3dcd250726dea28bb..a7d51d958c55f0368cdd7f9ff7baa51dd25a6f76 100755 --- a/models/rank/wide_deep/model.py +++ b/models/rank/wide_deep/model.py @@ -25,27 +25,27 @@ class Model(ModelBase): ModelBase.__init__(self, config) def wide_part(self, data): - out = fluid.layers.fc(input=data, - size=1, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, - scale=1.0 / math.sqrt( - data.shape[ - 1])), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4)), - act=None, - name='wide') + out = fluid.layers.fc( + input=data, + size=1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.TruncatedNormal( + loc=0.0, scale=1.0 / math.sqrt(data.shape[1])), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4)), + act=None, + name='wide') return out def fc(self, data, hidden_units, active, tag): - output = fluid.layers.fc(input=data, - size=hidden_units, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, - scale=1.0 / math.sqrt( - data.shape[ - 1]))), - act=active, - name=tag) + output = fluid.layers.fc( + input=data, + size=hidden_units, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.TruncatedNormal( + loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))), + act=active, + name=tag) return output @@ -62,43 +62,63 @@ class Model(ModelBase): deep_input = self._dense_data_var[1] label = self._sparse_data_var[0] - hidden1_units = envs.get_global_env("hyper_parameters.hidden1_units", 75, self._namespace) - hidden2_units = envs.get_global_env("hyper_parameters.hidden2_units", 50, self._namespace) - hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units", 25, self._namespace) + hidden1_units = envs.get_global_env("hyper_parameters.hidden1_units", + 75, self._namespace) + hidden2_units = envs.get_global_env("hyper_parameters.hidden2_units", + 50, self._namespace) + hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units", + 25, self._namespace) wide_output = self.wide_part(wide_input) - deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units, hidden3_units) - - wide_model = fluid.layers.fc(input=wide_output, - size=1, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)), - act=None, - name='w_wide') - - deep_model = fluid.layers.fc(input=deep_output, - size=1, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)), - act=None, - name='w_deep') + deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units, + hidden3_units) + + wide_model = fluid.layers.fc( + input=wide_output, + size=1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.TruncatedNormal( + loc=0.0, 
scale=1.0)), + act=None, + name='w_wide') + + deep_model = fluid.layers.fc( + input=deep_output, + size=1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.TruncatedNormal( + loc=0.0, scale=1.0)), + act=None, + name='w_deep') prediction = fluid.layers.elementwise_add(wide_model, deep_model) - pred = fluid.layers.sigmoid(fluid.layers.clip(prediction, min=-15.0, max=15.0), name="prediction") + pred = fluid.layers.sigmoid( + fluid.layers.clip( + prediction, min=-15.0, max=15.0), + name="prediction") num_seqs = fluid.layers.create_tensor(dtype='int64') - acc = fluid.layers.accuracy(input=pred, label=fluid.layers.cast(x=label, dtype='int64'), total=num_seqs) - auc_var, batch_auc, auc_states = fluid.layers.auc(input=pred, label=fluid.layers.cast(x=label, dtype='int64')) + acc = fluid.layers.accuracy( + input=pred, + label=fluid.layers.cast( + x=label, dtype='int64'), + total=num_seqs) + auc_var, batch_auc, auc_states = fluid.layers.auc( + input=pred, label=fluid.layers.cast( + x=label, dtype='int64')) self._metrics["AUC"] = auc_var self._metrics["BATCH_AUC"] = batch_auc self._metrics["ACC"] = acc - cost = fluid.layers.sigmoid_cross_entropy_with_logits(x=prediction, label=fluid.layers.cast(label, dtype='float32')) + cost = fluid.layers.sigmoid_cross_entropy_with_logits( + x=prediction, label=fluid.layers.cast( + label, dtype='float32')) avg_cost = fluid.layers.mean(cost) self._cost = avg_cost def optimizer(self): - learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) + learning_rate = envs.get_global_env("hyper_parameters.learning_rate", + None, self._namespace) optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) return optimizer diff --git a/models/rank/xdeepfm/data/download.py b/models/rank/xdeepfm/data/download.py index 4afd1ce28ec1ba99006414c6b5116178b8b28142..e46a9ced4a69339f5c5f6c45067d34bbbfa39469 100755 --- a/models/rank/xdeepfm/data/download.py +++ b/models/rank/xdeepfm/data/download.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
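For reference, the wide_deep/model.py hunks above sum a wide logit and a deep logit, clip the sum to [-15, 15], and apply a sigmoid, while the loss is computed from the raw logit. A minimal NumPy sketch of that arithmetic (illustrative only, not code from this patch; the helper names are hypothetical):

import numpy as np

def wide_deep_head(wide_logit, deep_logit):
    # elementwise_add -> clip to [-15, 15] -> sigmoid, mirroring model.py
    logit = wide_logit + deep_logit
    return 1.0 / (1.0 + np.exp(-np.clip(logit, -15.0, 15.0)))

def bce_with_logits(logit, label):
    # numerically stable sigmoid cross-entropy; note the patch feeds the
    # *unclipped* logit to the loss and clips only the reported prediction
    return np.maximum(logit, 0) - logit * label + np.log1p(np.exp(-np.abs(logit)))

logits = np.array([3.2, -0.7])
labels = np.array([1.0, 0.0])
print(wide_deep_head(logits, np.zeros_like(logits)), bce_with_logits(logits, labels))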
+ import os import shutil import sys diff --git a/models/rank/xdeepfm/data/get_slot_data.py b/models/rank/xdeepfm/data/get_slot_data.py index d71444135c2198b426638bc4b2665ec053acb2aa..4426e9647c080dce5debdcdbc3e039ac69a69935 100755 --- a/models/rank/xdeepfm/data/get_slot_data.py +++ b/models/rank/xdeepfm/data/get_slot_data.py @@ -21,6 +21,7 @@ except ImportError: import pickle import paddle.fluid.incubate.data_generator as dg + class TrainReader(dg.MultiSlotDataGenerator): def __init__(self, config): dg.MultiSlotDataGenerator.__init__(self) @@ -48,7 +49,8 @@ class TrainReader(dg.MultiSlotDataGenerator): feat_idx, feat_value, label = self._process_line(line) s = "" - for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]: + for i in [('feat_idx', feat_idx), ('feat_value', feat_value), + ('label', label)]: k = i[0] v = i[1] for j in v: @@ -58,6 +60,7 @@ class TrainReader(dg.MultiSlotDataGenerator): return data_iter + reader = TrainReader("../config.yaml") reader.init() reader.run_from_stdin() diff --git a/models/rank/xdeepfm/model.py b/models/rank/xdeepfm/model.py index 059e83d40290d713e9bc5f25f59fbb285e5a855a..d1045897d9cb4ca5b7018a1dcb9da726829c4744 100755 --- a/models/rank/xdeepfm/model.py +++ b/models/rank/xdeepfm/model.py @@ -28,18 +28,22 @@ class Model(ModelBase): loc=0.0, scale=init_value_) is_distributed = True if envs.get_trainer() == "CtrTrainer" else False - sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace) - sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) + sparse_feature_number = envs.get_global_env( + "hyper_parameters.sparse_feature_number", None, self._namespace) + sparse_feature_dim = envs.get_global_env( + "hyper_parameters.sparse_feature_dim", None, self._namespace) # ------------------------- network input -------------------------- - num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace) + num_field = envs.get_global_env("hyper_parameters.num_field", None, + self._namespace) raw_feat_idx = self._sparse_data_var[1] raw_feat_value = self._dense_data_var[0] self.label = self._sparse_data_var[0] feat_idx = raw_feat_idx - feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1]) # None * num_field * 1 + feat_value = fluid.layers.reshape( + raw_feat_value, [-1, num_field, 1]) # None * num_field * 1 feat_embeddings = fluid.embedding( input=feat_idx, @@ -48,9 +52,9 @@ class Model(ModelBase): size=[sparse_feature_number + 1, sparse_feature_dim], padding_idx=0, param_attr=fluid.ParamAttr(initializer=initer)) - feat_embeddings = fluid.layers.reshape( - feat_embeddings, - [-1, num_field, sparse_feature_dim]) # None * num_field * embedding_size + feat_embeddings = fluid.layers.reshape(feat_embeddings, [ + -1, num_field, sparse_feature_dim + ]) # None * num_field * embedding_size feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size # -------------------- linear -------------------- @@ -73,7 +77,8 @@ class Model(ModelBase): # -------------------- CIN -------------------- - layer_sizes_cin = envs.get_global_env("hyper_parameters.layer_sizes_cin", None, self._namespace) + layer_sizes_cin = envs.get_global_env( + "hyper_parameters.layer_sizes_cin", None, self._namespace) Xs = [feat_embeddings] last_s = num_field for s in layer_sizes_cin: @@ -84,7 +89,8 @@ class Model(ModelBase): 1]) # None, embedding_size, num_field, 1 X_k = fluid.layers.reshape( fluid.layers.transpose(Xs[-1], [0, 
2, 1]), - [-1, sparse_feature_dim, 1, last_s]) # None, embedding_size, 1, last_s + [-1, sparse_feature_dim, 1, + last_s]) # None, embedding_size, 1, last_s Z_k_1 = fluid.layers.matmul( X_0, X_k) # None, embedding_size, num_field, last_s @@ -124,16 +130,19 @@ class Model(ModelBase): # -------------------- DNN -------------------- - layer_sizes_dnn = envs.get_global_env("hyper_parameters.layer_sizes_dnn", None, self._namespace) - act = envs.get_global_env("hyper_parameters.act", None, self._namespace) + layer_sizes_dnn = envs.get_global_env( + "hyper_parameters.layer_sizes_dnn", None, self._namespace) + act = envs.get_global_env("hyper_parameters.act", None, + self._namespace) y_dnn = fluid.layers.reshape(feat_embeddings, [-1, num_field * sparse_feature_dim]) for s in layer_sizes_dnn: - y_dnn = fluid.layers.fc(input=y_dnn, - size=s, - act=act, - param_attr=fluid.ParamAttr(initializer=initer), - bias_attr=None) + y_dnn = fluid.layers.fc( + input=y_dnn, + size=s, + act=act, + param_attr=fluid.ParamAttr(initializer=initer), + bias_attr=None) y_dnn = fluid.layers.fc(input=y_dnn, size=1, act=None, @@ -148,7 +157,10 @@ class Model(ModelBase): self.model._init_slots() self.xdeepfm_net() - cost = fluid.layers.log_loss(input=self.predict, label=fluid.layers.cast(self.label, "float32"), epsilon=0.0000001) + cost = fluid.layers.log_loss( + input=self.predict, + label=fluid.layers.cast(self.label, "float32"), + epsilon=0.0000001) batch_cost = fluid.layers.reduce_mean(cost) self._cost = batch_cost @@ -162,7 +174,8 @@ class Model(ModelBase): self._metrics["BATCH_AUC"] = batch_auc_var def optimizer(self): - learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) + learning_rate = envs.get_global_env("hyper_parameters.learning_rate", + None, self._namespace) optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) return optimizer diff --git a/models/recall/__init__.py b/models/recall/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755 --- a/models/recall/__init__.py +++ b/models/recall/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
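For reference, the CIN block in xdeepfm/model.py above reshapes X_0 to (None, embedding_size, num_field, 1) and X_k to (None, embedding_size, 1, last_s) so that a single matmul produces every pairwise field interaction per embedding dimension. A minimal NumPy sketch of that interaction tensor (shapes are arbitrary; illustrative only, not code from this patch):

import numpy as np

batch, num_field, dim, last_s = 2, 3, 4, 5
X0 = np.random.rand(batch, num_field, dim)   # feat_embeddings
Xk = np.random.rand(batch, last_s, dim)      # previous CIN layer

# Z[b, d, i, j] = X0[b, i, d] * Xk[b, j, d]: an outer product over fields,
# taken independently for each embedding dimension d
Z = np.einsum('bid,bjd->bdij', X0, Xk)       # (batch, dim, num_field, last_s)

# the same tensor via the reshape/matmul route used in model.py
Z2 = np.matmul(X0.transpose(0, 2, 1)[..., None],      # (batch, dim, num_field, 1)
               Xk.transpose(0, 2, 1)[:, :, None, :])  # (batch, dim, 1, last_s)
assert np.allclose(Z, Z2)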
diff --git a/models/recall/gnn/data_process.sh b/models/recall/gnn/data_process.sh index 38877b6906ecd65ef190aae5f1dcf5a74cece6d0..fc7ed827e0368c59cab8134d22f78e2200980f18 100755 --- a/models/recall/gnn/data_process.sh +++ b/models/recall/gnn/data_process.sh @@ -31,5 +31,3 @@ mv diginetica/train.txt train_data mkdir test_data mv diginetica/test.txt test_data - - diff --git a/models/recall/gnn/evaluate_reader.py b/models/recall/gnn/evaluate_reader.py index 904140c2febf5164592348d0b4e8f90f197bbf06..b26ea8fa9fc347ce402575104dcfa6de23aa80fc 100755 --- a/models/recall/gnn/evaluate_reader.py +++ b/models/recall/gnn/evaluate_reader.py @@ -23,7 +23,8 @@ from paddlerec.core.utils import envs class EvaluateReader(Reader): def init(self): - self.batch_size = envs.get_global_env("batch_size", None, "evaluate.reader") + self.batch_size = envs.get_global_env("batch_size", None, + "evaluate.reader") self.input = [] self.length = None @@ -34,7 +35,8 @@ class EvaluateReader(Reader): with open(f, "r") as fin: for line in fin: line = line.strip().split('\t') - res.append(tuple([map(int, line[0].split(',')), int(line[1])])) + res.append( + tuple([map(int, line[0].split(',')), int(line[1])])) return res def make_data(self, cur_batch, batch_size): @@ -75,10 +77,8 @@ class EvaluateReader(Reader): u_deg_out[np.where(u_deg_out == 0)] = 1 adj_out.append(np.divide(adj.transpose(), u_deg_out).transpose()) - seq_index.append( - [[id, np.where(node == i)[0][0]] for i in e[0]]) - last_index.append( - [id, np.where(node == e[0][last_id[id]])[0][0]]) + seq_index.append([[id, np.where(node == i)[0][0]] for i in e[0]]) + last_index.append([id, np.where(node == e[0][last_id[id]])[0][0]]) label.append(e[1] - 1) mask.append([[1] * (last_id[id] + 1) + [0] * (max_seq_len - last_id[id] - 1)]) @@ -101,10 +101,13 @@ class EvaluateReader(Reader): def _reader(): random.shuffle(self.input) group_remain = self.length % batch_group_size - for bg_id in range(0, self.length - group_remain, batch_group_size): - cur_bg = copy.deepcopy(self.input[bg_id:bg_id + batch_group_size]) + for bg_id in range(0, self.length - group_remain, + batch_group_size): + cur_bg = copy.deepcopy(self.input[bg_id:bg_id + + batch_group_size]) if train: - cur_bg = sorted(cur_bg, key=lambda x: len(x[0]), reverse=True) + cur_bg = sorted( + cur_bg, key=lambda x: len(x[0]), reverse=True) for i in range(0, batch_group_size, batch_size): cur_batch = cur_bg[i:i + batch_size] yield self.make_data(cur_batch, batch_size) diff --git a/models/recall/gnn/model.py b/models/recall/gnn/model.py index b98625a6afc094e106b26d1e2b31a8712a9d7b94..027fbb721131e203ed22485b4d8f9bd96b8ed3a3 100755 --- a/models/recall/gnn/model.py +++ b/models/recall/gnn/model.py @@ -30,15 +30,21 @@ class Model(ModelBase): def init_config(self): self._fetch_interval = 1 self.items_num, self.ins_num = self.config_read( - envs.get_global_env("hyper_parameters.config_path", None, self._namespace)) - self.train_batch_size = envs.get_global_env("batch_size", None, "train.reader") - self.evaluate_batch_size = envs.get_global_env("batch_size", None, "evaluate.reader") - self.hidden_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) - self.step = envs.get_global_env("hyper_parameters.gnn_propogation_steps", None, self._namespace) + envs.get_global_env("hyper_parameters.config_path", None, + self._namespace)) + self.train_batch_size = envs.get_global_env("batch_size", None, + "train.reader") + self.evaluate_batch_size = envs.get_global_env("batch_size", None, + 
"evaluate.reader") + self.hidden_size = envs.get_global_env( + "hyper_parameters.sparse_feature_dim", None, self._namespace) + self.step = envs.get_global_env( + "hyper_parameters.gnn_propogation_steps", None, self._namespace) def config_read(self, config_path=None): if config_path is None: - raise ValueError("please set train.model.hyper_parameters.config_path at first") + raise ValueError( + "please set train.model.hyper_parameters.config_path at first") with open(config_path, "r") as fin: item_nums = int(fin.readline().strip()) ins_nums = int(fin.readline().strip()) @@ -46,100 +52,108 @@ class Model(ModelBase): def input(self, bs): self.items = fluid.data( - name="items", - shape=[bs, -1], + name="items", shape=[bs, -1], dtype="int64") # [batch_size, uniq_max] self.seq_index = fluid.data( - name="seq_index", - shape=[bs, -1, 2], + name="seq_index", shape=[bs, -1, 2], dtype="int32") # [batch_size, seq_max, 2] self.last_index = fluid.data( - name="last_index", - shape=[bs, 2], - dtype="int32") # [batch_size, 2] + name="last_index", shape=[bs, 2], dtype="int32") # [batch_size, 2] self.adj_in = fluid.data( - name="adj_in", - shape=[bs, -1, -1], + name="adj_in", shape=[bs, -1, -1], dtype="float32") # [batch_size, seq_max, seq_max] self.adj_out = fluid.data( - name="adj_out", - shape=[bs, -1, -1], + name="adj_out", shape=[bs, -1, -1], dtype="float32") # [batch_size, seq_max, seq_max] self.mask = fluid.data( - name="mask", - shape=[bs, -1, 1], + name="mask", shape=[bs, -1, 1], dtype="float32") # [batch_size, seq_max, 1] self.label = fluid.data( - name="label", - shape=[bs, 1], - dtype="int64") # [batch_size, 1] + name="label", shape=[bs, 1], dtype="int64") # [batch_size, 1] - res = [self.items, self.seq_index, self.last_index, self.adj_in, self.adj_out, self.mask, self.label] + res = [ + self.items, self.seq_index, self.last_index, self.adj_in, + self.adj_out, self.mask, self.label + ] return res def train_input(self): res = self.input(self.train_batch_size) self._data_var = res - use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace) + use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", + False, self._namespace) if self._platform != "LINUX" or use_dataloader: self._data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._data_var, capacity=256, use_double_buffer=False, iterable=False) + feed_list=self._data_var, + capacity=256, + use_double_buffer=False, + iterable=False) def net(self, items_num, hidden_size, step, bs): stdv = 1.0 / math.sqrt(hidden_size) - def embedding_layer(input, table_name, emb_dim, initializer_instance=None): + def embedding_layer(input, + table_name, + emb_dim, + initializer_instance=None): emb = fluid.embedding( input=input, size=[items_num, emb_dim], param_attr=fluid.ParamAttr( - name=table_name, - initializer=initializer_instance), - ) + name=table_name, initializer=initializer_instance), ) return emb sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv) - items_emb = embedding_layer(self.items, "emb", hidden_size, sparse_initializer) + items_emb = embedding_layer(self.items, "emb", hidden_size, + sparse_initializer) pre_state = items_emb for i in range(step): - pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size]) + pre_state = layers.reshape( + x=pre_state, shape=[bs, -1, hidden_size]) state_in = layers.fc( input=pre_state, name="state_in", size=hidden_size, act=None, num_flatten_dims=2, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - 
low=-stdv, high=stdv)), - bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) # [batch_size, uniq_max, h] + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-stdv, high=stdv)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-stdv, high=stdv))) # [batch_size, uniq_max, h] state_out = layers.fc( input=pre_state, name="state_out", size=hidden_size, act=None, num_flatten_dims=2, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv)), - bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) # [batch_size, uniq_max, h] + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-stdv, high=stdv)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-stdv, high=stdv))) # [batch_size, uniq_max, h] - state_adj_in = layers.matmul(self.adj_in, state_in) # [batch_size, uniq_max, h] - state_adj_out = layers.matmul(self.adj_out, state_out) # [batch_size, uniq_max, h] + state_adj_in = layers.matmul(self.adj_in, + state_in) # [batch_size, uniq_max, h] + state_adj_out = layers.matmul( + self.adj_out, state_out) # [batch_size, uniq_max, h] gru_input = layers.concat([state_adj_in, state_adj_out], axis=2) - gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2]) - gru_fc = layers.fc( - input=gru_input, - name="gru_fc", - size=3 * hidden_size, - bias_attr=False) + gru_input = layers.reshape( + x=gru_input, shape=[-1, hidden_size * 2]) + gru_fc = layers.fc(input=gru_input, + name="gru_fc", + size=3 * hidden_size, + bias_attr=False) pre_state, _, _ = fluid.layers.gru_unit( input=gru_fc, - hidden=layers.reshape(x=pre_state, shape=[-1, hidden_size]), + hidden=layers.reshape( + x=pre_state, shape=[-1, hidden_size]), size=3 * hidden_size) final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size]) @@ -153,24 +167,22 @@ class Model(ModelBase): bias_attr=False, act=None, num_flatten_dims=2, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) # [batch_size, seq_max, h] - last_fc = layers.fc( - input=last, - name="last_fc", - size=hidden_size, - bias_attr=False, - act=None, - num_flatten_dims=1, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) # [bathc_size, h] + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-stdv, high=stdv))) # [batch_size, seq_max, h] + last_fc = layers.fc(input=last, + name="last_fc", + size=hidden_size, + bias_attr=False, + act=None, + num_flatten_dims=1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-stdv, high=stdv))) # [batch_size, h] seq_fc_t = layers.transpose( seq_fc, perm=[1, 0, 2]) # [seq_max, batch_size, h] - add = layers.elementwise_add( - seq_fc_t, last_fc) # [seq_max, batch_size, h] + add = layers.elementwise_add(seq_fc_t, + last_fc) # [seq_max, batch_size, h] b = layers.create_parameter( shape=[hidden_size], dtype='float32', @@ -188,12 +200,13 @@ class Model(ModelBase): act=None, num_flatten_dims=2, bias_attr=False, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) # [batch_size, seq_max, 1] + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-stdv, high=stdv))) # [batch_size, seq_max, 1] weight *= self.mask - weight_mask = layers.elementwise_mul(seq, weight, axis=0) # [batch_size, seq_max, h] - global_attention = layers.reduce_sum(weight_mask,
dim=1) # [batch_size, h] + weight_mask = layers.elementwise_mul( + seq, weight, axis=0) # [batch_size, seq_max, h] + global_attention = layers.reduce_sum( + weight_mask, dim=1) # [batch_size, h] final_attention = layers.concat( [global_attention, last], axis=1) # [batch_size, 2*h] @@ -213,7 +226,8 @@ class Model(ModelBase): # persistable=True, # name="all_vocab") all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32') - all_vocab = fluid.layers.cast(x=fluid.layers.assign(all_vocab), dtype='int64') + all_vocab = fluid.layers.cast( + x=fluid.layers.assign(all_vocab), dtype='int64') all_emb = fluid.embedding( input=all_vocab, @@ -240,15 +254,19 @@ class Model(ModelBase): def train_net(self): self.train_input() - self.net(self.items_num, self.hidden_size, self.step, self.train_batch_size) + self.net(self.items_num, self.hidden_size, self.step, + self.train_batch_size) self.avg_loss() self.metrics() def optimizer(self): - learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) + learning_rate = envs.get_global_env("hyper_parameters.learning_rate", + None, self._namespace) step_per_epoch = self.ins_num // self.train_batch_size - decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None, self._namespace) - decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None, self._namespace) + decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None, + self._namespace) + decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None, + self._namespace) l2 = envs.get_global_env("hyper_parameters.l2", None, self._namespace) optimizer = fluid.optimizer.Adam( learning_rate=fluid.layers.exponential_decay( @@ -266,10 +284,14 @@ class Model(ModelBase): self._infer_data_var = res self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._infer_data_var, + capacity=64, + use_double_buffer=False, + iterable=False) def infer_net(self): self.infer_input() - self.net(self.items_num, self.hidden_size, self.step, self.evaluate_batch_size) + self.net(self.items_num, self.hidden_size, self.step, + self.evaluate_batch_size) self._infer_results['acc'] = self.acc self._infer_results['loss'] = self.loss diff --git a/models/recall/gnn/raw_data/convert_data.py b/models/recall/gnn/raw_data/convert_data.py index 2e0e57f1f781f7210c46ef265e1189e99a6f7a96..dfe6bc49fcfca0b98ed5cb0ee9d41832dc5c2205 100755 --- a/models/recall/gnn/raw_data/convert_data.py +++ b/models/recall/gnn/raw_data/convert_data.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
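For reference, both gnn readers in this patch (evaluate_reader.py above, reader.py below) convert a session's click sequence into a small graph with degree-normalized adjacency matrices; the hunks show the adj_out branch. A minimal NumPy sketch of the construction (the adj_in branch is inferred by symmetry and should be treated as an assumption):

import numpy as np

seq = [1, 2, 3, 2]                     # one session: clicked item ids
node = np.unique(seq)                  # graph nodes, here [1, 2, 3]
adj = np.zeros((len(node), len(node)))
for u, v in zip(seq, seq[1:]):         # consecutive clicks become edges
    adj[np.where(node == u)[0][0], np.where(node == v)[0][0]] = 1

u_deg_in = np.sum(adj, 0)
u_deg_in[np.where(u_deg_in == 0)] = 1  # guard against division by zero
adj_in = np.divide(adj, u_deg_in)      # columns scaled by in-degree

u_deg_out = np.sum(adj, 1)
u_deg_out[np.where(u_deg_out == 0)] = 1
adj_out = np.divide(adj.transpose(), u_deg_out).transpose()  # rows by out-degree
print(adj_in, adj_out, sep='\n')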
+ import argparse import time import pickle @@ -10,6 +24,7 @@ parser.add_argument( help='dataset dir: diginetica/yoochoose1_4/yoochoose1_64/sample') opt = parser.parse_args() + def process_data(file_type): path = os.path.join(opt.data_dir, file_type) output_path = os.path.splitext(path)[0] + ".txt" @@ -23,6 +38,7 @@ def process_data(file_type): fout.write(str(data[i][1])) fout.write("\n") + process_data("train") process_data("test") diff --git a/models/recall/gnn/raw_data/download.py b/models/recall/gnn/raw_data/download.py index 69a1ee20b2d634e9eca47c621dce82ac2d98b5f2..9bebdf1b37e2cd45369c14bb7446c206de8017a0 100755 --- a/models/recall/gnn/raw_data/download.py +++ b/models/recall/gnn/raw_data/download.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import requests import sys import time diff --git a/models/recall/gnn/reader.py b/models/recall/gnn/reader.py index cffb45115ed6a3dd6232b34db8758ad6a20447e2..68170f09a7a7c84547a67f970b6e127de40b0ccc 100755 --- a/models/recall/gnn/reader.py +++ b/models/recall/gnn/reader.py @@ -23,7 +23,8 @@ from paddlerec.core.utils import envs class TrainReader(Reader): def init(self): - self.batch_size = envs.get_global_env("batch_size", None, "train.reader") + self.batch_size = envs.get_global_env("batch_size", None, + "train.reader") self.input = [] self.length = None @@ -34,7 +35,8 @@ class TrainReader(Reader): with open(f, "r") as fin: for line in fin: line = line.strip().split('\t') - res.append(tuple([map(int, line[0].split(',')), int(line[1])])) + res.append( + tuple([map(int, line[0].split(',')), int(line[1])])) return res def make_data(self, cur_batch, batch_size): @@ -75,10 +77,8 @@ class TrainReader(Reader): u_deg_out[np.where(u_deg_out == 0)] = 1 adj_out.append(np.divide(adj.transpose(), u_deg_out).transpose()) - seq_index.append( - [[id, np.where(node == i)[0][0]] for i in e[0]]) - last_index.append( - [id, np.where(node == e[0][last_id[id]])[0][0]]) + seq_index.append([[id, np.where(node == i)[0][0]] for i in e[0]]) + last_index.append([id, np.where(node == e[0][last_id[id]])[0][0]]) label.append(e[1] - 1) mask.append([[1] * (last_id[id] + 1) + [0] * (max_seq_len - last_id[id] - 1)]) @@ -101,10 +101,13 @@ class TrainReader(Reader): def _reader(): random.shuffle(self.input) group_remain = self.length % batch_group_size - for bg_id in range(0, self.length - group_remain, batch_group_size): - cur_bg = copy.deepcopy(self.input[bg_id:bg_id + batch_group_size]) + for bg_id in range(0, self.length - group_remain, + batch_group_size): + cur_bg = copy.deepcopy(self.input[bg_id:bg_id + + batch_group_size]) if train: - cur_bg = sorted(cur_bg, key=lambda x: len(x[0]), reverse=True) + cur_bg = sorted( + cur_bg, key=lambda x: len(x[0]), reverse=True) for i in range(0, batch_group_size, batch_size): cur_batch = cur_bg[i:i + batch_size] yield self.make_data(cur_batch, batch_size) diff --git a/models/recall/gru4rec/model.py b/models/recall/gru4rec/model.py index 
b79c7642201990efae56a640954154404bf2e606..6848f1e65d51c9d5e3f9890b3f3f148ef68829fc 100644 --- a/models/recall/gru4rec/model.py +++ b/models/recall/gru4rec/model.py @@ -24,14 +24,22 @@ class Model(ModelBase): def all_vocab_network(self, is_infer=False): """ network definition """ - recall_k = envs.get_global_env("hyper_parameters.recall_k", None, self._namespace) - vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace) - hid_size = envs.get_global_env("hyper_parameters.hid_size", None, self._namespace) - init_low_bound = envs.get_global_env("hyper_parameters.init_low_bound", None, self._namespace) - init_high_bound = envs.get_global_env("hyper_parameters.init_high_bound", None, self._namespace) - emb_lr_x = envs.get_global_env("hyper_parameters.emb_lr_x", None, self._namespace) - gru_lr_x = envs.get_global_env("hyper_parameters.gru_lr_x", None, self._namespace) - fc_lr_x = envs.get_global_env("hyper_parameters.fc_lr_x", None, self._namespace) + recall_k = envs.get_global_env("hyper_parameters.recall_k", None, + self._namespace) + vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, + self._namespace) + hid_size = envs.get_global_env("hyper_parameters.hid_size", None, + self._namespace) + init_low_bound = envs.get_global_env("hyper_parameters.init_low_bound", + None, self._namespace) + init_high_bound = envs.get_global_env( + "hyper_parameters.init_high_bound", None, self._namespace) + emb_lr_x = envs.get_global_env("hyper_parameters.emb_lr_x", None, + self._namespace) + gru_lr_x = envs.get_global_env("hyper_parameters.gru_lr_x", None, + self._namespace) + fc_lr_x = envs.get_global_env("hyper_parameters.fc_lr_x", None, + self._namespace) # Input data src_wordseq = fluid.data( name="src_wordseq", shape=[None, 1], dtype="int64", lod_level=1) @@ -41,7 +49,10 @@ class Model(ModelBase): if is_infer: self._infer_data_var = [src_wordseq, dst_wordseq] self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._infer_data_var, + capacity=64, + use_double_buffer=False, + iterable=False) emb = fluid.embedding( input=src_wordseq, @@ -56,7 +67,8 @@ class Model(ModelBase): size=hid_size * 3, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( - low=init_low_bound, high=init_high_bound), + low=init_low_bound, + high=init_high_bound), learning_rate=gru_lr_x)) gru_h0 = fluid.layers.dynamic_gru( input=fc0, diff --git a/models/recall/ncf/model.py b/models/recall/ncf/model.py index be7c465dc75d7186f6d63a6d1fbf604f84945891..d2b7fa371be8f068e11e1dd37a63a90b55e96e65 100644 --- a/models/recall/ncf/model.py +++ b/models/recall/ncf/model.py @@ -25,9 +25,12 @@ class Model(ModelBase): ModelBase.__init__(self, config) def input_data(self, is_infer=False): - user_input = fluid.data(name="user_input", shape=[-1, 1], dtype="int64", lod_level=0) - item_input = fluid.data(name="item_input", shape=[-1, 1], dtype="int64", lod_level=0) - label = fluid.data(name="label", shape=[-1, 1], dtype="int64", lod_level=0) + user_input = fluid.data( + name="user_input", shape=[-1, 1], dtype="int64", lod_level=0) + item_input = fluid.data( + name="item_input", shape=[-1, 1], dtype="int64", lod_level=0) + label = fluid.data( + name="label", shape=[-1, 1], dtype="int64", lod_level=0) if is_infer: inputs = [user_input] + [item_input] else: @@ -35,81 +38,104 @@ class Model(ModelBase): self._data_var = inputs return inputs - + def net(self, inputs, is_infer=False): - 
num_users = envs.get_global_env("hyper_parameters.num_users", None, self._namespace) - num_items = envs.get_global_env("hyper_parameters.num_items", None, self._namespace) - latent_dim = envs.get_global_env("hyper_parameters.latent_dim", None, self._namespace) - layers = envs.get_global_env("hyper_parameters.layers", None, self._namespace) - - num_layer = len(layers) #Number of layers in the MLP - - MF_Embedding_User = fluid.embedding(input=inputs[0], - size=[num_users, latent_dim], - param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01), - is_sparse=True) - MF_Embedding_Item = fluid.embedding(input=inputs[1], - size=[num_items, latent_dim], - param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01), - is_sparse=True) - - MLP_Embedding_User = fluid.embedding(input=inputs[0], - size=[num_users, int(layers[0] / 2)], - param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01), - is_sparse=True) - MLP_Embedding_Item = fluid.embedding(input=inputs[1], - size=[num_items, int(layers[0] / 2)], - param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01), - is_sparse=True) - + num_users = envs.get_global_env("hyper_parameters.num_users", None, + self._namespace) + num_items = envs.get_global_env("hyper_parameters.num_items", None, + self._namespace) + latent_dim = envs.get_global_env("hyper_parameters.latent_dim", None, + self._namespace) + layers = envs.get_global_env("hyper_parameters.layers", None, + self._namespace) + + num_layer = len(layers) #Number of layers in the MLP + + MF_Embedding_User = fluid.embedding( + input=inputs[0], + size=[num_users, latent_dim], + param_attr=fluid.initializer.Normal( + loc=0.0, scale=0.01), + is_sparse=True) + MF_Embedding_Item = fluid.embedding( + input=inputs[1], + size=[num_items, latent_dim], + param_attr=fluid.initializer.Normal( + loc=0.0, scale=0.01), + is_sparse=True) + + MLP_Embedding_User = fluid.embedding( + input=inputs[0], + size=[num_users, int(layers[0] / 2)], + param_attr=fluid.initializer.Normal( + loc=0.0, scale=0.01), + is_sparse=True) + MLP_Embedding_Item = fluid.embedding( + input=inputs[1], + size=[num_items, int(layers[0] / 2)], + param_attr=fluid.initializer.Normal( + loc=0.0, scale=0.01), + is_sparse=True) + # MF part mf_user_latent = fluid.layers.flatten(x=MF_Embedding_User, axis=1) mf_item_latent = fluid.layers.flatten(x=MF_Embedding_Item, axis=1) - mf_vector = fluid.layers.elementwise_mul(mf_user_latent, mf_item_latent) - + mf_vector = fluid.layers.elementwise_mul(mf_user_latent, + mf_item_latent) + # MLP part # The 0-th layer is the concatenation of embedding layers mlp_user_latent = fluid.layers.flatten(x=MLP_Embedding_User, axis=1) mlp_item_latent = fluid.layers.flatten(x=MLP_Embedding_Item, axis=1) - mlp_vector = fluid.layers.concat(input=[mlp_user_latent, mlp_item_latent], axis=-1) - + mlp_vector = fluid.layers.concat( + input=[mlp_user_latent, mlp_item_latent], axis=-1) + for i in range(1, num_layer): - mlp_vector = fluid.layers.fc(input=mlp_vector, - size=layers[i], - act='relu', - param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(mlp_vector.shape[1])), - regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)), - name='layer_' + str(i)) - + mlp_vector = fluid.layers.fc( + input=mlp_vector, + size=layers[i], + act='relu', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.TruncatedNormal( + loc=0.0, scale=1.0 / math.sqrt(mlp_vector.shape[1])), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4)), + name='layer_' + 
str(i)) + # Concatenate MF and MLP parts - predict_vector = fluid.layers.concat(input=[mf_vector, mlp_vector], axis=-1) + predict_vector = fluid.layers.concat( + input=[mf_vector, mlp_vector], axis=-1) # Final prediction layer - prediction = fluid.layers.fc(input=predict_vector, - size=1, - act='sigmoid', - param_attr=fluid.initializer.MSRAInitializer(uniform=True), - name='prediction') + prediction = fluid.layers.fc( + input=predict_vector, + size=1, + act='sigmoid', + param_attr=fluid.initializer.MSRAInitializer(uniform=True), + name='prediction') if is_infer: self._infer_results["prediction"] = prediction return - - cost = fluid.layers.log_loss(input=prediction, label=fluid.layers.cast(x=inputs[2], dtype='float32')) + + cost = fluid.layers.log_loss( + input=prediction, + label=fluid.layers.cast( + x=inputs[2], dtype='float32')) avg_cost = fluid.layers.mean(cost) - self._cost = avg_cost self._metrics["cost"] = avg_cost - def train_net(self): input_data = self.input_data() self.net(input_data) - def infer_net(self): self._infer_data_var = self.input_data(is_infer=True) self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._infer_data_var, + capacity=64, + use_double_buffer=False, + iterable=False) self.net(self._infer_data_var, is_infer=True) diff --git a/models/recall/ncf/movielens_infer_reader.py b/models/recall/ncf/movielens_infer_reader.py index 04f159962e89b28d0e044cfbbc1fcae5a15f3f0d..dc737aed2b8f93a5d4274938cf468e8d9240be04 100644 --- a/models/recall/ncf/movielens_infer_reader.py +++ b/models/recall/ncf/movielens_infer_reader.py @@ -33,7 +33,9 @@ class EvaluateReader(Reader): This function needs to be implemented by the user, based on data format """ features = line.strip().split(',') - + feature_name = ["user_input", "item_input"] - yield zip(feature_name, [[int(features[0])]] + [[int(features[1])]]) + yield zip(feature_name, + [[int(features[0])]] + [[int(features[1])]]) + return reader diff --git a/models/recall/ncf/movielens_reader.py b/models/recall/ncf/movielens_reader.py index 789a71add824e9759734be3bc571ec2152e9f50c..add9b6397cef93f3a8f416f19c6847c41537fb5f 100644 --- a/models/recall/ncf/movielens_reader.py +++ b/models/recall/ncf/movielens_reader.py @@ -33,10 +33,9 @@ class TrainReader(Reader): This function needs to be implemented by the user, based on data format """ features = line.strip().split(',') - + feature_name = ["user_input", "item_input", "label"] - yield zip(feature_name, [[int(features[0])]] + [[int(features[1])]] + [[int(features[2])]]) + yield zip(feature_name, [[int(features[0])]] + + [[int(features[1])]] + [[int(features[2])]]) return reader - - diff --git a/models/recall/readme.md b/models/recall/readme.md index 664ced053934d461fb2ed4311a8fd4a1f4d9bd8a..421df1315dc22396f2ff3bb5aec99508435e2c8d 100755 --- a/models/recall/readme.md +++ b/models/recall/readme.md @@ -78,4 +78,3 @@ python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn | MOVIELENS | NCF | 0.688 | -- | | -- | Youtube | -- | -- | | 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 | - diff --git a/models/recall/ssr/model.py b/models/recall/ssr/model.py index 2c4b7f190088cd7681720f83e3a53730b790d462..3abe3ae41514d97d46d86b52680076cf5932386c 100644 --- a/models/recall/ssr/model.py +++ b/models/recall/ssr/model.py @@ -79,9 +79,12 @@ class Model(ModelBase): return correct def train(self): - vocab_size = envs.get_global_env("hyper_parameters.vocab_size", 
None, self._namespace) - emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace) - hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace) + vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, + self._namespace) + emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, + self._namespace) + hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, + self._namespace) emb_shape = [vocab_size, emb_dim] self.user_encoder = GrnnEncoder() @@ -131,24 +134,34 @@ class Model(ModelBase): self.train() def infer(self): - vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace) - emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace) - hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace) + vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, + self._namespace) + emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, + self._namespace) + hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, + self._namespace) user_data = fluid.data( name="user", shape=[None, 1], dtype="int64", lod_level=1) all_item_data = fluid.data( name="all_item", shape=[None, vocab_size], dtype="int64") - pos_label = fluid.data(name="pos_label", shape=[None, 1], dtype="int64") + pos_label = fluid.data( + name="pos_label", shape=[None, 1], dtype="int64") self._infer_data_var = [user_data, all_item_data, pos_label] self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._infer_data_var, + capacity=64, + use_double_buffer=False, + iterable=False) user_emb = fluid.embedding( input=user_data, size=[vocab_size, emb_dim], param_attr="emb.item") all_item_emb = fluid.embedding( - input=all_item_data, size=[vocab_size, emb_dim], param_attr="emb.item") - all_item_emb_re = fluid.layers.reshape(x=all_item_emb, shape=[-1, emb_dim]) + input=all_item_data, + size=[vocab_size, emb_dim], + param_attr="emb.item") + all_item_emb_re = fluid.layers.reshape( + x=all_item_emb, shape=[-1, emb_dim]) user_encoder = GrnnEncoder() user_enc = user_encoder.forward(user_emb) @@ -156,7 +169,8 @@ class Model(ModelBase): size=hidden_size, param_attr='user.w', bias_attr="user.b") - user_exp = fluid.layers.expand(x=user_hid, expand_times=[1, vocab_size]) + user_exp = fluid.layers.expand( + x=user_hid, expand_times=[1, vocab_size]) user_re = fluid.layers.reshape(x=user_exp, shape=[-1, hidden_size]) all_item_hid = fluid.layers.fc(input=all_item_emb_re, diff --git a/models/recall/ssr/ssr_infer_reader.py b/models/recall/ssr/ssr_infer_reader.py index 18f3fc2f37236907801fb00047fd3b6da5b5fa8c..1f94b1d21fbd428282d3e9faecd09a590588fbc9 100644 --- a/models/recall/ssr/ssr_infer_reader.py +++ b/models/recall/ssr/ssr_infer_reader.py @@ -22,7 +22,8 @@ from paddlerec.core.utils import envs class EvaluateReader(Reader): def init(self): - self.vocab_size = envs.get_global_env("vocab_size", 10, "train.model.hyper_parameters") + self.vocab_size = envs.get_global_env("vocab_size", 10, + "train.model.hyper_parameters") def generate_sample(self, line): """ @@ -39,6 +40,9 @@ class EvaluateReader(Reader): src = conv_ids[:boundary] pos_tgt = [conv_ids[boundary]] feature_name = ["user", "all_item", "p_item"] - yield zip(feature_name, [src] + [np.arange(self.vocab_size).astype("int64").tolist()] + [pos_tgt]) + yield zip( + feature_name, + [src] + 
[np.arange(self.vocab_size).astype("int64").tolist()] + + [pos_tgt]) return reader diff --git a/models/recall/word2vec/model.py b/models/recall/word2vec/model.py index bf09a04648a71a6618b99ef7de7d7244aaecbdba..fefc89043c2f926f37318e1094b9cdf98dd6235a 100755 --- a/models/recall/word2vec/model.py +++ b/models/recall/word2vec/model.py @@ -24,46 +24,57 @@ class Model(ModelBase): ModelBase.__init__(self, config) def input(self): - neg_num = int(envs.get_global_env( - "hyper_parameters.neg_num", None, self._namespace)) - self.input_word = fluid.data(name="input_word", shape=[ - None, 1], dtype='int64') - self.true_word = fluid.data(name='true_label', shape=[ - None, 1], dtype='int64') + neg_num = int( + envs.get_global_env("hyper_parameters.neg_num", None, + self._namespace)) + self.input_word = fluid.data( + name="input_word", shape=[None, 1], dtype='int64') + self.true_word = fluid.data( + name='true_label', shape=[None, 1], dtype='int64') self._data_var.append(self.input_word) self._data_var.append(self.true_word) - with_shuffle_batch = bool(int(envs.get_global_env( - "hyper_parameters.with_shuffle_batch", None, self._namespace))) + with_shuffle_batch = bool( + int( + envs.get_global_env("hyper_parameters.with_shuffle_batch", + None, self._namespace))) if not with_shuffle_batch: - self.neg_word = fluid.data(name="neg_label", shape=[ - None, neg_num], dtype='int64') + self.neg_word = fluid.data( + name="neg_label", shape=[None, neg_num], dtype='int64') self._data_var.append(self.neg_word) if self._platform != "LINUX": self._data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._data_var, + capacity=64, + use_double_buffer=False, + iterable=False) def net(self): is_distributed = True if envs.get_trainer() == "CtrTrainer" else False - neg_num = int(envs.get_global_env( - "hyper_parameters.neg_num", None, self._namespace)) + neg_num = int( + envs.get_global_env("hyper_parameters.neg_num", None, + self._namespace)) sparse_feature_number = envs.get_global_env( "hyper_parameters.sparse_feature_number", None, self._namespace) sparse_feature_dim = envs.get_global_env( "hyper_parameters.sparse_feature_dim", None, self._namespace) - with_shuffle_batch = bool(int(envs.get_global_env( - "hyper_parameters.with_shuffle_batch", None, self._namespace))) + with_shuffle_batch = bool( + int( + envs.get_global_env("hyper_parameters.with_shuffle_batch", + None, self._namespace))) - def embedding_layer(input, table_name, emb_dim, initializer_instance=None, squeeze=False): + def embedding_layer(input, + table_name, + emb_dim, + initializer_instance=None, + squeeze=False): emb = fluid.embedding( input=input, is_sparse=True, is_distributed=is_distributed, size=[sparse_feature_number, emb_dim], param_attr=fluid.ParamAttr( - name=table_name, - initializer=initializer_instance), - ) + name=table_name, initializer=initializer_instance), ) if squeeze: return fluid.layers.squeeze(input=emb, axes=[1]) else: @@ -73,35 +84,38 @@ class Model(ModelBase): emb_initializer = fluid.initializer.Uniform(-init_width, init_width) emb_w_initializer = fluid.initializer.Constant(value=0.0) - input_emb = embedding_layer( - self.input_word, "emb", sparse_feature_dim, emb_initializer, True) - true_emb_w = embedding_layer( - self.true_word, "emb_w", sparse_feature_dim, emb_w_initializer, True) - true_emb_b = embedding_layer( - self.true_word, "emb_b", 1, emb_w_initializer, True) + input_emb = embedding_layer(self.input_word, "emb", 
sparse_feature_dim, + emb_initializer, True) + true_emb_w = embedding_layer(self.true_word, "emb_w", + sparse_feature_dim, emb_w_initializer, + True) + true_emb_b = embedding_layer(self.true_word, "emb_b", 1, + emb_w_initializer, True) if with_shuffle_batch: neg_emb_w_list = [] for i in range(neg_num): - neg_emb_w_list.append(fluid.contrib.layers.shuffle_batch( - true_emb_w)) # shuffle true_word + neg_emb_w_list.append( + fluid.contrib.layers.shuffle_batch( + true_emb_w)) # shuffle true_word neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0) neg_emb_w = fluid.layers.reshape( neg_emb_w_concat, shape=[-1, neg_num, sparse_feature_dim]) neg_emb_b_list = [] for i in range(neg_num): - neg_emb_b_list.append(fluid.contrib.layers.shuffle_batch( - true_emb_b)) # shuffle true_word + neg_emb_b_list.append( + fluid.contrib.layers.shuffle_batch( + true_emb_b)) # shuffle true_word neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0) neg_emb_b_vec = fluid.layers.reshape( neg_emb_b, shape=[-1, neg_num]) else: - neg_emb_w = embedding_layer( - self.neg_word, "emb_w", sparse_feature_dim, emb_w_initializer) - neg_emb_b = embedding_layer( - self.neg_word, "emb_b", 1, emb_w_initializer) + neg_emb_w = embedding_layer(self.neg_word, "emb_w", + sparse_feature_dim, emb_w_initializer) + neg_emb_b = embedding_layer(self.neg_word, "emb_b", 1, + emb_w_initializer) neg_emb_b_vec = fluid.layers.reshape( neg_emb_b, shape=[-1, neg_num]) @@ -117,7 +131,8 @@ class Model(ModelBase): neg_matmul = fluid.layers.matmul( input_emb_re, neg_emb_w, transpose_y=True) neg_logits = fluid.layers.elementwise_add( - fluid.layers.reshape(neg_matmul, shape=[-1, neg_num]), + fluid.layers.reshape( + neg_matmul, shape=[-1, neg_num]), neg_emb_b_vec) label_ones = fluid.layers.fill_constant_batch_size_like( @@ -136,9 +151,17 @@ class Model(ModelBase): neg_xent, dim=1)) self.avg_cost = fluid.layers.reduce_mean(cost) global_right_cnt = fluid.layers.create_global_var( - name="global_right_cnt", persistable=True, dtype='float32', shape=[1], value=0) + name="global_right_cnt", + persistable=True, + dtype='float32', + shape=[1], + value=0) global_total_cnt = fluid.layers.create_global_var( - name="global_total_cnt", persistable=True, dtype='float32', shape=[1], value=0) + name="global_total_cnt", + persistable=True, + dtype='float32', + shape=[1], + value=0) global_right_cnt.stop_gradient = True global_total_cnt.stop_gradient = True @@ -155,12 +178,12 @@ class Model(ModelBase): self.metrics() def optimizer(self): - learning_rate = envs.get_global_env( - "hyper_parameters.learning_rate", None, self._namespace) - decay_steps = envs.get_global_env( - "hyper_parameters.decay_steps", None, self._namespace) - decay_rate = envs.get_global_env( - "hyper_parameters.decay_rate", None, self._namespace) + learning_rate = envs.get_global_env("hyper_parameters.learning_rate", + None, self._namespace) + decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None, + self._namespace) + decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None, + self._namespace) optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.exponential_decay( learning_rate=learning_rate, @@ -180,11 +203,15 @@ class Model(ModelBase): name="analogy_c", shape=[None], dtype='int64') self.analogy_d = fluid.data( name="analogy_d", shape=[None], dtype='int64') - self._infer_data_var = [self.analogy_a, - self.analogy_b, self.analogy_c, self.analogy_d] + self._infer_data_var = [ + self.analogy_a, self.analogy_b, self.analogy_c, self.analogy_d + ] 
self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._infer_data_var, + capacity=64, + use_double_buffer=False, + iterable=False) def infer_net(self): sparse_feature_dim = envs.get_global_env( @@ -216,18 +243,28 @@ class Model(ModelBase): dist = fluid.layers.matmul( x=target, y=emb_all_label_l2, transpose_y=True) values, pred_idx = fluid.layers.topk(input=dist, k=4) - label = fluid.layers.expand(fluid.layers.unsqueeze( - self.analogy_d, axes=[1]), expand_times=[1, 4]) + label = fluid.layers.expand( + fluid.layers.unsqueeze( + self.analogy_d, axes=[1]), + expand_times=[1, 4]) label_ones = fluid.layers.fill_constant_batch_size_like( label, shape=[-1, 1], value=1.0, dtype='float32') - right_cnt = fluid.layers.reduce_sum( - input=fluid.layers.cast(fluid.layers.equal(pred_idx, label), dtype='float32')) + right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast( + fluid.layers.equal(pred_idx, label), dtype='float32')) total_cnt = fluid.layers.reduce_sum(label_ones) global_right_cnt = fluid.layers.create_global_var( - name="global_right_cnt", persistable=True, dtype='float32', shape=[1], value=0) + name="global_right_cnt", + persistable=True, + dtype='float32', + shape=[1], + value=0) global_total_cnt = fluid.layers.create_global_var( - name="global_total_cnt", persistable=True, dtype='float32', shape=[1], value=0) + name="global_total_cnt", + persistable=True, + dtype='float32', + shape=[1], + value=0) global_right_cnt.stop_gradient = True global_total_cnt.stop_gradient = True diff --git a/models/recall/word2vec/prepare_data.sh b/models/recall/word2vec/prepare_data.sh index 8b78eeedd94f088e206e35729a6b35d349b99039..cfd067350ce1d33112806ab72ca78222381a86f4 100755 --- a/models/recall/word2vec/prepare_data.sh +++ b/models/recall/word2vec/prepare_data.sh @@ -35,6 +35,3 @@ wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.ta tar xzvf test_dir.tar -C raw_data mv raw_data/data/test_dir test_data/ rm -rf raw_data - - - diff --git a/models/recall/word2vec/preprocess.py b/models/recall/word2vec/preprocess.py index 9c9934e40589bdc700b7df5dc432d9b6dc92a8cc..6c9ee16cd2d136006dc10e7ce0c970974e8bf2b5 100755 --- a/models/recall/word2vec/preprocess.py +++ b/models/recall/word2vec/preprocess.py @@ -49,8 +49,7 @@ def parse_args(): '--file_nums', type=int, default=1024, - help="re-split input corpus file nums" - ) + help="re-split input corpus file nums") parser.add_argument( '--downsample', type=float, @@ -137,9 +136,11 @@ def filter_corpus(args): if not os.path.exists(args.output_corpus_dir): os.makedirs(args.output_corpus_dir) for file in os.listdir(args.input_corpus_dir): - with io.open(args.output_corpus_dir + '/convert_' + file + '.csv', "w") as wf: + with io.open(args.output_corpus_dir + '/convert_' + file + '.csv', + "w") as wf: with io.open( - args.input_corpus_dir + '/' + file, encoding='utf-8') as rf: + args.input_corpus_dir + '/' + file, + encoding='utf-8') as rf: print(args.input_corpus_dir + '/' + file) for line in rf: signal = False @@ -154,9 +155,9 @@ def filter_corpus(args): count_w = id_counts[idx] corpus_size = word_all_count keep_prob = ( - math.sqrt(count_w / - (args.downsample * corpus_size)) + 1 - ) * (args.downsample * corpus_size) / count_w + math.sqrt(count_w / + (args.downsample * corpus_size)) + 1 + ) * (args.downsample * corpus_size) / count_w r_value = random.random() if r_value > keep_prob: continue @@ -182,7 +183,8 @@ def build_dict(args): 
for file in os.listdir(args.build_dict_corpus_dir): with io.open( - args.build_dict_corpus_dir + "/" + file, encoding='utf-8') as f: + args.build_dict_corpus_dir + "/" + file, + encoding='utf-8') as f: print("build dict : ", args.build_dict_corpus_dir + "/" + file) for line in f: line = text_strip(line) @@ -232,7 +234,8 @@ def data_split(args): for i in range(1, num + 1): with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout: - data = contents[(i - 1) * lines_per_file:min(i * lines_per_file, len(contents))] + data = contents[(i - 1) * lines_per_file:min(i * lines_per_file, + len(contents))] for line in data: fout.write(line) diff --git a/models/recall/word2vec/w2v_evaluate_reader.py b/models/recall/word2vec/w2v_evaluate_reader.py index 04be9d41b2cd1ec51768696817a57c38dd958a44..6350c960e61d8ef3580cc4cc605ba24cb5623b0b 100755 --- a/models/recall/word2vec/w2v_evaluate_reader.py +++ b/models/recall/word2vec/w2v_evaluate_reader.py @@ -22,7 +22,8 @@ from paddlerec.core.utils import envs class EvaluateReader(Reader): def init(self): - dict_path = envs.get_global_env("word_id_dict_path", None, "evaluate.reader") + dict_path = envs.get_global_env("word_id_dict_path", None, + "evaluate.reader") self.word_to_id = dict() self.id_to_word = dict() with io.open(dict_path, 'r', encoding='utf-8') as f: @@ -68,14 +69,17 @@ class EvaluateReader(Reader): a unicode string - a space-delimited sequence of words. """ return u" ".join([ - word if word in original_vocab else u"" for word in line.split() + word if word in original_vocab else u"" + for word in line.split() ]) def generate_sample(self, line): def reader(): features = self.strip_lines(line.lower(), self.word_to_id) features = features.split() - yield [('analogy_a', [self.word_to_id[features[0]]]), ('analogy_b', [self.word_to_id[features[1]]]), - ('analogy_c', [self.word_to_id[features[2]]]), ('analogy_d', [self.word_to_id[features[3]]])] + yield [('analogy_a', [self.word_to_id[features[0]]]), + ('analogy_b', [self.word_to_id[features[1]]]), + ('analogy_c', [self.word_to_id[features[2]]]), + ('analogy_d', [self.word_to_id[features[3]]])] return reader diff --git a/models/recall/word2vec/w2v_reader.py b/models/recall/word2vec/w2v_reader.py index 88e52b47692778feef8396dd037448a8053aa958..9b3e69127055118bbc16b30eaac63f9a282bd1eb 100755 --- a/models/recall/word2vec/w2v_reader.py +++ b/models/recall/word2vec/w2v_reader.py @@ -40,10 +40,14 @@ class NumpyRandomInt(object): class TrainReader(Reader): def init(self): - dict_path = envs.get_global_env("word_count_dict_path", None, "train.reader") - self.window_size = envs.get_global_env("hyper_parameters.window_size", None, "train.model") - self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None, "train.model") - self.with_shuffle_batch = envs.get_global_env("hyper_parameters.with_shuffle_batch", None, "train.model") + dict_path = envs.get_global_env("word_count_dict_path", None, + "train.reader") + self.window_size = envs.get_global_env("hyper_parameters.window_size", + None, "train.model") + self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None, + "train.model") + self.with_shuffle_batch = envs.get_global_env( + "hyper_parameters.with_shuffle_batch", None, "train.model") self.random_generator = NumpyRandomInt(1, self.window_size + 1) self.cs = None @@ -81,13 +85,15 @@ class TrainReader(Reader): def reader(): word_ids = [w for w in line.split()] for idx, target_id in enumerate(word_ids): - context_word_ids = self.get_context_words( - word_ids, idx) + context_word_ids = 
                 for context_id in context_word_ids:
-                    output = [('input_word', [int(target_id)]), ('true_label', [int(context_id)])]
+                    output = [('input_word', [int(target_id)]),
+                              ('true_label', [int(context_id)])]
                     if not self.with_shuffle_batch:
-                        neg_array = self.cs.searchsorted(np.random.sample(self.neg_num))
-                        output += [('neg_label', [int(str(i)) for i in neg_array])]
+                        neg_array = self.cs.searchsorted(
+                            np.random.sample(self.neg_num))
+                        output += [('neg_label',
+                                    [int(str(i)) for i in neg_array])]
                     yield output

         return reader
diff --git a/models/recall/youtube_dnn/model.py b/models/recall/youtube_dnn/model.py
index 63d1fd2f49aad3c59272e560ed64442ab5f2f41e..22953764d1f81218b2f3d4c232392fe741043fa3 100644
--- a/models/recall/youtube_dnn/model.py
+++ b/models/recall/youtube_dnn/model.py
@@ -25,14 +25,20 @@ class Model(ModelBase):
         ModelBase.__init__(self, config)

     def input_data(self, is_infer=False):
-
-        watch_vec_size = envs.get_global_env("hyper_parameters.watch_vec_size", None, self._namespace)
-        search_vec_size = envs.get_global_env("hyper_parameters.search_vec_size", None, self._namespace)
-        other_feat_size = envs.get_global_env("hyper_parameters.other_feat_size", None, self._namespace)
-
-        watch_vec = fluid.data(name="watch_vec", shape=[None, watch_vec_size], dtype="float32")
-        search_vec = fluid.data(name="search_vec", shape=[None, search_vec_size], dtype="float32")
-        other_feat = fluid.data(name="other_feat", shape=[None, other_feat_size], dtype="float32")
+
+        watch_vec_size = envs.get_global_env("hyper_parameters.watch_vec_size",
+                                             None, self._namespace)
+        search_vec_size = envs.get_global_env(
+            "hyper_parameters.search_vec_size", None, self._namespace)
+        other_feat_size = envs.get_global_env(
+            "hyper_parameters.other_feat_size", None, self._namespace)
+
+        watch_vec = fluid.data(
+            name="watch_vec", shape=[None, watch_vec_size], dtype="float32")
+        search_vec = fluid.data(
+            name="search_vec", shape=[None, search_vec_size], dtype="float32")
+        other_feat = fluid.data(
+            name="other_feat", shape=[None, other_feat_size], dtype="float32")
         label = fluid.data(name="label", shape=[None, 1], dtype="int64")
         inputs = [watch_vec] + [search_vec] + [other_feat] + [label]
         self._data_var = inputs
@@ -41,27 +47,32 @@ class Model(ModelBase):

     def fc(self, tag, data, out_dim, active='relu'):
         init_stddev = 1.0
-        scales = 1.0 / np.sqrt(data.shape[1]) 
-        
+        scales = 1.0 / np.sqrt(data.shape[1])
+
         if tag == 'l4':
-            p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag,
-                    initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=init_stddev * scales))
+            p_attr = fluid.param_attr.ParamAttr(
+                name='%s_weight' % tag,
+                initializer=fluid.initializer.NormalInitializer(
+                    loc=0.0, scale=init_stddev * scales))
         else:
             p_attr = None
-        
-        b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))
+
+        b_attr = fluid.ParamAttr(
+            name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1))

         out = fluid.layers.fc(input=data,
-                            size=out_dim,
-                            act=active,
-                            param_attr=p_attr,
-                            bias_attr =b_attr,
-                            name=tag)
+                              size=out_dim,
+                              act=active,
+                              param_attr=p_attr,
+                              bias_attr=b_attr,
+                              name=tag)
         return out

     def net(self, inputs):
-        output_size = envs.get_global_env("hyper_parameters.output_size", None, self._namespace)
-        layers = envs.get_global_env("hyper_parameters.layers", None, self._namespace)
+        output_size = envs.get_global_env("hyper_parameters.output_size", None,
+                                          self._namespace)
+        layers = envs.get_global_env("hyper_parameters.layers", None,
+                                     self._namespace)

         concat_feats = fluid.layers.concat(input=inputs[:-1], axis=-1)

         l1 = self.fc('l1', concat_feats, layers[0], 'relu')
diff --git a/models/recall/youtube_dnn/random_reader.py b/models/recall/youtube_dnn/random_reader.py
index 723c66f9c0fe94d8fe0d36e6a3e75e9945768d40..30df6d1d29cfdf75c7e7cf9b68643af582c9f49f 100644
--- a/models/recall/youtube_dnn/random_reader.py
+++ b/models/recall/youtube_dnn/random_reader.py
@@ -21,10 +21,14 @@ import numpy as np

 class TrainReader(Reader):
     def init(self):
-        self.watch_vec_size = envs.get_global_env("hyper_parameters.watch_vec_size", None, "train.model")
-        self.search_vec_size = envs.get_global_env("hyper_parameters.search_vec_size", None, "train.model")
-        self.other_feat_size = envs.get_global_env("hyper_parameters.other_feat_size", None, "train.model")
-        self.output_size = envs.get_global_env("hyper_parameters.output_size", None, "train.model")
+        self.watch_vec_size = envs.get_global_env(
+            "hyper_parameters.watch_vec_size", None, "train.model")
+        self.search_vec_size = envs.get_global_env(
+            "hyper_parameters.search_vec_size", None, "train.model")
+        self.other_feat_size = envs.get_global_env(
+            "hyper_parameters.other_feat_size", None, "train.model")
+        self.output_size = envs.get_global_env("hyper_parameters.output_size",
+                                               None, "train.model")

     def generate_sample(self, line):
         """
@@ -35,13 +39,12 @@ class TrainReader(Reader):
             """
             This function needs to be implemented by the user, based on data format
             """
-
+
             feature_name = ["watch_vec", "search_vec", "other_feat", "label"]
-            yield zip(feature_name, [np.random.rand(self.watch_vec_size).tolist()] +
-                      [np.random.rand(self.search_vec_size).tolist()] +
-                      [np.random.rand(self.other_feat_size).tolist()] +
-                      [[np.random.randint(self.output_size)]] )
+            yield zip(feature_name,
+                      [np.random.rand(self.watch_vec_size).tolist()] +
+                      [np.random.rand(self.search_vec_size).tolist()] +
+                      [np.random.rand(self.other_feat_size).tolist()] +
+                      [[np.random.randint(self.output_size)]])

         return reader
-
-
diff --git a/models/treebased/README.md b/models/treebased/README.md
index 8a8317d17be5148f0652b7944442e1929a7684af..3ceb13b62eba8127aa0394397d141b2abe343a32 100644
--- a/models/treebased/README.md
+++ b/models/treebased/README.md
@@ -24,4 +24,4 @@ TDM is a recommendation solution designed for large-scale recommender systems that can host arbitrary advanced models to efficiently
 - How is the network built? A: Paddle ships a large number of deep-learning OPs, so users can design their own network structure as needed.
 - How is the training data organized? A: TDM training data mainly consists of a `user/query emb` plus a positive `item`; the `item` must map to a leaf node of the tree, and users only need to prepare data in this form. Negative samples are generated from the user-provided tree structure by Paddle's `tdm-sampler op`, which performs efficient negative sampling, automatically attaches the matching labels, and feeds them into the training of the deep model inside TDM.
 - How are large-scale data and model training handled? A: Paddle's proven large-scale parameter-server capability enables efficient distributed training. The paddle-fleet API keeps the learning curve low and flexibly supports incremental training, streaming training, and similar production needs.
-3. Once the model is trained, retrieval and scoring can be folded into the Paddle network itself, producing an inference_model and parameter files for fast deployment and efficient retrieval with the PaddlePaddle inference library or PaddleLite.
\ No newline at end of file
+3. Once the model is trained, retrieval and scoring can be folded into the Paddle network itself, producing an inference_model and parameter files for fast deployment and efficient retrieval with the PaddlePaddle inference library or PaddleLite.
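The README hunk above describes TDM's training data: a `user/query emb` paired with a positive `item` mapped to a leaf of the tree, with negatives drawn per layer by the `tdm-sampler op`. A rough NumPy sketch of that layer-wise sampling idea, using a toy tree shaped like `tree/layer_list.txt` (the tree, the ancestor path, and the per-layer counts here are illustrative assumptions, not the real op):

```python
import numpy as np

# Toy tree mirroring tree/layer_list.txt: layer_nodes[i] holds the node ids
# of layer i; the positive item lives on one leaf of the last layer.
layer_nodes = [[1, 2], [3, 4, 5, 6], [7, 8, 9, 10, 11, 12, 13]]
positive_path = [1, 4, 9]      # ancestors of the positive leaf, root layer first
neg_sampling_list = [1, 2, 3]  # negatives drawn per layer (assumed values)

rng = np.random.default_rng(2020)
samples = []
for layer, pos in enumerate(positive_path):
    # Each layer trains its own classifier: the positive node is the
    # ancestor of the positive item, negatives come from the same layer.
    candidates = [n for n in layer_nodes[layer] if n != pos]
    negs = rng.choice(
        candidates, size=neg_sampling_list[layer], replace=False).tolist()
    samples.append((layer, [(pos, 1)] + [(n, 0) for n in negs]))

for layer, labeled_nodes in samples:
    print("layer", layer, "->", labeled_nodes)
```

The real `tdm_sampler` op does this inside the graph and also emits the label tensor; the sketch only shows where positives and negatives come from.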
diff --git a/models/treebased/__init__.py b/models/treebased/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100644
--- a/models/treebased/__init__.py
+++ b/models/treebased/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/models/treebased/tdm/__init__.py b/models/treebased/tdm/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755
--- a/models/treebased/tdm/__init__.py
+++ b/models/treebased/tdm/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/models/treebased/tdm/model.py b/models/treebased/tdm/model.py
index fa5f225f68068f826df6fc9ef0c7c9d35dbd9b89..319a7b4f9a3695537b43c8f1078dc4e1b73549fb 100755
--- a/models/treebased/tdm/model.py
+++ b/models/treebased/tdm/model.py
@@ -25,38 +25,38 @@ class Model(ModelBase):
     def __init__(self, config):
         ModelBase.__init__(self, config)
         # tree meta hyper parameters
-        self.max_layers = envs.get_global_env(
-            "tree_parameters.max_layers", 4, self._namespace)
-        self.node_nums = envs.get_global_env(
-            "tree_parameters.node_nums", 26, self._namespace)
+        self.max_layers = envs.get_global_env("tree_parameters.max_layers", 4,
+                                              self._namespace)
+        self.node_nums = envs.get_global_env("tree_parameters.node_nums", 26,
+                                             self._namespace)
         self.leaf_node_nums = envs.get_global_env(
             "tree_parameters.leaf_node_nums", 13, self._namespace)
         self.output_positive = envs.get_global_env(
             "tree_parameters.output_positive", True, self._namespace)
         self.layer_node_num_list = envs.get_global_env(
-            "tree_parameters.layer_node_num_list", [
-                2, 4, 7, 12], self._namespace)
-        self.child_nums = envs.get_global_env(
-            "tree_parameters.child_nums", 2, self._namespace)
-        self.tree_layer_path = envs.get_global_env(
-            "tree.tree_layer_path", None, "train.startup")
+            "tree_parameters.layer_node_num_list", [2, 4, 7,
+                                                    12], self._namespace)
+        self.child_nums = envs.get_global_env("tree_parameters.child_nums", 2,
+                                              self._namespace)
+        self.tree_layer_path = envs.get_global_env("tree.tree_layer_path",
+                                                   None, "train.startup")

         # model training hyper parameter
         self.node_emb_size = envs.get_global_env(
             "hyper_parameters.node_emb_size", 64, self._namespace)
         self.input_emb_size = envs.get_global_env(
             "hyper_parameters.input_emb_size", 768, self._namespace)
-        self.act = envs.get_global_env(
-            "hyper_parameters.act", "tanh", self._namespace)
+        self.act = envs.get_global_env("hyper_parameters.act", "tanh",
+                                       self._namespace)
         self.neg_sampling_list = envs.get_global_env(
-            "hyper_parameters.neg_sampling_list", [
-                1, 2, 3, 4], self._namespace)
+            "hyper_parameters.neg_sampling_list", [1, 2, 3,
+                                                   4], self._namespace)

         # model infer hyper parameter
-        self.topK = envs.get_global_env(
-            "hyper_parameters.node_nums", 1, self._namespace)
-        self.batch_size = envs.get_global_env(
-            "batch_size", 1, "evaluate.reader")
+        self.topK = envs.get_global_env("hyper_parameters.node_nums", 1,
+                                        self._namespace)
+        self.batch_size = envs.get_global_env("batch_size", 1,
+                                              "evaluate.reader")

     def train_net(self):
         self.train_input()
@@ -76,21 +76,22 @@ class Model(ModelBase):
         input_emb = fluid.data(
             name="input_emb",
             shape=[None, self.input_emb_size],
-            dtype="float32",
-        )
+            dtype="float32", )
         self._data_var.append(input_emb)

         item_label = fluid.data(
             name="item_label",
             shape=[None, 1],
-            dtype="int64",
-        )
+            dtype="int64", )
         self._data_var.append(item_label)

         if self._platform != "LINUX":
             self._data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
+                feed_list=self._data_var,
+                capacity=64,
+                use_double_buffer=False,
+                iterable=False)

     def tdm_net(self):
         """
@@ -116,8 +117,7 @@ class Model(ModelBase):
             output_list=True,
             seed=0,
             tree_dtype='int64',
-            dtype='int64'
-        )
+            dtype='int64')

         # Look up the embedding of each sampled node
         sample_nodes_emb = [
@@ -125,35 +125,34 @@ class Model(ModelBase):
                 input=sample_nodes[i],
                 is_sparse=True,
                 size=[self.node_nums, self.node_emb_size],
-                param_attr=fluid.ParamAttr(
-                    name="TDM_Tree_Emb")
-            ) for i in range(self.max_layers)
+                param_attr=fluid.ParamAttr(name="TDM_Tree_Emb"))
+            for i in range(self.max_layers)
         ]

         # Reshape here so the hierarchical classifiers can be trained layer by layer
         sample_nodes_emb = [
-            fluid.layers.reshape(sample_nodes_emb[i],
-                                 [-1, self.neg_sampling_list[i] +
-                                  self.output_positive, self.node_emb_size]
-                                 ) for i in range(self.max_layers)
+            fluid.layers.reshape(sample_nodes_emb[i], [
+                -1, self.neg_sampling_list[i] + self.output_positive,
+                self.node_emb_size
+            ]) for i in range(self.max_layers)
         ]

         # Transform the input_emb so its dimension matches node_emb
         input_trans_emb = self.input_trans_layer(input_emb)

         # Main classifier network: a separate classifier is trained for each tree layer
-        layer_classifier_res = self.classifier_layer(
-            input_trans_emb, sample_nodes_emb)
+        layer_classifier_res = self.classifier_layer(input_trans_emb,
+                                                     sample_nodes_emb)

         # Final probability FC: score the node classification results of all layers under one criterion
        # The tree is very likely unbalanced and some items are not on the last layer, so this mechanism guarantees every item has a chance to be recalled
-        tdm_fc = fluid.layers.fc(input=layer_classifier_res,
-                                 size=2,
-                                 act=None,
-                                 num_flatten_dims=2,
-                                 param_attr=fluid.ParamAttr(
-                                     name="tdm.cls_fc.weight"),
-                                 bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
+        tdm_fc = fluid.layers.fc(
+            input=layer_classifier_res,
+            size=2,
+            act=None,
+            num_flatten_dims=2,
+            param_attr=fluid.ParamAttr(name="tdm.cls_fc.weight"),
+            bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))

         # Flatten the losses and compute the loss of the whole network together
         tdm_fc_re = fluid.layers.reshape(tdm_fc, [-1, 2])
@@ -202,7 +201,7 @@ class Model(ModelBase):
     def metrics(self):
         auc, batch_auc, _ = fluid.layers.auc(input=self._predict,
                                              label=self.mask_label,
-                                             num_thresholds=2 ** 12,
+                                             num_thresholds=2**12,
                                              slide_steps=20)
         self._metrics["AUC"] = auc
         self._metrics["BATCH_AUC"] = batch_auc
@@ -218,8 +217,7 @@ class Model(ModelBase):
             size=self.node_emb_size,
             act=None,
             param_attr=fluid.ParamAttr(name="trans.input_fc.weight"),
-            bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"),
-        )
+            bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), )

         # Map input_emb into the representation space of each tree layer
         input_layer_fc_out = [
@@ -229,8 +227,9 @@ class Model(ModelBase):
                 act=self.act,
                 param_attr=fluid.ParamAttr(
                     name="trans.layer_fc.weight." + str(i)),
-                bias_attr=fluid.ParamAttr(name="trans.layer_fc.bias." + str(i)),
-            ) for i in range(self.max_layers)
+                bias_attr=fluid.ParamAttr(
+                    name="trans.layer_fc.bias." + str(i)), )
+            for i in range(self.max_layers)
         ]
         return input_layer_fc_out
@@ -246,20 +245,22 @@ class Model(ModelBase):
                 input_layer_unsequeeze, expand_times=[1, node.shape[1], 1])
         else:
             input_layer_expand = fluid.layers.expand(
-                input_layer_unsequeeze, expand_times=[1, node[layer_idx].shape[1], 1])
+                input_layer_unsequeeze,
+                expand_times=[1, node[layer_idx].shape[1], 1])
         return input_layer_expand

     def classifier_layer(self, input, node):
         # Expand the input so its dimensions match the node tensor
         input_expand = [
-            self._expand_layer(input[i], node, i) for i in range(self.max_layers)
+            self._expand_layer(input[i], node, i)
+            for i in range(self.max_layers)
         ]

         # Concat input_emb with node_emb, then feed the classifier FC
         input_node_concat = [
             fluid.layers.concat(
-                input=[input_expand[i], node[i]],
-                axis=2) for i in range(self.max_layers)
+                input=[input_expand[i], node[i]], axis=2)
+            for i in range(self.max_layers)
         ]
         hidden_states_fc = [
             fluid.layers.fc(
@@ -269,8 +270,8 @@ class Model(ModelBase):
                 act=self.act,
                 param_attr=fluid.ParamAttr(
                     name="cls.concat_fc.weight." + str(i)),
-                bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i))
-            ) for i in range(self.max_layers)
+                bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i)))
+            for i in range(self.max_layers)
         ]

         # If the nodes of all layers share a single loss, they must be concatenated here
@@ -285,12 +286,14 @@ class Model(ModelBase):
         input_emb = fluid.layers.data(
             name="input_emb",
             shape=[self.input_emb_size],
-            dtype="float32",
-        )
+            dtype="float32", )
         self._infer_data_var.append(input_emb)

         self._infer_data_loader = fluid.io.DataLoader.from_generator(
-            feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
+            feed_list=self._infer_data_var,
+            capacity=64,
+            use_double_buffer=False,
+            iterable=False)

     def get_layer_list(self):
         """get layer list from layer_list.txt"""
@@ -318,10 +321,12 @@ class Model(ModelBase):
         node_list = []
         mask_list = []
         for id in first_layer_node:
-            node_list.append(fluid.layers.fill_constant(
-                [self.batch_size, 1], value=int(id), dtype='int64'))
-            mask_list.append(fluid.layers.fill_constant(
-                [self.batch_size, 1], value=0, dtype='int64'))
+            node_list.append(
+                fluid.layers.fill_constant(
+                    [self.batch_size, 1], value=int(id), dtype='int64'))
+            mask_list.append(
+                fluid.layers.fill_constant(
+                    [self.batch_size, 1], value=0, dtype='int64'))

         self.first_layer_node = fluid.layers.concat(node_list, axis=1)
         self.first_layer_node_mask = fluid.layers.concat(mask_list, axis=1)
@@ -359,28 +364,26 @@ class Model(ModelBase):
                 size=[self.node_nums, self.node_emb_size],
                 param_attr=fluid.ParamAttr(name="TDM_Tree_Emb"))

-            input_fc_out = self.layer_fc_infer(
-                input_trans_emb, layer_idx)
+            input_fc_out = self.layer_fc_infer(input_trans_emb, layer_idx)

             # Run the classifier of this layer
-            layer_classifier_res = self.classifier_layer_infer(input_fc_out,
-                                                               node_emb,
-                                                               layer_idx)
+            layer_classifier_res = self.classifier_layer_infer(
+                input_fc_out, node_emb, layer_idx)

             # Run the final discriminative classifier
-            tdm_fc = fluid.layers.fc(input=layer_classifier_res,
-                                     size=2,
-                                     act=None,
-                                     num_flatten_dims=2,
-                                     param_attr=fluid.ParamAttr(
-                                         name="tdm.cls_fc.weight"),
-                                     bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
+            tdm_fc = fluid.layers.fc(
+                input=layer_classifier_res,
+                size=2,
+                act=None,
+                num_flatten_dims=2,
+                param_attr=fluid.ParamAttr(name="tdm.cls_fc.weight"),
+                bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))

             prob = fluid.layers.softmax(tdm_fc)
             positive_prob = fluid.layers.slice(
                 prob, axes=[2], starts=[1], ends=[2])
-            prob_re = fluid.layers.reshape(
-                positive_prob, [-1, current_layer_node_num])
+            prob_re = fluid.layers.reshape(positive_prob,
+                                           [-1, current_layer_node_num])

             # Filter out the invalid nodes introduced by padding (node_id=0)
             node_zero_mask = fluid.layers.cast(current_layer_node, 'bool')
@@ -395,11 +398,11 @@ class Model(ModelBase):

             # The index_sample op gathers the values of a tensor at the given indices
             # For paddle versions > 2.0 it is called as paddle.index_sample
-            top_node = fluid.contrib.layers.index_sample(
-                current_layer_node, topk_i)
+            top_node = fluid.contrib.layers.index_sample(current_layer_node,
+                                                         topk_i)
             prob_re_mask = prob_re * current_layer_node_mask  # Filter out non-leaf nodes
-            topk_value = fluid.contrib.layers.index_sample(
-                prob_re_mask, topk_i)
+            topk_value = fluid.contrib.layers.index_sample(prob_re_mask,
+                                                           topk_i)
             node_score.append(topk_value)
             node_list.append(top_node)
@@ -424,7 +427,8 @@ class Model(ModelBase):
         res_node = fluid.layers.reshape(res_layer_node, [-1, self.topK, 1])

         # Use the Tree_info table to convert node_id to item_id
-        tree_info = fluid.default_main_program().global_block().var("TDM_Tree_Info")
+        tree_info = fluid.default_main_program().global_block().var(
+            "TDM_Tree_Info")
         res_node_emb = fluid.layers.gather_nd(tree_info, res_node)

         res_item = fluid.layers.slice(
@@ -442,8 +446,7 @@ class Model(ModelBase):
             size=self.node_emb_size,
             act=None,
             param_attr=fluid.ParamAttr(name="trans.input_fc.weight"),
-            bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"),
-        )
+            bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), )
         return input_fc_out

     def layer_fc_infer(self, input_fc_out, layer_idx):
@@ -458,8 +461,7 @@ class Model(ModelBase):
             param_attr=fluid.ParamAttr(
                 name="trans.layer_fc.weight." + str(layer_idx)),
             bias_attr=fluid.ParamAttr(
-                name="trans.layer_fc.bias." + str(layer_idx)),
-        )
+                name="trans.layer_fc.bias." + str(layer_idx)), )
         return input_layer_fc_out

     def classifier_layer_infer(self, input, node, layer_idx):
@@ -480,5 +482,6 @@ class Model(ModelBase):
             act=self.act,
             param_attr=fluid.ParamAttr(
                 name="cls.concat_fc.weight." + str(layer_idx)),
-            bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(layer_idx)))
+            bias_attr=fluid.ParamAttr(
+                name="cls.concat_fc.bias." + str(layer_idx)))
         return hidden_states_fc
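The infer path of `models/treebased/tdm/model.py` above is a layer-wise beam search: score the current frontier, keep the top-K nodes (`topk` plus `index_sample`), expand their children, and finally map the surviving nodes to items via `TDM_Tree_Info`. A plain-Python sketch of that control flow, where the `children` map and `score` callable stand in for the tree info table and the per-layer classifier, and padding/non-leaf masking is simplified away:

```python
import heapq

def tdm_beam_search(children, first_layer, score, topk):
    """Layer-wise top-K retrieval over a tree.

    children: dict node_id -> list of child ids (missing key = leaf),
    a stand-in for TDM_Tree_Info. score: callable node_id -> float,
    a stand-in for the classifier plus final FC.
    """
    frontier = list(first_layer)  # like first_layer_node in the model
    leaves = []
    while frontier:
        # Keep only the best topk nodes on this layer.
        best = heapq.nlargest(topk, frontier, key=score)
        frontier = []
        for node in best:
            kids = children.get(node, [])
            if kids:
                frontier.extend(kids)  # descend one layer
            else:
                leaves.append(node)    # leaf: candidate item
    return heapq.nlargest(topk, leaves, key=score)

# Toy usage with an assumed balanced binary tree and a made-up scorer:
children = {1: [3, 4], 2: [5, 6], 3: [7, 8], 4: [9, 10],
            5: [11, 12], 6: [13, 14]}
print(tdm_beam_search(children, [1, 2], score=lambda n: 1.0 / n, topk=2))
```

The real network does the same thing with tensors for a whole batch at once, which is why the model pads node lists and multiplies by masks instead of branching per node.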
diff --git a/models/treebased/tdm/tree/layer_list.txt b/models/treebased/tdm/tree/layer_list.txt
index d8606bc601202390bd9aa54197fac8f34e3c5b59..d1c6c50a10f1b40aa1fbdef7d57bdd600549fb11 100755
--- a/models/treebased/tdm/tree/layer_list.txt
+++ b/models/treebased/tdm/tree/layer_list.txt
@@ -1,4 +1,4 @@
 1,2
 3,4,5,6
 7,8,9,10,11,12,13
-14,15,16,17,18,19,20,21,22,23,24,25
\ No newline at end of file
+14,15,16,17,18,19,20,21,22,23,24,25
diff --git a/run.py b/run.py
index 56999935f21bc1de2b2bc7b4a080da023559174a..c80c647d0c8bab5cd9918429f9bf460b6093335d 100755
--- a/run.py
+++ b/run.py
@@ -26,8 +26,10 @@ from paddlerec.core.utils import util
 engines = {}
 device = ["CPU", "GPU"]
 clusters = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER"]
-engine_choices = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER",
-                  "TDM_SINGLE", "TDM_LOCAL_CLUSTER", "TDM_CLUSTER"]
+engine_choices = [
+    "SINGLE", "LOCAL_CLUSTER", "CLUSTER", "TDM_SINGLE", "TDM_LOCAL_CLUSTER",
+    "TDM_CLUSTER"
+]
 custom_model = ['TDM']
 model_name = ""
@@ -66,7 +68,8 @@ def get_engine(args):
     engine = engine.upper()

     if engine not in engine_choices:
-        raise ValueError("train.engin can not be chosen in {}".format(engine_choices))
+        raise ValueError("train.engine must be one of {}".format(
+            engine_choices))

     print("engines: \n{}".format(engines))
@@ -77,8 +80,10 @@ def get_transpiler():
     FNULL = open(os.devnull, 'w')
-    cmd = ["python", "-c",
-           "import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"]
+    cmd = [
+        "python", "-c",
+        "import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"
+    ]
     proc = subprocess.Popen(cmd, stdout=FNULL, stderr=FNULL, cwd=os.getcwd())
     ret = proc.wait()
     if ret == -11:
@@ -152,7 +157,8 @@ def cluster_engine(args):
         update_workspace(flattens)

         envs.set_runtime_environs(flattens)
-        print(envs.pretty_print_envs(flattens, ("Submit Runtime Envs", "Value")))
+        print(envs.pretty_print_envs(flattens, ("Submit Runtime Envs", "Value"
+                                                )))

         launch = ClusterEngine(None, args.model)
         return launch
@@ -163,7 +169,8 @@ def cluster_engine(args):
         cluster_envs = {}
         cluster_envs["train.trainer.trainer"] = trainer
         cluster_envs["train.trainer.engine"] = "cluster"
-        cluster_envs["train.trainer.threads"] = envs.get_runtime_environ("CPU_NUM")
+        cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
+            "CPU_NUM")
         cluster_envs["train.trainer.platform"] = envs.get_platform()
         print("launch {} engine with cluster to run model: {}".format(
             trainer, args.model))
@@ -181,7 +188,8 @@ def cluster_engine(args):

 def cluster_mpi_engine(args):
-    print("launch cluster engine with cluster to run model: {}".format(args.model))
+    print("launch cluster engine with cluster to run model: {}".format(
+        args.model))

     cluster_envs = {}
     cluster_envs["train.trainer.trainer"] = "CtrCodingTrainer"
@@ -209,7 +217,8 @@ def local_cluster_engine(args):
     cluster_envs["train.trainer.platform"] = envs.get_platform()
     cluster_envs["CPU_NUM"] = "2"
-    print("launch {} engine with cluster to run model: {}".format(trainer, args.model))
+    print("launch {} engine with cluster to run model: {}".format(trainer,
+                                                                   args.model))

     set_runtime_envs(cluster_envs, args.model)
     launch = LocalClusterEngine(cluster_envs, args.model)
@@ -217,10 +226,12 @@ def local_cluster_engine(args):

 def local_mpi_engine(args):
-    print("launch cluster engine with cluster to run model: {}".format(args.model))
with cluster to run model: {}".format( + args.model)) from paddlerec.core.engine.local_mpi import LocalMPIEngine - print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(args.model)) + print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format( + args.model)) mpi = util.run_which("mpirun") if not mpi: diff --git a/setup.py b/setup.py index c655c37576e310fac825bd1cc01dfca5d051d18c..31bb34f03187dc9ab29c4cc5c75c559540ca8269 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ setup for paddle-rec. """ @@ -22,11 +21,7 @@ from setuptools import setup, find_packages import shutil import tempfile - -requires = [ - "paddlepaddle == 1.7.2", - "pyyaml >= 5.1.1" -] +requires = ["paddlepaddle == 1.7.2", "pyyaml >= 5.1.1"] about = {} about["__title__"] = "paddle-rec" @@ -48,18 +43,27 @@ def build(dirname): package_dir = os.path.dirname(os.path.abspath(__file__)) run_cmd("cp -r {}/* {}".format(package_dir, dirname)) run_cmd("mkdir {}".format(os.path.join(dirname, "paddlerec"))) - run_cmd("mv {} {}".format(os.path.join(dirname, "core"), os.path.join(dirname, "paddlerec"))) - run_cmd("mv {} {}".format(os.path.join(dirname, "doc"), os.path.join(dirname, "paddlerec"))) - run_cmd("mv {} {}".format(os.path.join(dirname, "models"), os.path.join(dirname, "paddlerec"))) - run_cmd("mv {} {}".format(os.path.join(dirname, "tests"), os.path.join(dirname, "paddlerec"))) - run_cmd("mv {} {}".format(os.path.join(dirname, "tools"), os.path.join(dirname, "paddlerec"))) - run_cmd("mv {} {}".format(os.path.join(dirname, "*.py"), os.path.join(dirname, "paddlerec"))) + run_cmd("mv {} {}".format( + os.path.join(dirname, "core"), os.path.join(dirname, "paddlerec"))) + run_cmd("mv {} {}".format( + os.path.join(dirname, "doc"), os.path.join(dirname, "paddlerec"))) + run_cmd("mv {} {}".format( + os.path.join(dirname, "models"), os.path.join(dirname, "paddlerec"))) + run_cmd("mv {} {}".format( + os.path.join(dirname, "tests"), os.path.join(dirname, "paddlerec"))) + run_cmd("mv {} {}".format( + os.path.join(dirname, "tools"), os.path.join(dirname, "paddlerec"))) + run_cmd("mv {} {}".format( + os.path.join(dirname, "*.py"), os.path.join(dirname, "paddlerec"))) packages = find_packages(dirname, include=('paddlerec.*')) package_dir = {'': dirname} package_data = {} - models_copy = ['data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy', 'tree/*.txt'] + models_copy = [ + 'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy', + 'tree/*.txt' + ] engine_copy = ['*/*.sh'] for package in packages: if package.startswith("paddlerec.models."): @@ -80,8 +84,7 @@ def build(dirname): package_data=package_data, python_requires=">=2.7", install_requires=requires, - zip_safe=False - ) + zip_safe=False) dirname = tempfile.mkdtemp() diff --git a/tests/__init__.py b/tests/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tools/__init__.py b/tools/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100644
--- a/tools/__init__.py
+++ b/tools/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tools/build_script.sh b/tools/build_script.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6fa779fac7b7e99f203d64fe69d339469f19d3bf
--- /dev/null
+++ b/tools/build_script.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#=================================================
+#   Utils
+#=================================================
+
+set -ex
+
+function init() {
+    RED='\033[0;31m'
+    BLUE='\033[0;34m'
+    BOLD='\033[1m'
+    NONE='\033[0m'
+
+    ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
+}
+
+function check_style() {
+    set -e
+
+    export PATH=/usr/bin:$PATH
+    pre-commit install
+
+    if ! pre-commit run -a; then
+        git diff
+        exit 1
+    fi
+
+    exit 0
+}
+
+function main() {
+    local CMD=$1
+    init
+    case $CMD in
+        check_style)
+            check_style
+            ;;
+        *)
+            echo "build failed"
+            exit 1
+            ;;
+    esac
+    echo "check_style finished as expected"
+}
+
+main "$@"
diff --git a/tools/codestyle/copyright.hook b/tools/codestyle/copyright.hook
new file mode 100644
index 0000000000000000000000000000000000000000..23aaf38f6f9b97220a55b29c7d0e800fb1e86105
--- /dev/null
+++ b/tools/codestyle/copyright.hook
@@ -0,0 +1,121 @@
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import io, re
+import sys, os
+import subprocess
+import platform
+
+COPYRIGHT = '''
+Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+LANG_COMMENT_MARK = None
+
+NEW_LINE_MARK = None
+
+COPYRIGHT_HEADER = None
+
+if platform.system() == "Windows":
+    NEW_LINE_MARK = "\r\n"
+else:
+    NEW_LINE_MARK = '\n'
+    COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
+    p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
+    process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
+    date, err = process.communicate()
+    date = date.decode("utf-8").rstrip("\n")
+    COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
+
+
+def generate_copyright(template, lang='C'):
+    if lang == 'Python':
+        LANG_COMMENT_MARK = '#'
+    else:
+        LANG_COMMENT_MARK = "//"
+
+    lines = template.split(NEW_LINE_MARK)
+    BLANK = " "
+    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
+    for lino, line in enumerate(lines):
+        if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
+        if len(line) == 0:
+            BLANK = ""
+        else:
+            BLANK = " "
+        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
+
+    return ans + "\n"
+
+
+def lang_type(filename):
+    if filename.endswith(".py"):
+        return "Python"
+    elif filename.endswith(".h"):
+        return "C"
+    elif filename.endswith(".c"):
+        return "C"
+    elif filename.endswith(".hpp"):
+        return "C"
+    elif filename.endswith(".cc"):
+        return "C"
+    elif filename.endswith(".cpp"):
+        return "C"
+    elif filename.endswith(".cu"):
+        return "C"
+    elif filename.endswith(".cuh"):
+        return "C"
+    elif filename.endswith(".go"):
+        return "C"
+    elif filename.endswith(".proto"):
+        return "C"
+    else:
+        print("Unsupported filetype %s" % filename)
+        exit(0)
+
+
+PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
+
+
+def main(argv=None):
+    parser = argparse.ArgumentParser(
+        description='Checker for copyright declaration.')
+    parser.add_argument('filenames', nargs='*', help='Filenames to check')
+    args = parser.parse_args(argv)
+
+    retv = 0
+    for filename in args.filenames:
+        fd = io.open(filename, encoding="utf-8")
+        first_line = fd.readline()
+        second_line = fd.readline()
+        if "COPYRIGHT (C)" in first_line.upper(): continue
+        if first_line.startswith("#!") or PYTHON_ENCODE.match(
+                second_line) is not None or PYTHON_ENCODE.match(
+                    first_line) is not None:
+            continue
+        original_contents = io.open(filename, encoding="utf-8").read()
+        new_contents = generate_copyright(
+            COPYRIGHT, lang_type(filename)) + original_contents
+        print('Auto Insert Copyright Header {}'.format(filename))
+        retv = 1
+        with io.open(filename, 'w') as output_file:
+            output_file.write(new_contents)
+
+    return retv
+
+
+if __name__ == '__main__':
+    exit(main())
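The hook above prepends a comment-prefixed copy of `COPYRIGHT` to any checked file that lacks a header, patching in the current year. A stripped-down sketch of the core transformation (`prepend_copyright` is a hypothetical helper name; the real hook also skips shebang/coding lines and rewrites the file in place):

```python
def prepend_copyright(source, copyright_text, mark="#"):
    """Return source text with copyright_text prefixed as comments.

    mark is '#' for Python files and '//' for C-family files, matching
    lang_type() in the hook above.
    """
    commented = []
    for line in copyright_text.strip().splitlines():
        # Bare mark on blank lines, "mark + space + text" otherwise.
        commented.append(mark + " " + line if line else mark)
    return "\n".join(commented) + "\n\n" + source

# e.g. prepend_copyright("import os\n", "Copyright (c) 2020 ...")
# ->   "# Copyright (c) 2020 ...\n\nimport os\n"
```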
diff --git a/tools/codestyle/pylint_pre_commit.hook b/tools/codestyle/pylint_pre_commit.hook
new file mode 100644
index 0000000000000000000000000000000000000000..150a3f5666bd39d30b7e6518e58a14fb5fe2f14b
--- /dev/null
+++ b/tools/codestyle/pylint_pre_commit.hook
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+TOTAL_ERRORS=0
+
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+export PYTHONPATH=$DIR:$PYTHONPATH
+
+# The trick to remove deleted files: https://stackoverflow.com/a/2413151
+for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do
+    pylint --disable=all --load-plugins=docstring_checker \
+        --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
+    TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
+done
+
+exit $TOTAL_ERRORS
+#For now, just warning:
+#exit 0
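The hook lints every changed, non-deleted file and sums pylint's per-file exit codes, so the commit fails whenever an enabled docstring check fires. The same flow expressed in Python, for readers who find the shell loop terse (a sketch; only one of the enabled messages is spelled out):

```python
import subprocess

def changed_files():
    # Non-deleted paths from `git diff --name-status`, as in the hook.
    out = subprocess.check_output(["git", "diff", "--name-status"])
    paths = []
    for line in out.decode("utf-8").splitlines():
        parts = line.split()
        if len(parts) >= 2 and parts[0] != "D":
            paths.append(parts[-1])
    return paths

def total_pylint_errors(paths):
    # Sum the exit codes, mirroring TOTAL_ERRORS in the shell version.
    total = 0
    for path in paths:
        total += subprocess.call([
            "pylint", "--disable=all",
            "--load-plugins=docstring_checker",
            "--enable=doc-string-missing", path
        ])
    return total

if __name__ == "__main__":
    raise SystemExit(total_pylint_errors(changed_files()))
```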