diff --git a/core/engine/cluster/cloud/__init__.py b/core/engine/cluster/cloud/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100644
--- a/core/engine/cluster/cloud/__init__.py
+++ b/core/engine/cluster/cloud/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/core/modules/coding/layers.py b/core/modules/coding/layers.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755
--- a/core/modules/coding/layers.py
+++ b/core/modules/coding/layers.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/core/trainers/__init__.py b/core/trainers/__init__.py
index cd9c9db5e6b93fd6171bca0a5b0f97f69306aedc..f14704cad8f3859746f95353ba68753f857ff78d 100755
--- a/core/trainers/__init__.py
+++ b/core/trainers/__init__.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 trainer implement.
 
@@ -22,5 +21,3 @@ Trainer
  ↘ (for online learning training) OnlineLearningTrainer
 """
-
-
diff --git a/core/trainers/ctr_coding_trainer.py b/core/trainers/ctr_coding_trainer.py
index 3bfec28cfd149bdbe47fdc202107c7ed7af58fdd..7dc51f340147aec933ce8bffd0be080b7be984c6 100755
--- a/core/trainers/ctr_coding_trainer.py
+++ b/core/trainers/ctr_coding_trainer.py
@@ -59,8 +59,10 @@ class CtrTrainer(Trainer):
         reader_class = envs.get_global_env("class", None, namespace)
         abs_dir = os.path.dirname(os.path.abspath(__file__))
         reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
-        pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN", self._config_yaml)
-        train_data_path = envs.get_global_env("train_data_path", None, namespace)
+        pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN",
+                                               self._config_yaml)
+        train_data_path = envs.get_global_env("train_data_path", None,
+                                              namespace)
 
         dataset = fluid.DatasetFactory().create_dataset()
         dataset.set_use_var(inputs)
@@ -87,7 +89,8 @@ class CtrTrainer(Trainer):
         self.model.train_net()
         optimizer = self.model.optimizer()
 
-        optimizer = fleet.distributed_optimizer(optimizer, strategy={"use_cvm": False})
+        optimizer = fleet.distributed_optimizer(
+            optimizer, strategy={"use_cvm": False})
         optimizer.minimize(self.model.get_avg_cost())
 
         if fleet.is_server():
@@ -118,16 +121,18 @@ class CtrTrainer(Trainer):
         gs = shuf * 0
         fleet._role_maker._node_type_comm.Allreduce(shuf, gs)
 
-        print("trainer id: {}, trainers: {}, gs: {}".format(fleet.worker_index(), fleet.worker_num(), gs))
+        print("trainer id: {}, trainers: {}, gs: {}".format(fleet.worker_index(
+        ), fleet.worker_num(), gs))
 
         epochs = envs.get_global_env("train.epochs")
 
         for i in range(epochs):
-            self._exe.train_from_dataset(program=fluid.default_main_program(),
-                                         dataset=dataset,
-                                         fetch_list=self.fetch_vars,
-                                         fetch_info=self.fetch_alias,
-                                         print_period=self.fetch_period)
+            self._exe.train_from_dataset(
+                program=fluid.default_main_program(),
+                dataset=dataset,
+                fetch_list=self.fetch_vars,
+                fetch_info=self.fetch_alias,
+                print_period=self.fetch_period)
 
         context['status'] = 'terminal_pass'
         fleet.stop_worker()
diff --git a/core/trainers/ctr_modul_trainer.py b/core/trainers/ctr_modul_trainer.py
index 7b3bd7874359059c3b03289cc10da7d7756ac35b..af8f3f3a2c3fb59fc6db60e3e4cd050ca3d8ad8a 100755
--- a/core/trainers/ctr_modul_trainer.py
+++ b/core/trainers/ctr_modul_trainer.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import datetime
 import json
 import sys
@@ -23,7 +22,6 @@ import paddle.fluid as fluid
 from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
 from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
 
-
 from paddlerec.core.utils import fs as fs
 from paddlerec.core.utils import util as util
 from paddlerec.core.metrics.auc_metrics import AUCMetric
@@ -80,20 +78,31 @@ class CtrTrainer(Trainer):
         """R
         """
         Trainer.__init__(self, config)
-        config['output_path'] = util.get_absolute_path(
-            config['output_path'], config['io']['afs'])
+        config['output_path'] = util.get_absolute_path(config['output_path'],
+                                                       config['io']['afs'])
 
         self.global_config = config
         self._metrics = {}
         self._path_generator = util.PathGenerator({
-            'templates': [
-                {'name': 'xbox_base_done', 'template': config['output_path'] + '/xbox_base_done.txt'},
-                {'name': 'xbox_delta_done', 'template': config['output_path'] + '/xbox_patch_done.txt'},
-                {'name': 'xbox_base', 'template': config['output_path'] + '/xbox/{day}/base/'},
-                {'name': 'xbox_delta', 'template': config['output_path'] + '/xbox/{day}/delta-{pass_id}/'},
-                {'name': 'batch_model', 'template': config['output_path'] + '/batch_model/{day}/{pass_id}/'}
-            ]
+            'templates': [{
+                'name': 'xbox_base_done',
+                'template': config['output_path'] + '/xbox_base_done.txt'
+            }, {
+                'name': 'xbox_delta_done',
+                'template': config['output_path'] + '/xbox_patch_done.txt'
+            }, {
+                'name': 'xbox_base',
+                'template': config['output_path'] + '/xbox/{day}/base/'
+            }, {
+                'name': 'xbox_delta',
+                'template':
+                config['output_path'] + '/xbox/{day}/delta-{pass_id}/'
+            }, {
+                'name': 'batch_model',
+                'template':
+                config['output_path'] + '/batch_model/{day}/{pass_id}/'
+            }]
         })
         if 'path_generator' in config:
             self._path_generator.add_path_template(config['path_generator'])
@@ -111,9 +120,11 @@ class CtrTrainer(Trainer):
         if self.global_config.get('process_mode', 'mpi') == 'brilliant_cpu':
             afs_config = self.global_config['io']['afs']
             role_maker = GeneralRoleMaker(
-                hdfs_name=afs_config['fs_name'], hdfs_ugi=afs_config['fs_ugi'],
+                hdfs_name=afs_config['fs_name'],
+                hdfs_ugi=afs_config['fs_ugi'],
                 path=self.global_config['output_path'] + "/gloo",
-                init_timeout_seconds=1200, run_timeout_seconds=1200)
+                init_timeout_seconds=1200,
+                run_timeout_seconds=1200)
             fleet.init(role_maker)
         data_var_list = []
         data_var_name_dict = {}
@@ -125,7 +136,8 @@ class CtrTrainer(Trainer):
             scope = fluid.Scope()
             self._exector_context[executor['name']] = {}
             self._exector_context[executor['name']]['scope'] = scope
-            self._exector_context[executor['name']]['model'] = model_basic.create(executor)
+            self._exector_context[executor['name']][
+                'model'] = model_basic.create(executor)
             model = self._exector_context[executor['name']]['model']
             self._metrics.update(model.get_metrics())
             runnnable_scope.append(scope)
@@ -146,9 +158,12 @@ class CtrTrainer(Trainer):
             model = self._exector_context[executor['name']]['model']
             program = model._build_param['model']['train_program']
             if not executor['is_update_sparse']:
-                program._fleet_opt["program_configs"][str(id(model.get_avg_cost().block.program))]["push_sparse"] = []
+                program._fleet_opt["program_configs"][str(
+                    id(model.get_avg_cost().block.program))][
+                        "push_sparse"] = []
             if 'train_thread_num' not in executor:
-                executor['train_thread_num'] = self.global_config['train_thread_num']
+                executor['train_thread_num'] = self.global_config[
+                    'train_thread_num']
             with fluid.scope_guard(scope):
                 self._exe.run(model._build_param['model']['startup_program'])
                 model.dump_model_program('./')
@@ -162,7 +177,8 @@ class CtrTrainer(Trainer):
             dataset_item['data_vars'] = data_var_list
             dataset_item.update(self.global_config['io']['afs'])
             dataset_item["batch_size"] = self.global_config['batch_size']
-            self._dataset[dataset_item['name']] = dataset.FluidTimeSplitDataset(dataset_item)
+            self._dataset[dataset_item[
+                'name']] = dataset.FluidTimeSplitDataset(dataset_item)
         # if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= last_day and config.reqi_dnn_plugin_pass >= last_pass:
         #    util.reqi_changeslot(config.hdfs_dnn_plugin_path, join_save_params, common_save_params, update_save_params, scope2, scope3)
         fleet.init_worker()
@@ -190,23 +206,30 @@ class CtrTrainer(Trainer):
             metric_param = {'label': metric, 'metric_dict': metrics[metric]}
             metric_calculator.calculate(scope, metric_param)
             metric_result = metric_calculator.get_result_to_string()
-            self.print_log(metric_result, {'master': True, 'stdout': stdout_str})
+            self.print_log(metric_result,
+                           {'master': True,
+                            'stdout': stdout_str})
             monitor_data += metric_result
             metric_calculator.clear(scope, metric_param)
 
     def save_model(self, day, pass_index, base_key):
         """R
         """
-        cost_printer = util.CostPrinter(util.print_cost,
-                                        {'master': True, 'log_format': 'save model cost %s sec'})
-        model_path = self._path_generator.generate_path('batch_model', {'day': day, 'pass_id': pass_index})
+        cost_printer = util.CostPrinter(util.print_cost, {
+            'master': True,
+            'log_format': 'save model cost %s sec'
+        })
+        model_path = self._path_generator.generate_path(
+            'batch_model', {'day': day,
+                            'pass_id': pass_index})
         save_mode = 0  # just save all
         if pass_index < 1:  # batch_model
             save_mode = 3  # unseen_day++, save all
         util.rank0_print("going to save_model %s" % model_path)
         fleet.save_persistables(None, model_path, mode=save_mode)
         if fleet._role_maker.is_first_worker():
-            self._train_pass.save_train_progress(day, pass_index, base_key, model_path, is_checkpoint=True)
+            self._train_pass.save_train_progress(
+                day, pass_index, base_key, model_path, is_checkpoint=True)
         cost_printer.done()
         return model_path
@@ -225,46 +248,58 @@ class CtrTrainer(Trainer):
         if pass_index < 1:
             save_mode = 2
             xbox_patch_id = xbox_base_key
-            model_path = self._path_generator.generate_path('xbox_base', {'day': day})
-            xbox_model_donefile = self._path_generator.generate_path('xbox_base_done', {'day': day})
+            model_path = self._path_generator.generate_path('xbox_base',
+                                                            {'day': day})
+            xbox_model_donefile = self._path_generator.generate_path(
+                'xbox_base_done', {'day': day})
         else:
             save_mode = 1
-            model_path = self._path_generator.generate_path('xbox_delta', {'day': day, 'pass_id': pass_index})
-            xbox_model_donefile = self._path_generator.generate_path('xbox_delta_done', {'day': day})
-        total_save_num = fleet.save_persistables(None, model_path, mode=save_mode)
+            model_path = self._path_generator.generate_path(
+                'xbox_delta', {'day': day,
+                               'pass_id': pass_index})
+            xbox_model_donefile = self._path_generator.generate_path(
+                'xbox_delta_done', {'day': day})
+        total_save_num = fleet.save_persistables(
+            None, model_path, mode=save_mode)
         cost_printer.done()
 
-        cost_printer = util.CostPrinter(util.print_cost, {'master': True,
-                                                          'log_format': 'save cache model cost %s sec',
-                                                          'stdout': stdout_str})
+        cost_printer = util.CostPrinter(util.print_cost, {
+            'master': True,
+            'log_format': 'save cache model cost %s sec',
+            'stdout': stdout_str
+        })
         model_file_handler = fs.FileHandler(self.global_config['io']['afs'])
         if self.global_config['save_cache_model']:
-            cache_save_num = fleet.save_cache_model(None, model_path, mode=save_mode)
+            cache_save_num = fleet.save_cache_model(
+                None, model_path, mode=save_mode)
             model_file_handler.write(
                 "file_prefix:part\npart_num:16\nkey_num:%d\n" % cache_save_num,
                 model_path + '/000_cache/sparse_cache.meta', 'w')
             cost_printer.done()
-            util.rank0_print("save xbox cache model done, key_num=%s" % cache_save_num)
+            util.rank0_print("save xbox cache model done, key_num=%s" %
+                             cache_save_num)
 
-        save_env_param = {
-            'executor': self._exe,
-            'save_combine': True
-        }
-        cost_printer = util.CostPrinter(util.print_cost, {'master': True,
-                                                          'log_format': 'save dense model cost %s sec',
-                                                          'stdout': stdout_str})
+        save_env_param = {'executor': self._exe, 'save_combine': True}
+        cost_printer = util.CostPrinter(util.print_cost, {
+            'master': True,
+            'log_format': 'save dense model cost %s sec',
+            'stdout': stdout_str
+        })
         if fleet._role_maker.is_first_worker():
             for executor in self.global_config['executor']:
                 if 'layer_for_inference' not in executor:
                     continue
                 executor_name = executor['name']
                 model = self._exector_context[executor_name]['model']
-                save_env_param['inference_list'] = executor['layer_for_inference']
-                save_env_param['scope'] = self._exector_context[executor_name]['scope']
+                save_env_param['inference_list'] = executor[
+                    'layer_for_inference']
+                save_env_param['scope'] = self._exector_context[executor_name][
+                    'scope']
                 model.dump_inference_param(save_env_param)
                 for dnn_layer in executor['layer_for_inference']:
                     model_file_handler.cp(dnn_layer['save_file_name'],
-                                          model_path + '/dnn_plugin/' + dnn_layer['save_file_name'])
+                                          model_path + '/dnn_plugin/' +
+                                          dnn_layer['save_file_name'])
         fleet._role_maker._barrier_worker()
         cost_printer.done()
@@ -282,9 +317,15 @@ class CtrTrainer(Trainer):
             "job_name": util.get_env_value("JOB_NAME")
         }
         if fleet._role_maker.is_first_worker():
-            model_file_handler.write(json.dumps(xbox_done_info) + "\n", xbox_model_donefile, 'a')
+            model_file_handler.write(
+                json.dumps(xbox_done_info) + "\n", xbox_model_donefile, 'a')
             if pass_index > 0:
-                self._train_pass.save_train_progress(day, pass_index, xbox_base_key, model_path, is_checkpoint=False)
+                self._train_pass.save_train_progress(
+                    day,
+                    pass_index,
+                    xbox_base_key,
+                    model_path,
+                    is_checkpoint=False)
         fleet._role_maker._barrier_worker()
         return stdout_str
@@ -301,21 +342,28 @@ class CtrTrainer(Trainer):
         util.rank0_print("Begin " + executor_name + " pass")
         begin = time.time()
         program = model._build_param['model']['train_program']
-        self._exe.train_from_dataset(program, dataset, scope,
-                                     thread=executor_config['train_thread_num'], debug=self.global_config['debug'])
+        self._exe.train_from_dataset(
+            program,
+            dataset,
+            scope,
+            thread=executor_config['train_thread_num'],
+            debug=self.global_config['debug'])
         end = time.time()
         local_cost = (end - begin) / 60.0
         avg_cost = worker_numric_avg(local_cost)
         min_cost = worker_numric_min(local_cost)
         max_cost = worker_numric_max(local_cost)
-        util.rank0_print("avg train time %s mins, min %s mins, max %s mins" % (avg_cost, min_cost, max_cost))
+        util.rank0_print("avg train time %s mins, min %s mins, max %s mins"
+                         % (avg_cost, min_cost, max_cost))
         self._exector_context[executor_name]['cost'] = max_cost
 
         monitor_data = ""
         self.print_global_metrics(scope, model, monitor_data, stdout_str)
         util.rank0_print("End " + executor_name + " pass")
-        if self._train_pass.need_dump_inference(pass_id) and executor_config['dump_inference_model']:
-            stdout_str += self.save_xbox_model(day, pass_id, xbox_base_key, monitor_data)
+        if self._train_pass.need_dump_inference(
+                pass_id) and executor_config['dump_inference_model']:
+            stdout_str += self.save_xbox_model(day, pass_id, xbox_base_key,
+                                               monitor_data)
         fleet._role_maker._barrier_worker()
 
     def startup(self, context):
@@ -328,10 +376,14 @@ class CtrTrainer(Trainer):
         stdout_str = ""
         self._train_pass = util.TimeTrainPass(self.global_config)
         if not self.global_config['cold_start']:
-            cost_printer = util.CostPrinter(util.print_cost,
-                                            {'master': True, 'log_format': 'load model cost %s sec',
-                                             'stdout': stdout_str})
-            self.print_log("going to load model %s" % self._train_pass._checkpoint_model_path, {'master': True})
+            cost_printer = util.CostPrinter(util.print_cost, {
+                'master': True,
+                'log_format': 'load model cost %s sec',
+                'stdout': stdout_str
+            })
+            self.print_log("going to load model %s" %
+                           self._train_pass._checkpoint_model_path,
+                           {'master': True})
             # if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= self._train_pass.date()
             # and config.reqi_dnn_plugin_pass >= self._pass_id:
             #     fleet.load_one_table(0, self._train_pass._checkpoint_model_path)
@@ -340,9 +392,12 @@ class CtrTrainer(Trainer):
             cost_printer.done()
         if self.global_config['save_first_base']:
             self.print_log("save_first_base=True", {'master': True})
-            self.print_log("going to save xbox base model", {'master': True, 'stdout': stdout_str})
+            self.print_log("going to save xbox base model",
+                           {'master': True,
+                            'stdout': stdout_str})
             self._train_pass._base_key = int(time.time())
-            stdout_str += self.save_xbox_model(self._train_pass.date(), 0, self._train_pass._base_key, "")
+            stdout_str += self.save_xbox_model(self._train_pass.date(), 0,
+                                               self._train_pass._base_key, "")
         context['status'] = 'begin_day'
 
     def begin_day(self, context):
@@ -353,7 +408,9 @@ class CtrTrainer(Trainer):
             context['is_exit'] = True
         day = self._train_pass.date()
         pass_id = self._train_pass._pass_id
-        self.print_log("======== BEGIN DAY:%s ========" % day, {'master': True, 'stdout': stdout_str})
+        self.print_log("======== BEGIN DAY:%s ========" % day,
+                       {'master': True,
+                        'stdout': stdout_str})
         if pass_id == self._train_pass.max_pass_num_day():
             context['status'] = 'end_day'
         else:
@@ -368,8 +425,10 @@ class CtrTrainer(Trainer):
             context['status'] = 'begin_day'
 
         util.rank0_print("shrink table")
-        cost_printer = util.CostPrinter(util.print_cost,
-                                        {'master': True, 'log_format': 'shrink table done, cost %s sec'})
+        cost_printer = util.CostPrinter(util.print_cost, {
+            'master': True,
+            'log_format': 'shrink table done, cost %s sec'
+        })
         fleet.shrink_sparse_table()
         for executor in self._exector_context:
             self._exector_context[executor]['model'].shrink({
@@ -394,7 +453,9 @@ class CtrTrainer(Trainer):
         pass_id = self._train_pass._pass_id
         base_key = self._train_pass._base_key
         pass_time = self._train_pass._current_train_time.strftime("%Y%m%d%H%M")
-        self.print_log(" ==== begin delta:%s ========" % pass_id, {'master': True, 'stdout': stdout_str})
+        self.print_log(" ==== begin delta:%s ========" % pass_id,
+                       {'master': True,
+                        'stdout': stdout_str})
         train_begin_time = time.time()
 
         cost_printer = util.CostPrinter(util.print_cost, \
@@ -403,35 +464,46 @@
         current_dataset = {}
         for name in self._dataset:
             current_dataset[name] = self._dataset[name].load_dataset({
-                'node_num': fleet.worker_num(), 'node_idx': fleet.worker_index(),
-                'begin_time': pass_time, 'time_window_min': self._train_pass._interval_per_pass
+                'node_num': fleet.worker_num(),
+                'node_idx': fleet.worker_index(),
+                'begin_time': pass_time,
+                'time_window_min': self._train_pass._interval_per_pass
             })
         fleet._role_maker._barrier_worker()
         cost_printer.done()
 
         util.rank0_print("going to global shuffle")
         cost_printer = util.CostPrinter(util.print_cost, {
-            'master': True, 'stdout': stdout_str,
-            'log_format': 'global shuffle done, cost %s sec'})
+            'master': True,
+            'stdout': stdout_str,
+            'log_format': 'global shuffle done, cost %s sec'
+        })
         for name in current_dataset:
-            current_dataset[name].global_shuffle(fleet, self.global_config['dataset']['shuffle_thread'])
+            current_dataset[name].global_shuffle(
+                fleet, self.global_config['dataset']['shuffle_thread'])
         cost_printer.done()
         # str(dataset.get_shuffle_data_size(fleet))
         fleet._role_maker._barrier_worker()
 
         if self.global_config['prefetch_data']:
-            next_pass_time = (self._train_pass._current_train_time +
-                              datetime.timedelta(minutes=self._train_pass._interval_per_pass)).strftime("%Y%m%d%H%M")
+            next_pass_time = (
+                self._train_pass._current_train_time + datetime.timedelta(
+                    minutes=self._train_pass._interval_per_pass)
+            ).strftime("%Y%m%d%H%M")
             for name in self._dataset:
                 self._dataset[name].preload_dataset({
-                    'node_num': fleet.worker_num(), 'node_idx': fleet.worker_index(),
-                    'begin_time': next_pass_time, 'time_window_min': self._train_pass._interval_per_pass
+                    'node_num': fleet.worker_num(),
+                    'node_idx': fleet.worker_index(),
+                    'begin_time': next_pass_time,
+                    'time_window_min': self._train_pass._interval_per_pass
                 })
 
         fleet._role_maker._barrier_worker()
         pure_train_begin = time.time()
         for executor in self.global_config['executor']:
-            self.run_executor(executor, current_dataset[executor['dataset_name']], stdout_str)
+            self.run_executor(executor,
+                              current_dataset[executor['dataset_name']],
+                              stdout_str)
         cost_printer = util.CostPrinter(util.print_cost, \
             {'master': True, 'log_format': 'release_memory cost %s sec'})
         for name in current_dataset:
@@ -444,9 +516,11 @@
         train_end_time = time.time()
         train_cost = train_end_time - train_begin_time
         other_cost = train_cost - pure_train_cost
-        log_str = "finished train day %s pass %s time cost:%s sec job time cost:" % (day, pass_id, train_cost)
+        log_str = "finished train day %s pass %s time cost:%s sec job time cost:" % (
+            day, pass_id, train_cost)
         for executor in self._exector_context:
-            log_str += '[' + executor + ':' + str(self._exector_context[executor]['cost']) + ']'
+            log_str += '[' + executor + ':' + str(self._exector_context[
+                executor]['cost']) + ']'
         log_str += '[other_cost:' + str(other_cost) + ']'
         util.rank0_print(log_str)
         stdout_str += util.now_time_str() + log_str
diff --git a/doc/benchmark.md b/doc/benchmark.md
index b16e26c71888d590f00f13782449acf840c4b6ee..2aaea25d6941043d24fcee31c7117a3b34c4f525 100644
--- a/doc/benchmark.md
+++ b/doc/benchmark.md
@@ -1,2 +1,2 @@
 # PaddleRec Benchmark
-> Placeholder
\ No newline at end of file
+> Placeholder
diff --git a/doc/contribute.md b/doc/contribute.md
index a9bd1910021e78573f1d9fd99f66404b77927737..26770d8ac0b64e9835f7398768d0f81de9383132 100644
--- a/doc/contribute.md
+++ b/doc/contribute.md
@@ -1,2 +1,2 @@
 # Contributing to PaddleRec
-> Placeholder
\ No newline at end of file
+> Placeholder
diff --git a/doc/design.md b/doc/design.md
index 2975d77f14e461547921f74b9ced5cf73703e2e7..a442bd16a25301178538f482cd537a4ca23bc395 100644
--- a/doc/design.md
+++ b/doc/design.md
@@ -279,4 +279,4 @@ class Metric(object):
         pass
 ```
 
-Computing and outputting a global metric requires inheriting from the class above and implementing each of the four member functions. For a concrete example, see [auc_metric.py](../core/metrics/auc_metrics.py)
\ No newline at end of file
+Computing and outputting a global metric requires inheriting from the class above and implementing each of the four member functions. For a concrete example, see [auc_metric.py](../core/metrics/auc_metrics.py)
diff --git a/doc/distributed_train.md b/doc/distributed_train.md
index 425f141ab76e173a9484dff90cd5cfb55acaf853..339c5a83ffd26f9416a67a02390a11ba4c87c29d 100644
--- a/doc/distributed_train.md
+++ b/doc/distributed_train.md
@@ -7,5 +7,3 @@
 ### Running Distributed Training on a K8S Cluster
 > Placeholder
 
-
-
diff --git a/doc/faq.md b/doc/faq.md
index f7ca7cc4a7c366a2a828496eae3f12d1dea17b7e..60790140877b6b11add29552e02c0a435da75f87 100644
--- a/doc/faq.md
+++ b/doc/faq.md
@@ -1,2 +1,2 @@
 # Frequently Asked Questions (FAQ)
-> Placeholder
\ No newline at end of file
+> Placeholder
diff --git a/doc/local_train.md b/doc/local_train.md
index 4a43fa5520ca745badc4d2a49710763eac6e7a0a..e65255ebf7e14933f52f9977b2ecec48dabbb76e 100644
--- a/doc/local_train.md
+++ b/doc/local_train.md
@@ -1,2 +1,2 @@
 # PaddleRec Single-Machine Training
-> Placeholder
\ No newline at end of file
+> Placeholder
diff --git a/doc/model_list.md b/doc/model_list.md
index 9e68d9f6d2f8e9361cc13b9e76f28426062943bc..b46687a60475fbd309f01050194510b21b060f17 100644
--- a/doc/model_list.md
+++ b/doc/model_list.md
@@ -12,4 +12,3 @@
 | Multi-task | [ESMM]() | ✓ | x | ✓ | x | ✓ | ✓ |
 | Matching | [DSSM]() | ✓ | x | ✓ | x | ✓ | ✓ |
 | Matching | [Multiview-Simnet]() | ✓ | x | ✓ | x | ✓ | ✓ |
-
diff --git a/doc/optimization_model.md b/doc/optimization_model.md
index b516f8958053b1b2bd6982b71c699e8baf69f8d9..e63f45b62b50db55f1c6c0d48c7ca23b016b74d3 100644
--- a/doc/optimization_model.md
+++ b/doc/optimization_model.md
@@ -1,2 +1,2 @@
 # PaddleRec Model Tuning
-> Placeholder
\ No newline at end of file
+> Placeholder
diff --git a/doc/predict.md b/doc/predict.md
index a33eda43ec6aed8ebe628f0540327b707055970d..07160e1f0e7563276c33e514d006dd3747492f90 100644
--- a/doc/predict.md
+++ b/doc/predict.md
@@ -1 +1 @@
-# PaddleRec Offline Inference
\ No newline at end of file
+# PaddleRec Offline Inference
diff --git a/doc/ps_background.md b/doc/ps_background.md
index 984e1b00c96242843cceaf68cf15bb8deb52c391..e5f2e320940763986351fefd21a5e1f1363b6104 100644
--- a/doc/ps_background.md
+++ b/doc/ps_background.md
@@ -5,4 +5,3 @@
 ## [Parameter Server Training](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839)
 
 
-
diff --git a/models/contentunderstanding/__init__.py b/models/contentunderstanding/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755
--- a/models/contentunderstanding/__init__.py
+++ b/models/contentunderstanding/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/models/contentunderstanding/classification/config.yaml b/models/contentunderstanding/classification/config.yaml
index d1748137f0c4d994b3a566debf43dbdc2c3d66dc..ef55cd18e8fd45829acd2f479c661f27decfda71 100644
--- a/models/contentunderstanding/classification/config.yaml
+++ b/models/contentunderstanding/classification/config.yaml
@@ -37,4 +37,3 @@ train:
     dirname: "inference"
     epoch_interval: 100
     save_last: True
-
diff --git a/models/contentunderstanding/classification/model.py b/models/contentunderstanding/classification/model.py
index 9e853aa01d4a0b6bd5c7a20d8e13164bd9905ad0..23c51d44d7d839d9db30f8129c3e42449a6a80d4 100644
--- a/models/contentunderstanding/classification/model.py
+++ b/models/contentunderstanding/classification/model.py
@@ -31,7 +31,8 @@ class Model(ModelBase):
 
     def train_net(self):
         """ network definition """
-        data = fluid.data(name="input", shape=[None, self.max_len], dtype='int64')
+        data = fluid.data(
+            name="input", shape=[None, self.max_len], dtype='int64')
         label = fluid.data(name="label", shape=[None, 1], dtype='int64')
         seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
@@ -51,7 +52,9 @@ class Model(ModelBase):
         # full connect layer
         fc_1 = fluid.layers.fc(input=[conv], size=self.hid_dim)
         # softmax layer
-        prediction = fluid.layers.fc(input=[fc_1], size=self.class_dim, act="softmax")
+        prediction = fluid.layers.fc(input=[fc_1],
+                                     size=self.class_dim,
+                                     act="softmax")
         cost = fluid.layers.cross_entropy(input=prediction, label=label)
         avg_cost = fluid.layers.mean(x=cost)
         acc = fluid.layers.accuracy(input=prediction, label=label)
diff --git a/models/contentunderstanding/classification/reader.py b/models/contentunderstanding/classification/reader.py
index 136a5668856c0fb558a016a3bc3a0b8a56651d3b..1c8e86cdb49f1cc89c9c4f413cbd7b117b55aa55 100644
--- a/models/contentunderstanding/classification/reader.py
+++ b/models/contentunderstanding/classification/reader.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import sys
 
 from paddlerec.core.reader import Reader
@@ -38,7 +37,8 @@ class TrainReader(Reader):
             data = [int(i) for i in data]
             label = [int(i) for i in label]
             seq_len = [int(i) for i in seq_len]
-            print >> sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)])
+            print >> sys.stderr, str(
+                [('data', data), ('label', label), ('seq_len', seq_len)])
             yield [('data', data), ('label', label), ('seq_len', seq_len)]
 
         return data_iter
diff --git a/models/contentunderstanding/readme.md b/models/contentunderstanding/readme.md
index 07ec96f2414881998617f048c1e82a3f0d9cda75..deefbd2eb02f08d7fac810eb40ae78ff1a173baf 100644
--- a/models/contentunderstanding/readme.md
+++ b/models/contentunderstanding/readme.md
@@ -87,4 +87,3 @@ python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
 | :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: |
 | ag news dataset | TagSpace | -- | -- | -- | -- |
 | -- | Classification | -- | -- | -- | -- |
-
diff --git a/models/contentunderstanding/tagspace/config.yaml b/models/contentunderstanding/tagspace/config.yaml
index 70333fcbf7edf4b6b5f54145e29cb122ed3ae9c6..19fbf277d66445c44287856512cb0b13777dc251 100644
--- a/models/contentunderstanding/tagspace/config.yaml
+++ b/models/contentunderstanding/tagspace/config.yaml
@@ -47,4 +47,3 @@ train:
     dirname: "inference"
     epoch_interval: 100
     save_last: True
-
diff --git a/models/match/__init__.py b/models/match/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755
--- a/models/match/__init__.py
+++ b/models/match/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/models/match/readme.md b/models/match/readme.md
index 6bccc109ff14582e816dee64b72b786a1e90f49e..d9f91b257d81ffde820a04cad49b56edbd903f6a 100755
--- a/models/match/readme.md
+++ b/models/match/readme.md
@@ -37,4 +37,3 @@ python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
 python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
 ```
 
-
diff --git a/models/multitask/__init__.py b/models/multitask/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755
--- a/models/multitask/__init__.py
+++ b/models/multitask/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/models/multitask/readme.md b/models/multitask/readme.md
index d234f42f146e18bf254e518db0e78acc1e1d3e10..10e0641060f74b67b4987d14a1c4aad27a25b103 100755
--- a/models/multitask/readme.md
+++ b/models/multitask/readme.md
@@ -56,4 +56,3 @@ python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm
 | Census-income Data | Share-Bottom | -- | 0.93120/0.99256 |
 | Census-income Data | MMoE | -- | 0.94465/0.99324 |
 | Ali-CCP | ESMM | -- | 0.97181/0.49967 |
-
diff --git a/models/rank/__init__.py b/models/rank/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755
--- a/models/rank/__init__.py
+++ b/models/rank/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/models/rank/dcn/data/download.py b/models/rank/dcn/data/download.py
index d9bcc6df296068cfd5cd9fc1c91165f11b580d04..4203a3868a577757930ae848736c34bb4da376c7 100755
--- a/models/rank/dcn/data/download.py
+++ b/models/rank/dcn/data/download.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import sys
 import io
diff --git a/models/rank/dcn/data/get_slot_data.py b/models/rank/dcn/data/get_slot_data.py
index 77b30296ab9ba99757039f053c6133fd175c2811..96d4448214d6a87092495326646a279657079f45 100755
--- a/models/rank/dcn/data/get_slot_data.py
+++ b/models/rank/dcn/data/get_slot_data.py
@@ -26,8 +26,8 @@ from collections import Counter
 import os
 import paddle.fluid.incubate.data_generator as dg
 
-class TrainReader(dg.MultiSlotDataGenerator):
 
+class TrainReader(dg.MultiSlotDataGenerator):
     def __init__(self, config):
         dg.MultiSlotDataGenerator.__init__(self)
@@ -83,11 +83,11 @@ class TrainReader(dg.MultiSlotDataGenerator):
                     if idx == 2 else math.log(1 + float(features[idx])))
             for idx in self.cat_idx_:
                 if features[idx] == '' or features[
-                        idx] not in self.cat_feat_idx_dict_list[idx - 14]:
+                    idx] not in self.cat_feat_idx_dict_list[idx - 14]:
                     label_feat_list[idx].append(0)
                 else:
                     label_feat_list[idx].append(self.cat_feat_idx_dict_list[
-                        idx - 14][features[idx]])
+                    idx - 14][features[idx]])
                 label_feat_list[0].append(int(features[0]))
         return label_feat_list
@@ -109,6 +109,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
 
         return data_iter
 
+
 reader = TrainReader("../config.yaml")
 reader.init()
 reader.run_from_stdin()
diff --git a/models/rank/dcn/data/preprocess.py b/models/rank/dcn/data/preprocess.py
index b356607729eedd73854a77449ffda3cc3bb8050f..9a89df10ef42dcfa09faad66f409b21439f340a8 100755
--- a/models/rank/dcn/data/preprocess.py
+++ b/models/rank/dcn/data/preprocess.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function, absolute_import, division
 
 import os
diff --git a/models/rank/deepfm/data/download_preprocess.py b/models/rank/deepfm/data/download_preprocess.py
index e8c94cc64728a5e3ae38a29bf419fc90b55df597..7a504b4f88e49d8b4f242d4d6b56f6f168464e5c 100755
--- a/models/rank/deepfm/data/download_preprocess.py
+++ b/models/rank/deepfm/data/download_preprocess.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import shutil
 import sys
diff --git a/models/rank/deepfm/data/get_slot_data.py b/models/rank/deepfm/data/get_slot_data.py
index 59dc33b0d7b6aa1b2087134c9952fc160ca6cd04..6177c990d8ef0c8a1cf922dd9d50c6419cb8c1b7 100755
--- a/models/rank/deepfm/data/get_slot_data.py
+++ b/models/rank/deepfm/data/get_slot_data.py
@@ -19,8 +19,9 @@ try:
     import cPickle as pickle
 except ImportError:
     import pickle
-class TrainReader(dg.MultiSlotDataGenerator):
 
+
+class TrainReader(dg.MultiSlotDataGenerator):
     def __init__(self, config):
         dg.MultiSlotDataGenerator.__init__(self)
@@ -44,7 +45,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
         self.categorical_range_ = range(14, 40)
         # load preprocessed feature dict
         self.feat_dict_name = "aid_data/feat_dict_10.pkl2"
-        self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb')) 
+        self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
@@ -77,15 +78,18 @@ class TrainReader(dg.MultiSlotDataGenerator):
         def data_iter():
             feat_idx, feat_value, label = self._process_line(line)
             s = ""
-            for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]:
+            for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
+                      ('label', label)]:
                 k = i[0]
                 v = i[1]
                 for j in v:
                     s += " " + k + ":" + str(j)
             print s.strip()
             yield None
+
         return data_iter
 
+
 reader = TrainReader("../config.yaml")
 reader.init()
 reader.run_from_stdin()
diff --git a/models/rank/deepfm/data/preprocess.py b/models/rank/deepfm/data/preprocess.py
index 1fa4a5feae17bde64463d2f05beb3d053284dcda..9da3bdc3d93bfcd0dd98fddc64c870d20feddb38 100755
--- a/models/rank/deepfm/data/preprocess.py
+++ b/models/rank/deepfm/data/preprocess.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import numpy
 from collections import Counter
diff --git a/models/rank/din/data/build_dataset.py b/models/rank/din/data/build_dataset.py
index 34c053ccdb2686c10875740f72f1e0abf3cb4f10..b0ed187800b2f9f44d4dd0d34df204759059ac06 100755
--- a/models/rank/din/data/build_dataset.py
+++ b/models/rank/din/data/build_dataset.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function
 import random
 import pickle
diff --git a/models/rank/din/data/convert_pd.py b/models/rank/din/data/convert_pd.py
index d7927c7ef1a9da28732cad9c44be24e72095983a..a66290e1561084a10756ab98c3d70b9a5ac5a6ed 100755
--- a/models/rank/din/data/convert_pd.py
+++ b/models/rank/din/data/convert_pd.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function
 import pickle
 import pandas as pd
diff --git a/models/rank/din/data/remap_id.py b/models/rank/din/data/remap_id.py
index b110dac54de8f8d201ede7248d6a2844ac350c90..ee6983d7f0769a58352f61a0a05bbd81c6ccbc13 100755
--- a/models/rank/din/data/remap_id.py
+++ b/models/rank/din/data/remap_id.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function
 import random
 import pickle
diff --git a/models/rank/dnn/data/get_slot_data.py b/models/rank/dnn/data/get_slot_data.py
index 30ad9884e5b3c4cd600e8273b9d061bfe1398c9e..f52447d06c297335685a704f688d71aa871328bc 100755
--- a/models/rank/dnn/data/get_slot_data.py
+++ b/models/rank/dnn/data/get_slot_data.py
@@ -32,6 +32,7 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
         """
         Read the data line by line and process it as a dictionary
         """
+
         def reader():
             """
             This function needs to be implemented by the user, based on data format
@@ -57,11 +58,12 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
             feature_name.append("label")
             s = "click:" + str(label[0])
             for i in dense_feature:
-                s += " dense_feature:" + str(i) 
+                s += " dense_feature:" + str(i)
             for i in range(1, 1 + len(categorical_range_)):
-                s += " " + str(i) + ":" + str(sparse_feature[i-1][0])
+                s += " " + str(i) + ":" + str(sparse_feature[i - 1][0])
             print s.strip()
             yield None
+
         return reader
diff --git a/models/rank/wide_deep/data/data_preparation.py b/models/rank/wide_deep/data/data_preparation.py
index cdd8d4d7817e8312fe76f4038c6554eb557a2ff1..885070096cd3fd084e9695919121f782505b9e77 100644
--- a/models/rank/wide_deep/data/data_preparation.py
+++ b/models/rank/wide_deep/data/data_preparation.py
@@ -1,10 +1,25 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import io
 import args
 import pandas as pd
-from sklearn import preprocessing 
+from sklearn import preprocessing
+
 
-def _clean_file(source_path,target_path):
+def _clean_file(source_path, target_path):
     """makes changes to match the CSV format."""
     with io.open(source_path, 'r') as temp_eval_file:
         with io.open(target_path, 'w') as eval_file:
@@ -17,15 +32,16 @@ def _clean_file(source_path, target_path):
                     line = line[:-1]
                 line += '\n'
                 eval_file.write(line)
-    
+
+
 def build_model_columns(train_data_path, test_data_path):
     # The column names are from
     # https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
     column_names = [
-        'age', 'workclass', 'fnlwgt', 'education', 'education_num', 
-        'marital_status', 'occupation', 'relationship', 'race', 'gender', 
-        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 
-        'income_bracket'
+        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
+        'marital_status', 'occupation', 'relationship', 'race', 'gender',
+        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
+        'income_bracket'
     ]
 
     # Load the dataset in Pandas
@@ -44,61 +60,92 @@ def build_model_columns(train_data_path, test_data_path):
 
     # First group of tasks according to the paper
     #label_columns = ['income_50k', 'marital_stat']
-    categorical_columns = ['education','marital_status','relationship','workclass','occupation']
+    categorical_columns = [
+        'education', 'marital_status', 'relationship', 'workclass',
+        'occupation'
+    ]
     for col in categorical_columns:
         label_train = preprocessing.LabelEncoder()
-        train_df[col]= label_train.fit_transform(train_df[col])
+        train_df[col] = label_train.fit_transform(train_df[col])
         label_test = preprocessing.LabelEncoder()
-        test_df[col]= label_test.fit_transform(test_df[col])
-
-    bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
-    train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(), bins,labels=False)
-    test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(), bins,labels=False)
-
-    base_columns = ['education', 'marital_status', 'relationship', 'workclass', 'occupation', 'age_buckets']
-
-    train_df['education_occupation'] = train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)
-    test_df['education_occupation'] = test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
-    train_df['age_buckets_education_occupation'] = train_df['age_buckets'].astype(str) + '_' + train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)
-    test_df['age_buckets_education_occupation'] = test_df['age_buckets'].astype(str) + '_' + test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
-    crossed_columns = ['education_occupation','age_buckets_education_occupation']
-
+    test_df[col] = label_test.fit_transform(test_df[col])
+
+    bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
+    train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(),
+                                     bins,
+                                     labels=False)
+    test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(),
+                                    bins,
+                                    labels=False)
+
+    base_columns = [
+        'education', 'marital_status', 'relationship', 'workclass',
+        'occupation', 'age_buckets'
+    ]
+
+    train_df['education_occupation'] = train_df['education'].astype(
+        str) + '_' + train_df['occupation'].astype(str)
+    test_df['education_occupation'] = test_df['education'].astype(
+        str) + '_' + test_df['occupation'].astype(str)
+    train_df['age_buckets_education_occupation'] = train_df[
+        'age_buckets'].astype(str) + '_' + train_df['education'].astype(
+            str) + '_' + train_df['occupation'].astype(str)
+    test_df['age_buckets_education_occupation'] = test_df[
+        'age_buckets'].astype(str) + '_' + test_df['education'].astype(
+            str) + '_' + test_df['occupation'].astype(str)
+    crossed_columns = [
+        'education_occupation', 'age_buckets_education_occupation'
+    ]
+
     for col in crossed_columns:
         label_train = preprocessing.LabelEncoder()
-        train_df[col]= label_train.fit_transform(train_df[col])
+        train_df[col] = label_train.fit_transform(train_df[col])
         label_test = preprocessing.LabelEncoder()
-        test_df[col]= label_test.fit_transform(test_df[col])
-    
+        test_df[col] = label_test.fit_transform(test_df[col])
+
     wide_columns = base_columns + crossed_columns
-    
-    train_df_temp = pd.get_dummies(train_df[categorical_columns],columns=categorical_columns)
-    test_df_temp = pd.get_dummies(test_df[categorical_columns], columns=categorical_columns)
+
+    train_df_temp = pd.get_dummies(
+        train_df[categorical_columns], columns=categorical_columns)
+    test_df_temp = pd.get_dummies(
+        test_df[categorical_columns], columns=categorical_columns)
     train_df = train_df.join(train_df_temp)
     test_df = test_df.join(test_df_temp)
-    
-    deep_columns = list(train_df_temp.columns)+ ['age','education_num','capital_gain','capital_loss','hours_per_week']
-    
-    train_df['label'] = train_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
-    test_df['label'] = test_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
-    
-    with io.open('train_data/columns.txt','w') as f:
-        write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
+
+    deep_columns = list(train_df_temp.columns) + [
+        'age', 'education_num', 'capital_gain', 'capital_loss',
+        'hours_per_week'
+    ]
+
+    train_df['label'] = train_df['income_bracket'].apply(
+        lambda x: 1 if x == '>50K' else 0)
+    test_df['label'] = test_df['income_bracket'].apply(
+        lambda x: 1 if x == '>50K' else 0)
+
+    with io.open('train_data/columns.txt', 'w') as f:
+        write_str = str(len(wide_columns)) + '\n' + str(len(
+            deep_columns)) + '\n'
         f.write(write_str)
         f.close()
-    with io.open('test_data/columns.txt','w') as f:
-        write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
+    with io.open('test_data/columns.txt', 'w') as f:
+        write_str = str(len(wide_columns)) + '\n' + str(len(
+            deep_columns)) + '\n'
         f.write(write_str)
         f.close()
-    
-    train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(train_data_path,index=False)
-    test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(test_data_path,index=False)
+
+    train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
+        train_data_path, index=False)
+    test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(
+        test_data_path, index=False)
 
 
 def clean_file(train_path, test_path, train_data_path, test_data_path):
     _clean_file(train_path, train_data_path)
     _clean_file(test_path, test_data_path)
 
+
 if __name__ == '__main__':
     args = args.parse_args()
-    clean_file(args.train_path, args.test_path, args.train_data_path, args.test_data_path)
+    clean_file(args.train_path, args.test_path, args.train_data_path,
+               args.test_data_path)
     build_model_columns(args.train_data_path, args.test_data_path)
diff --git a/models/rank/wide_deep/data/get_slot_data.py b/models/rank/wide_deep/data/get_slot_data.py
index b928ae1267113215aa2b71f8dccffddc0db048fb..831d05665b01649f22a3270ec949ebda2941928d 100755
--- a/models/rank/wide_deep/data/get_slot_data.py
+++ b/models/rank/wide_deep/data/get_slot_data.py
@@ -20,6 +20,7 @@ except ImportError:
     import pickle
 import paddle.fluid.incubate.data_generator as dg
 
+
 class TrainReader(dg.MultiSlotDataGenerator):
     def __init__(self, config):
         dg.MultiSlotDataGenerator.__init__(self)
@@ -50,7 +51,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
             wide_feat, deep_deat, label = self._process_line(line)
 
             s = ""
-            for i in [('wide_input', wide_feat), ('deep_input', deep_deat), ('label', label)]:
+            for i in [('wide_input', wide_feat), ('deep_input', deep_deat),
+                      ('label', label)]:
                 k = i[0]
                 v = i[1]
                 for j in v:
@@ -60,6 +62,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
 
         return data_iter
 
+
 reader = TrainReader("../config.yaml")
 reader.init()
 reader.run_from_stdin()
diff --git a/models/rank/xdeepfm/data/download.py b/models/rank/xdeepfm/data/download.py
index 4afd1ce28ec1ba99006414c6b5116178b8b28142..e46a9ced4a69339f5c5f6c45067d34bbbfa39469 100755
--- a/models/rank/xdeepfm/data/download.py
+++ b/models/rank/xdeepfm/data/download.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import shutil
 import sys
diff --git a/models/rank/xdeepfm/data/get_slot_data.py b/models/rank/xdeepfm/data/get_slot_data.py
index d71444135c2198b426638bc4b2665ec053acb2aa..4426e9647c080dce5debdcdbc3e039ac69a69935 100755
--- a/models/rank/xdeepfm/data/get_slot_data.py
+++ b/models/rank/xdeepfm/data/get_slot_data.py
@@ -21,6 +21,7 @@ except ImportError:
     import pickle
 import paddle.fluid.incubate.data_generator as dg
 
+
 class TrainReader(dg.MultiSlotDataGenerator):
     def __init__(self, config):
         dg.MultiSlotDataGenerator.__init__(self)
@@ -48,7 +49,8 @@ class TrainReader(dg.MultiSlotDataGenerator):
             feat_idx, feat_value, label = self._process_line(line)
 
             s = ""
-            for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]:
+            for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
+                      ('label', label)]:
                 k = i[0]
                 v = i[1]
                 for j in v:
@@ -58,6 +60,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
 
         return data_iter
 
+
 reader = TrainReader("../config.yaml")
 reader.init()
 reader.run_from_stdin()
diff --git a/models/recall/gnn/data_process.sh b/models/recall/gnn/data_process.sh
index 38877b6906ecd65ef190aae5f1dcf5a74cece6d0..fc7ed827e0368c59cab8134d22f78e2200980f18 100755
--- a/models/recall/gnn/data_process.sh
+++ b/models/recall/gnn/data_process.sh
@@ -31,5 +31,3 @@ mv diginetica/train.txt train_data
 
 mkdir test_data
 mv diginetica/test.txt test_data
-
-
diff --git a/models/recall/gnn/raw_data/convert_data.py b/models/recall/gnn/raw_data/convert_data.py
index 2e0e57f1f781f7210c46ef265e1189e99a6f7a96..dfe6bc49fcfca0b98ed5cb0ee9d41832dc5c2205 100755
--- a/models/recall/gnn/raw_data/convert_data.py
+++ b/models/recall/gnn/raw_data/convert_data.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import time
 import pickle
@@ -10,6 +24,7 @@ parser.add_argument(
     help='dataset dir: diginetica/yoochoose1_4/yoochoose1_64/sample')
 opt = parser.parse_args()
 
+
 def process_data(file_type):
     path = os.path.join(opt.data_dir, file_type)
     output_path = os.path.splitext(path)[0] + ".txt"
@@ -23,6 +38,7 @@ def process_data(file_type):
             fout.write(str(data[i][1]))
             fout.write("\n")
 
+
 process_data("train")
 process_data("test")
diff --git a/models/recall/gnn/raw_data/download.py b/models/recall/gnn/raw_data/download.py
index 69a1ee20b2d634e9eca47c621dce82ac2d98b5f2..9bebdf1b37e2cd45369c14bb7446c206de8017a0 100755
--- a/models/recall/gnn/raw_data/download.py
+++ b/models/recall/gnn/raw_data/download.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import requests
 import sys
 import time
diff --git a/models/recall/readme.md b/models/recall/readme.md
index 664ced053934d461fb2ed4311a8fd4a1f4d9bd8a..421df1315dc22396f2ff3bb5aec99508435e2c8d 100755
--- a/models/recall/readme.md
+++ b/models/recall/readme.md
@@ -78,4 +78,3 @@ python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
 | MOVIELENS | NCF | 0.688 | -- |
 | -- | Youtube | -- | -- |
 | 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 |
-
diff --git a/models/recall/word2vec/prepare_data.sh b/models/recall/word2vec/prepare_data.sh
index 8b78eeedd94f088e206e35729a6b35d349b99039..cfd067350ce1d33112806ab72ca78222381a86f4 100755
--- a/models/recall/word2vec/prepare_data.sh
+++ b/models/recall/word2vec/prepare_data.sh
@@ -35,6 +35,3 @@ wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
 tar xzvf test_dir.tar -C raw_data
 mv raw_data/data/test_dir test_data/
 rm -rf raw_data
-
-
-
diff --git a/models/treebased/README.md b/models/treebased/README.md
index 8a8317d17be5148f0652b7944442e1929a7684af..3ceb13b62eba8127aa0394397d141b2abe343a32 100644
--- a/models/treebased/README.md
+++ b/models/treebased/README.md
@@ -24,4 +24,4 @@ TDM is a recommendation solution designed for large-scale recommender systems that can host arbitrary advanced models to efficiently
 - How is the network defined? A: Paddle provides a rich set of deep learning OPs, so users can design their own network structures as needed.
 - How is the training data organized? A: TDM training data mainly consists of `user/query emb` plus positive `item` samples, where each `item` is mapped to a leaf node of the tree. Users only need to prepare data in this format. Negative samples are generated from the user-provided tree structure by Paddle's `tdm-sampler op`, which performs efficient negative sampling and automatically attaches the corresponding labels for training the deep model inside TDM.
 - How are large-scale data and models trained? A: Paddle's excellent large-scale parameter-server capability enables efficient distributed training. The paddle-fleet API keeps the learning curve low and flexibly supports incremental training, streaming training, and other business needs.
-3. After the model is trained, retrieval and scoring can be fused into the Paddle network to generate an inference_model and parameter files, enabling fast deployment and efficient retrieval with the PaddlePaddle inference library or PaddleLite.
\ No newline at end of file
diff --git a/models/treebased/tdm/tree/layer_list.txt b/models/treebased/tdm/tree/layer_list.txt
index d8606bc601202390bd9aa54197fac8f34e3c5b59..d1c6c50a10f1b40aa1fbdef7d57bdd600549fb11 100755
--- a/models/treebased/tdm/tree/layer_list.txt
+++ b/models/treebased/tdm/tree/layer_list.txt
@@ -1,4 +1,4 @@
 1,2
 3,4,5,6
 7,8,9,10,11,12,13
-14,15,16,17,18,19,20,21,22,23,24,25
\ No newline at end of file
+14,15,16,17,18,19,20,21,22,23,24,25
diff --git a/run.py b/run.py
index 56999935f21bc1de2b2bc7b4a080da023559174a..c80c647d0c8bab5cd9918429f9bf460b6093335d 100755
--- a/run.py
+++ b/run.py
@@ -26,8 +26,10 @@ from paddlerec.core.utils import util
 engines = {}
 device = ["CPU", "GPU"]
 clusters = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER"]
-engine_choices = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER",
-                  "TDM_SINGLE", "TDM_LOCAL_CLUSTER", "TDM_CLUSTER"]
+engine_choices = [
+    "SINGLE", "LOCAL_CLUSTER", "CLUSTER", "TDM_SINGLE", "TDM_LOCAL_CLUSTER",
+    "TDM_CLUSTER"
+]
 custom_model = ['TDM']
 model_name = ""
@@ -66,7 +68,8 @@ def get_engine(args):
     engine = engine.upper()
     if engine not in engine_choices:
-        raise ValueError("train.engin can not be chosen in {}".format(engine_choices))
+        raise ValueError("train.engine must be one of {}".format(
+            engine_choices))
     print("engines: \n{}".format(engines))
@@ -77,8 +80,10 @@ def get_transpiler():
     FNULL = open(os.devnull, 'w')
-    cmd = ["python", "-c",
-           "import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"]
+    cmd = [
+        "python", "-c",
+        "import paddle.fluid as fluid; fleet_ptr = fluid.core.Fleet(); [fleet_ptr.copy_table_by_feasign(10, 10, [2020, 1010])];"
+    ]
     proc = subprocess.Popen(cmd, stdout=FNULL, stderr=FNULL, cwd=os.getcwd())
     ret = proc.wait()
     if ret == -11:
@@ -152,7 +157,8 @@ def cluster_engine(args):
         update_workspace(flattens)
         envs.set_runtime_environs(flattens)
-        print(envs.pretty_print_envs(flattens, ("Submit Runtime Envs", "Value")))
+        print(
+            envs.pretty_print_envs(flattens, ("Submit Runtime Envs", "Value")))
         launch = ClusterEngine(None, args.model)
         return launch
@@ -163,7 +169,8 @@ def cluster_engine(args):
         cluster_envs = {}
         cluster_envs["train.trainer.trainer"] = trainer
         cluster_envs["train.trainer.engine"] = "cluster"
-        cluster_envs["train.trainer.threads"] = envs.get_runtime_environ("CPU_NUM")
+        cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
+            "CPU_NUM")
         cluster_envs["train.trainer.platform"] = envs.get_platform()
         print("launch {} engine with cluster to with model: {}".format(
             trainer, args.model))
@@ -181,7 +188,8 @@ def cluster_engine(args):
 def cluster_mpi_engine(args):
-    print("launch cluster engine with cluster to run model: {}".format(args.model))
+    print("launch cluster engine with cluster to run model: {}".format(
+        args.model))
     cluster_envs = {}
     cluster_envs["train.trainer.trainer"] = "CtrCodingTrainer"
@@ -209,7 +217,8 @@ def local_cluster_engine(args):
     cluster_envs["train.trainer.platform"] = envs.get_platform()
     cluster_envs["CPU_NUM"] = "2"
-    print("launch {} engine with cluster to run model: {}".format(trainer, args.model))
+    print("launch {} engine with cluster to run model: {}".format(trainer,
+                                                                   args.model))
     set_runtime_envs(cluster_envs, args.model)
     launch = LocalClusterEngine(cluster_envs, args.model)
@@ -217,10 +226,12 @@ def local_mpi_engine(args):
-    print("launch cluster engine with cluster to run model: {}".format(args.model))
+    print("launch cluster engine with cluster to run model: {}".format(
+        args.model))
     from paddlerec.core.engine.local_mpi import LocalMPIEngine
-    print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(args.model))
+    print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(
+        args.model))
     mpi = util.run_which("mpirun")
     if not mpi:
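The run.py hunks above mostly reflow code; the one substantive spot is `get_engine`'s whitelist check, whose error message is also fixed here. A minimal standalone sketch of that validate-then-dispatch pattern (function name invented for the example; the choices list is copied from the diff):

```python
ENGINE_CHOICES = [
    "SINGLE", "LOCAL_CLUSTER", "CLUSTER", "TDM_SINGLE", "TDM_LOCAL_CLUSTER",
    "TDM_CLUSTER"
]

def resolve_engine(name):
    # Engine names are case-insensitive on the command line,
    # so normalize before validating against the whitelist.
    engine = name.upper()
    if engine not in ENGINE_CHOICES:
        raise ValueError("train.engine must be one of {}".format(
            ENGINE_CHOICES))
    return engine

print(resolve_engine("tdm_single"))  # -> TDM_SINGLE
```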
diff --git a/setup.py b/setup.py
index c655c37576e310fac825bd1cc01dfca5d051d18c..31bb34f03187dc9ab29c4cc5c75c559540ca8269 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 setup for paddle-rec.
 """
@@ -22,11 +21,7 @@ from setuptools import setup, find_packages
 import shutil
 import tempfile
 
-
-requires = [
-    "paddlepaddle == 1.7.2",
-    "pyyaml >= 5.1.1"
-]
+requires = ["paddlepaddle == 1.7.2", "pyyaml >= 5.1.1"]
 about = {}
 about["__title__"] = "paddle-rec"
@@ -48,18 +43,27 @@ def build(dirname):
     package_dir = os.path.dirname(os.path.abspath(__file__))
     run_cmd("cp -r {}/* {}".format(package_dir, dirname))
     run_cmd("mkdir {}".format(os.path.join(dirname, "paddlerec")))
-    run_cmd("mv {} {}".format(os.path.join(dirname, "core"), os.path.join(dirname, "paddlerec")))
-    run_cmd("mv {} {}".format(os.path.join(dirname, "doc"), os.path.join(dirname, "paddlerec")))
-    run_cmd("mv {} {}".format(os.path.join(dirname, "models"), os.path.join(dirname, "paddlerec")))
-    run_cmd("mv {} {}".format(os.path.join(dirname, "tests"), os.path.join(dirname, "paddlerec")))
-    run_cmd("mv {} {}".format(os.path.join(dirname, "tools"), os.path.join(dirname, "paddlerec")))
-    run_cmd("mv {} {}".format(os.path.join(dirname, "*.py"), os.path.join(dirname, "paddlerec")))
+    run_cmd("mv {} {}".format(
+        os.path.join(dirname, "core"), os.path.join(dirname, "paddlerec")))
+    run_cmd("mv {} {}".format(
+        os.path.join(dirname, "doc"), os.path.join(dirname, "paddlerec")))
+    run_cmd("mv {} {}".format(
+        os.path.join(dirname, "models"), os.path.join(dirname, "paddlerec")))
+    run_cmd("mv {} {}".format(
+        os.path.join(dirname, "tests"), os.path.join(dirname, "paddlerec")))
+    run_cmd("mv {} {}".format(
+        os.path.join(dirname, "tools"), os.path.join(dirname, "paddlerec")))
+    run_cmd("mv {} {}".format(
+        os.path.join(dirname, "*.py"), os.path.join(dirname, "paddlerec")))
     packages = find_packages(dirname, include=('paddlerec.*'))
     package_dir = {'': dirname}
     package_data = {}
-    models_copy = ['data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy', 'tree/*.txt']
+    models_copy = [
+        'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy',
+        'tree/*.txt'
+    ]
     engine_copy = ['*/*.sh']
     for package in packages:
         if package.startswith("paddlerec.models."):
@@ -80,8 +84,7 @@ def build(dirname):
         package_data=package_data,
         python_requires=">=2.7",
         install_requires=requires,
-        zip_safe=False
-    )
+        zip_safe=False)
 
 
 dirname = tempfile.mkdtemp()
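Finally, the reflowed `models_copy` list in setup.py is what decides which non-Python assets ship with each model package. A self-contained sketch of the mapping `build()` performs, with invented package names (the globs and the `startswith` filter are copied from the diff):

```python
# Every paddlerec.models.* package gets the same glob list as its
# package_data, so configs, shell scripts, and tree files are bundled.
models_copy = [
    'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy',
    'tree/*.txt'
]

def collect_package_data(packages):
    package_data = {}
    for package in packages:
        if package.startswith("paddlerec.models."):
            package_data[package] = models_copy
    return package_data

# Invented package names for illustration:
print(collect_package_data(["paddlerec.core", "paddlerec.models.rank.dnn"]))
# -> {'paddlerec.models.rank.dnn': ['data/*.txt', ...]}
```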