diff --git a/core/engine/cluster/cloud/cluster.sh b/core/engine/cluster/cloud/cluster.sh index 1140d791221b93065b726411acbf3477834467dc..1a0605fd9aeefbf87542e5e5156470eb1d81b836 100644 --- a/core/engine/cluster/cloud/cluster.sh +++ b/core/engine/cluster/cloud/cluster.sh @@ -1,4 +1,18 @@ #!/bin/bash +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + ################################################### # Usage: submit.sh diff --git a/core/engine/cluster/cluster.py b/core/engine/cluster/cluster.py index 0f45b1b0bb97e810745d78ba3af218a2e36adcfb..6dfcec3a929ad8124192014f48270ebd1862dc2c 100644 --- a/core/engine/cluster/cluster.py +++ b/core/engine/cluster/cluster.py @@ -15,10 +15,9 @@ from __future__ import print_function from __future__ import unicode_literals -import subprocess -import sys -import os import copy +import os +import subprocess from paddlerec.core.engine.engine import Engine from paddlerec.core.factory import TrainerFactory diff --git a/core/engine/engine.py b/core/engine/engine.py index 583c3f18c8c38b7a91ac50e2bde1bdbc88ba894f..492bf8e1c6f83f015be4fbd287ebef7d432e953d 100755 --- a/core/engine/engine.py +++ b/core/engine/engine.py @@ -29,4 +29,3 @@ class Engine: @abc.abstractmethod def run(self): pass - diff --git a/core/engine/local_cluster.py b/core/engine/local_cluster.py index 72d440a825874b7334502cf3cab6ba6dd2ef4215..4cf614f02315acbff2a3c21126d8c061c10ba8ad 100755 --- a/core/engine/local_cluster.py +++ b/core/engine/local_cluster.py @@ -14,10 +14,11 @@ from __future__ import print_function from __future__ import unicode_literals -import subprocess -import sys -import os + import copy +import os +import sys +import subprocess from paddlerec.core.engine.engine import Engine from paddlerec.core.utils import envs diff --git a/core/engine/local_mpi.py b/core/engine/local_mpi.py index 1f7a27a849e8629c56c1cf26d90b116569517454..49db821fe5764ae9ef7f42cbd3ca2fe77b83a1d1 100755 --- a/core/engine/local_mpi.py +++ b/core/engine/local_mpi.py @@ -14,10 +14,11 @@ from __future__ import print_function from __future__ import unicode_literals -import subprocess -import sys -import os + import copy +import os +import sys +import subprocess from paddlerec.core.engine.engine import Engine diff --git a/core/layer.py b/core/layer.py index 3b4a9ca34a293539777ae886988540355938b558..07c058c77d3fb947334d5d8fca89b15e2629c724 100755 --- a/core/layer.py +++ b/core/layer.py @@ -25,17 +25,8 @@ class Layer(object): """ pass - def generate(self, mode, param): - """R - """ - if mode == 'fluid': - return self.generate_fluid(param) - - print('unsupport this mode: ' + mode) - return None, None - @abc.abstractmethod - def generate_fluid(self, param): + def generate(self, param): """R """ pass diff --git a/core/metric.py b/core/metric.py index 469fd56c8cb97c8e21739b402b4b89daab57bde9..e0f6b24e7e6bfc3e4e1689622019ffd540c8c033 100755 --- a/core/metric.py +++ b/core/metric.py @@ -53,7 +53,7 @@ class Metric(object): pass @abc.abstractmethod - def get_result_to_string(self): + def __str__(self): """ Return: result(string) : calculate result with string format, for output diff --git a/core/metrics/auc_metrics.py b/core/metrics/auc_metrics.py index 3c48040db29fc31846780f6f4e871168ac65f3f8..5dd16cc078aa43d8fb07a50a4b006d4fdae3b2e9 100755 --- a/core/metrics/auc_metrics.py +++ b/core/metrics/auc_metrics.py @@ -13,8 +13,10 @@ # limitations under the License. import math + import numpy as np import paddle.fluid as fluid + from paddlerec.core.metric import Metric @@ -198,7 +200,7 @@ class AUCMetric(Metric): """ """ return self._result - def get_result_to_string(self): + def __str__(self): """ """ result = self.get_result() result_str = "%s AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f RMSE=%.6f " \ diff --git a/core/model.py b/core/model.py index c51ba2417da7d6d7530f29c2b0bc9f5090d810ea..b4150155db9677124eabab079f003743fc6c4d8b 100755 --- a/core/model.py +++ b/core/model.py @@ -47,7 +47,7 @@ class Model(object): def get_infer_results(self): return self._infer_results - def get_cost_op(self): + def get_avg_cost(self): """R """ return self._cost diff --git a/core/modules/modul/build.py b/core/modules/modul/build.py index fe065678713d777519b8bbfd909a7c4c16f72439..0263cbf60e3b1647a05cbc471b7bbff1840f88ba 100755 --- a/core/modules/modul/build.py +++ b/core/modules/modul/build.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import yaml import copy + import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet +import yaml from paddlerec.core.model import Model from paddlerec.core.utils import table diff --git a/core/modules/modul/layers.py b/core/modules/modul/layers.py index f533a46db24da5e6feb3e8723ff3265529cfbcb3..26ee98a816a63c4121428b2dd5d2c835d05f7216 100755 --- a/core/modules/modul/layers.py +++ b/core/modules/modul/layers.py @@ -13,10 +13,11 @@ # limitations under the License. import paddle.fluid as fluid + from paddlerec.core.layer import Layer -class EmbeddingInputLayer(Layer): +class EmbeddingFuseLayer(Layer): """R """ @@ -31,7 +32,7 @@ class EmbeddingInputLayer(Layer): self._emb_dim = self._mf_dim + 3 # append show ctr lr self._emb_layers = [] - def generate_fluid(self, param): + def generate(self, param): """R """ show_clk = fluid.layers.concat( @@ -63,7 +64,7 @@ class LabelInputLayer(Layer): self._data_type = config.get('data_type', "int64") self._label_idx = config['label_idx'] - def generate_fluid(self, param): + def generate(self, param): """R """ label = fluid.layers.data(name=self._name, shape=[-1, self._dim], \ @@ -85,7 +86,7 @@ class TagInputLayer(Layer): self._dim = config.get('dim', 1) self._data_type = config['data_type'] - def generate_fluid(self, param): + def generate(self, param): """R """ output = fluid.layers.data(name=self._name, shape=[-1, self._dim], \ @@ -107,7 +108,7 @@ class ParamLayer(Layer): self._data_type = config.get('data_type', 'float32') self._config = config - def generate_fluid(self, param): + def generate(self, param): """R """ return self._config, {'inference_param': {'name': 'param', 'params': [], 'table_id': self._table_id}} @@ -125,7 +126,7 @@ class SummaryLayer(Layer): self._data_type = config.get('data_type', 'float32') self._config = config - def generate_fluid(self, param): + def generate(self, param): """R """ return self._config, {'inference_param': {'name': 'summary', 'params': [], 'table_id': self._table_id}} @@ -143,7 +144,7 @@ class NormalizetionLayer(Layer): self._summary = config['summary'] self._table_id = config.get('table_id', -1) - def generate_fluid(self, param): + def generate(self, param): """R """ input_layer = param['layer'][self._input[0]] @@ -158,7 +159,7 @@ class NormalizetionLayer(Layer): 'params': inference_param, 'table_id': summary_layer.get('table_id', -1)}} -class NeuralLayer(Layer): +class FCLayer(Layer): """R """ @@ -171,7 +172,7 @@ class NeuralLayer(Layer): self._bias = config.get('bias', True) self._act_func = config.get('act_func', None) - def generate_fluid(self, param): + def generate(self, param): """R """ param_layer = param['layer'][self._param] @@ -199,7 +200,7 @@ class NeuralLayer(Layer): 'table_id': param_layer.get('table_id', -1)}} -class SigmoidLossLayer(Layer): +class LogLossLayer(Layer): """R """ @@ -230,7 +231,7 @@ class SigmoidLossLayer(Layer): } } - def generate_fluid(self, param): + def generate(self, param): """R """ input_layer = param['layer'][self._input[0]] diff --git a/core/trainer.py b/core/trainer.py index df5626b7a5317b9fd7f108964adf11afe0c59c0e..40fc35de973ce7841bfdf28dfc6c6a3751484be7 100755 --- a/core/trainer.py +++ b/core/trainer.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import sys - import abc +import os import time +import sys import yaml from paddle import fluid + from paddlerec.core.utils import envs diff --git a/core/trainers/__init__.py b/core/trainers/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..cd9c9db5e6b93fd6171bca0a5b0f97f69306aedc 100755 --- a/core/trainers/__init__.py +++ b/core/trainers/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +trainer implement. + + ↗ (single/cluster) CtrTrainer +Trainer + ↗ (for single training) SingleTrainer/TDMSingleTrainer + ↘ TranspilerTrainer → (for cluster training) ClusterTrainer/TDMClusterTrainer + ↘ (for online learning training) OnlineLearningTrainer + +""" + + diff --git a/core/trainers/cluster_trainer.py b/core/trainers/cluster_trainer.py index fa4238bf3e4e188df124c298e15211454f2c2ade..faa960359bc82d6130302002a99fb664c7374249 100755 --- a/core/trainers/cluster_trainer.py +++ b/core/trainers/cluster_trainer.py @@ -25,7 +25,6 @@ import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker -from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker from paddlerec.core.utils import envs from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer @@ -83,7 +82,7 @@ class ClusterTrainer(TranspileTrainer): strategy = self.build_strategy() optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer.minimize(self.model.get_cost_op()) + optimizer.minimize(self.model.get_avg_cost()) if fleet.is_server(): context['status'] = 'server_pass' @@ -115,7 +114,7 @@ class ClusterTrainer(TranspileTrainer): program = fluid.compiler.CompiledProgram( fleet.main_program).with_data_parallel( - loss_name=self.model.get_cost_op().name, + loss_name=self.model.get_avg_cost().name, build_strategy=self.strategy.get_build_strategy(), exec_strategy=self.strategy.get_execute_strategy()) diff --git a/core/trainers/ctr_coding_trainer.py b/core/trainers/ctr_coding_trainer.py index a6377c853853d3f1ab53c55b9a5253cf0ff8d5b7..3bfec28cfd149bdbe47fdc202107c7ed7af58fdd 100755 --- a/core/trainers/ctr_coding_trainer.py +++ b/core/trainers/ctr_coding_trainer.py @@ -11,9 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os -import numpy as np +import numpy as np import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker @@ -22,7 +23,7 @@ from paddlerec.core.utils import envs from paddlerec.core.trainer import Trainer -class CtrPaddleTrainer(Trainer): +class CtrTrainer(Trainer): """R """ @@ -87,7 +88,7 @@ class CtrPaddleTrainer(Trainer): optimizer = self.model.optimizer() optimizer = fleet.distributed_optimizer(optimizer, strategy={"use_cvm": False}) - optimizer.minimize(self.model.get_cost_op()) + optimizer.minimize(self.model.get_avg_cost()) if fleet.is_server(): context['status'] = 'server_pass' diff --git a/core/trainers/ctr_modul_trainer.py b/core/trainers/ctr_modul_trainer.py index 8c99a5c885ed1b8a577556c52d56c4a8ad04944c..7b3bd7874359059c3b03289cc10da7d7756ac35b 100755 --- a/core/trainers/ctr_modul_trainer.py +++ b/core/trainers/ctr_modul_trainer.py @@ -13,12 +13,12 @@ # limitations under the License. +import datetime +import json import sys import time -import json -import datetime -import numpy as np +import numpy as np import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker @@ -72,7 +72,7 @@ def worker_numric_max(value, env="mpi"): return wroker_numric_opt(value, env, "max") -class CtrPaddleTrainer(Trainer): +class CtrTrainer(Trainer): """R """ @@ -129,7 +129,7 @@ class CtrPaddleTrainer(Trainer): model = self._exector_context[executor['name']]['model'] self._metrics.update(model.get_metrics()) runnnable_scope.append(scope) - runnnable_cost_op.append(model.get_cost_op()) + runnnable_cost_op.append(model.get_avg_cost()) for var in model._data_var: if var.name in data_var_name_dict: continue @@ -146,7 +146,7 @@ class CtrPaddleTrainer(Trainer): model = self._exector_context[executor['name']]['model'] program = model._build_param['model']['train_program'] if not executor['is_update_sparse']: - program._fleet_opt["program_configs"][str(id(model.get_cost_op().block.program))]["push_sparse"] = [] + program._fleet_opt["program_configs"][str(id(model.get_avg_cost().block.program))]["push_sparse"] = [] if 'train_thread_num' not in executor: executor['train_thread_num'] = self.global_config['train_thread_num'] with fluid.scope_guard(scope): diff --git a/core/trainers/online_learning_trainer.py b/core/trainers/online_learning_trainer.py index 1924b966dd2ba34c4ca10e21471b39b4d3e793ab..0303e96ac0bb20b1f46cdc9f5836d18fa73b9a8e 100755 --- a/core/trainers/online_learning_trainer.py +++ b/core/trainers/online_learning_trainer.py @@ -18,9 +18,9 @@ Training use fluid with one node only. from __future__ import print_function +import datetime import os import time -import datetime import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet @@ -31,7 +31,7 @@ from paddlerec.core.utils import envs from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer -class ClusterTrainer(TranspileTrainer): +class OnlineLearningTrainer(TranspileTrainer): def processor_register(self): role = PaddleCloudRoleMaker() fleet.init(role) @@ -78,7 +78,7 @@ class ClusterTrainer(TranspileTrainer): optimizer = self.model.optimizer() strategy = self.build_strategy() optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer.minimize(self.model.get_cost_op()) + optimizer.minimize(self.model.get_avg_cost()) if fleet.is_server(): context['status'] = 'server_pass' diff --git a/core/trainers/single_trainer.py b/core/trainers/single_trainer.py index 888a93f3ddb7c1045505fc98eebc15d6dd19e950..8079377ba257041e4946d6e452cacaa388ca36ce 100755 --- a/core/trainers/single_trainer.py +++ b/core/trainers/single_trainer.py @@ -17,14 +17,14 @@ Training use fluid with one node only. """ from __future__ import print_function -import logging + import time +import logging import paddle.fluid as fluid from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer from paddlerec.core.utils import envs -import numpy as np logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger("fluid") @@ -36,7 +36,8 @@ class SingleTrainer(TranspileTrainer): self.regist_context_processor('uninit', self.instance) self.regist_context_processor('init_pass', self.init) self.regist_context_processor('startup_pass', self.startup) - if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader": + if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, + "train.reader") != "DataLoader": self.regist_context_processor('train_pass', self.dataset_train) else: self.regist_context_processor('train_pass', self.dataloader_train) @@ -47,7 +48,7 @@ class SingleTrainer(TranspileTrainer): def init(self, context): self.model.train_net() optimizer = self.model.optimizer() - optimizer.minimize((self.model.get_cost_op())) + optimizer.minimize((self.model.get_avg_cost())) self.fetch_vars = [] self.fetch_alias = [] @@ -74,7 +75,7 @@ class SingleTrainer(TranspileTrainer): program = fluid.compiler.CompiledProgram( fluid.default_main_program()).with_data_parallel( - loss_name=self.model.get_cost_op().name) + loss_name=self.model.get_avg_cost().name) metrics_varnames = [] metrics_format = [] @@ -122,8 +123,8 @@ class SingleTrainer(TranspileTrainer): fetch_info=self.fetch_alias, print_period=self.fetch_period) end_time = time.time() - times = end_time-begin_time - print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times)) + times = end_time - begin_time + print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times)) self.save(i, "train", is_fleet=False) context['status'] = 'infer_pass' diff --git a/core/trainers/tdm_cluster_trainer.py b/core/trainers/tdm_cluster_trainer.py index f4edc62a20d5f20eaac5d9f441d7a84522cba1f5..3bd1ad3367f340019333e8f83cf5abdd3b36b25f 100755 --- a/core/trainers/tdm_cluster_trainer.py +++ b/core/trainers/tdm_cluster_trainer.py @@ -17,17 +17,16 @@ Training use fluid with one node only. """ from __future__ import print_function + import logging + import numpy as np import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory -from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker from paddlerec.core.utils import envs from paddlerec.core.trainers.cluster_trainer import ClusterTrainer - logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger("fluid") logger.setLevel(logging.INFO) diff --git a/core/trainers/tdm_single_trainer.py b/core/trainers/tdm_single_trainer.py index dce3c8d543a11ea54d6d636a0b770b1641fed967..21be66a677750f6e817b63794819b14ed72d9fa2 100755 --- a/core/trainers/tdm_single_trainer.py +++ b/core/trainers/tdm_single_trainer.py @@ -18,12 +18,11 @@ Training use fluid with one node only. from __future__ import print_function import logging -import paddle.fluid as fluid -from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer +import numpy as np +import paddle.fluid as fluid from paddlerec.core.trainers.single_trainer import SingleTrainer from paddlerec.core.utils import envs -import numpy as np logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger("fluid") diff --git a/core/trainers/transpiler_trainer.py b/core/trainers/transpiler_trainer.py index 008b67b74c1667b2d8c2ca1a4f52aeb7e72470d9..81591056c94dc414fdeeba12d449f18aaaa0e216 100755 --- a/core/trainers/transpiler_trainer.py +++ b/core/trainers/transpiler_trainer.py @@ -147,8 +147,8 @@ class TranspileTrainer(Trainer): if not need_save(epoch_id, save_interval, False): return - # print("save inference model is not supported now.") - # return + # print("save inference model is not supported now.") + # return feed_varnames = envs.get_global_env( "save.inference.feed_varnames", None, namespace) @@ -248,7 +248,7 @@ class TranspileTrainer(Trainer): 'evaluate_model_path', "", namespace='evaluate'))] is_return_numpy = envs.get_global_env( - 'is_return_numpy', True, namespace='evaluate') + 'is_return_numpy', True, namespace='evaluate') for (epoch, model_dir) in model_list: print("Begin to infer No.{} model, model_dir: {}".format( diff --git a/core/utils/dataloader_instance.py b/core/utils/dataloader_instance.py index 88b4a9f63662861f41ab2971f4fe751bf5c65ce3..6234882f4d49191d3b55770078863910a356e9cd 100755 --- a/core/utils/dataloader_instance.py +++ b/core/utils/dataloader_instance.py @@ -14,7 +14,6 @@ from __future__ import print_function import os -import sys from paddlerec.core.utils.envs import lazy_instance_by_fliename from paddlerec.core.utils.envs import get_global_env diff --git a/core/utils/dataset.py b/core/utils/dataset_holder.py similarity index 96% rename from core/utils/dataset.py rename to core/utils/dataset_holder.py index 0eff8da071920b9931ce073ff94abe253baa31a0..cd195450336cac0265f76670ca0e3fa24c45a7ba 100755 --- a/core/utils/dataset.py +++ b/core/utils/dataset_holder.py @@ -13,8 +13,8 @@ # limitations under the License. import abc -import time import datetime +import time import paddle.fluid as fluid @@ -22,7 +22,7 @@ from paddlerec.core.utils import fs as fs from paddlerec.core.utils import util as util -class Dataset(object): +class DatasetHolder(object): """ Dataset Base """ @@ -62,7 +62,7 @@ class Dataset(object): pass -class TimeSplitDataset(Dataset): +class TimeSplitDatasetHolder(DatasetHolder): """ Dataset with time split dir. root_path/$DAY/$HOUR """ @@ -142,16 +142,6 @@ class TimeSplitDataset(Dataset): data_time = data_time + datetime.timedelta(minutes=self._split_interval) return data_file_list - -class FluidTimeSplitDataset(TimeSplitDataset): - """ - A Dataset with time split for PaddleFluid - """ - - def __init__(self, config): - """ """ - TimeSplitDataset.__init__(self, config) - def _alloc_dataset(self, file_list): """ """ dataset = fluid.DatasetFactory().create_dataset(self._config['dataset_type']) diff --git a/core/utils/dataset_instance.py b/core/utils/dataset_instance.py index 94997b669a5579cae4a403422c92fe277fa046b6..731b3b47169d9e67735953c8488469d4d60cb296 100755 --- a/core/utils/dataset_instance.py +++ b/core/utils/dataset_instance.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function import sys diff --git a/core/utils/envs.py b/core/utils/envs.py index 9cbd93fc241b432f3a4e166ee114bc62cc176860..7093d897e780c525e91516a0058bc90319d4e918 100755 --- a/core/utils/envs.py +++ b/core/utils/envs.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os +from contextlib import closing import copy -import sys +import os import socket -from contextlib import closing +import sys global_envs = {} diff --git a/core/utils/fs.py b/core/utils/fs.py index df2379721041197170945ebfd03ccf8f8cc61d0a..836c6f598b9c423b0922e30f536a669c55e83098 100755 --- a/core/utils/fs.py +++ b/core/utils/fs.py @@ -13,6 +13,7 @@ # limitations under the License. import os + from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient @@ -28,12 +29,12 @@ class LocalFSClient(object): """ Util for local disk file_system io """ - + def __init__(self): """R """ pass - + def write(self, content, path, mode): """ write to file @@ -43,7 +44,7 @@ class LocalFSClient(object): mode(string): w/a w:clear_write a:append_write """ temp_dir = os.path.dirname(path) - if not os.path.exists(temp_dir): + if not os.path.exists(temp_dir): os.makedirs(temp_dir) f = open(path, mode) f.write(content) @@ -75,7 +76,7 @@ class LocalFSClient(object): """R """ os.system("rm -rf " + path) - + def is_exist(self, path): """R """ @@ -94,13 +95,14 @@ class FileHandler(object): """ A Smart file handler. auto judge local/afs by path """ + def __init__(self, config): """R """ if 'fs_name' in config: - hadoop_home="$HADOOP_HOME" + hadoop_home = "$HADOOP_HOME" hdfs_configs = { - "hadoop.job.ugi": config['fs_ugi'], + "hadoop.job.ugi": config['fs_ugi'], "fs.default.name": config['fs_name'] } self._hdfs_client = HDFSClient(hadoop_home, hdfs_configs) @@ -131,7 +133,8 @@ class FileHandler(object): if mode.find('a') >= 0: org_content = self._hdfs_client.cat(dest_path) content = content + org_content - self._local_fs_client.write(content, temp_local_file, mode) #fleet hdfs_client only support upload, so write tmp file + self._local_fs_client.write(content, temp_local_file, + mode) # fleet hdfs_client only support upload, so write tmp file self._hdfs_client.delete(dest_path + ".tmp") self._hdfs_client.upload(dest_path + ".tmp", temp_local_file) self._hdfs_client.delete(dest_path + ".bak") @@ -139,7 +142,7 @@ class FileHandler(object): self._hdfs_client.rename(dest_path + ".tmp", dest_path) else: self._local_fs_client.write(content, dest_path, mode) - + def cat(self, path): """R """ @@ -148,7 +151,7 @@ class FileHandler(object): return hdfs_cat else: return self._local_fs_client.cat(path) - + def ls(self, path): """R """ @@ -160,7 +163,7 @@ class FileHandler(object): files = self._local_fs_client.ls(path) files = [path + '/' + fi for fi in files] # absulte path return files - + def cp(self, org_path, dest_path): """R """ @@ -170,6 +173,6 @@ class FileHandler(object): return self._local_fs_client.cp(org_path, dest_path) if not org_is_afs and dest_is_afs: return self._hdfs_client.upload(dest_path, org_path) - if org_is_afs and not dest_is_afs: + if org_is_afs and not dest_is_afs: return self._hdfs_client.download(org_path, dest_path) print("Not Suppor hdfs cp currently") diff --git a/core/utils/table.py b/core/utils/table.py index 7c86d0f03edb47a7835ab7364ebe318985d091e2..558cd26d61b1be165e964b4dea3a1f3dfe82e0ba 100755 --- a/core/utils/table.py +++ b/core/utils/table.py @@ -12,16 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import yaml - class TableMeta(object): """ Simple ParamTable Meta, Contain table_id """ TableId = 1 - + @staticmethod def alloc_new_table(table_id): """ diff --git a/core/utils/util.py b/core/utils/util.py index 76965f50ccf298dd63bd67e1c74d4f3d23c2fe72..bd63284873b6c6be80c9849f40535cebe1b7fb14 100755 --- a/core/utils/util.py +++ b/core/utils/util.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime import os import time -import datetime from paddle import fluid + from paddlerec.core.utils import fs as fs diff --git a/doc/design.md b/doc/design.md index 90a819fe5a42fa584930d628e33ed76e451e823f..2975d77f14e461547921f74b9ced5cf73703e2e7 100644 --- a/doc/design.md +++ b/doc/design.md @@ -153,7 +153,7 @@ class Model(object): def infer_net(self): pass - def get_cost_op(self): + def get_avg_cost(self): return self._cost ``` diff --git a/models/contentunderstanding/classification/model.py b/models/contentunderstanding/classification/model.py index e4630820c868af8334fc8edfd2b6c1f4d9e77503..9e853aa01d4a0b6bd5c7a20d8e13164bd9905ad0 100644 --- a/models/contentunderstanding/classification/model.py +++ b/models/contentunderstanding/classification/model.py @@ -13,15 +13,9 @@ # limitations under the License. import paddle.fluid as fluid -import math -from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase -import paddle.fluid as fluid -import paddle.fluid.layers.nn as nn -import paddle.fluid.layers.tensor as tensor -import paddle.fluid.layers.control_flow as cf class Model(ModelBase): def __init__(self, config): @@ -36,7 +30,7 @@ class Model(ModelBase): def train_net(self): """ network definition """ - + data = fluid.data(name="input", shape=[None, self.max_len], dtype='int64') label = fluid.data(name="label", shape=[None, 1], dtype='int64') seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64') @@ -60,12 +54,12 @@ class Model(ModelBase): prediction = fluid.layers.fc(input=[fc_1], size=self.class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) + acc = fluid.layers.accuracy(input=prediction, label=label) self.cost = avg_cost self._metrics["acc"] = acc - def get_cost_op(self): + def get_avg_cost(self): return self.cost def get_metrics(self): diff --git a/models/contentunderstanding/classification/reader.py b/models/contentunderstanding/classification/reader.py index f90097d702df461d226443c32570e46ea3a0b093..136a5668856c0fb558a016a3bc3a0b8a56651d3b 100644 --- a/models/contentunderstanding/classification/reader.py +++ b/models/contentunderstanding/classification/reader.py @@ -12,31 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re + import sys -import collections -import os -import six -import time -import numpy as np -import paddle.fluid as fluid -import paddle -import csv -import io from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs + class TrainReader(Reader): def init(self): pass - def _process_line(self, l): + def _process_line(self, l): l = l.strip().split(" ") data = l[0:10] seq_len = l[10:11] label = l[11:] - return data, label, seq_len + return data, label, seq_len def generate_sample(self, line): def data_iter(): @@ -47,6 +38,7 @@ class TrainReader(Reader): data = [int(i) for i in data] label = [int(i) for i in label] seq_len = [int(i) for i in seq_len] - print >>sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)]) + print >> sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)]) yield [('data', data), ('label', label), ('seq_len', seq_len)] + return data_iter diff --git a/models/contentunderstanding/tagspace/model.py b/models/contentunderstanding/tagspace/model.py index 8b73da91513e34780d2c0f242ac58a5adb970c9a..033d51b8f5d50ddcb1199f566b679eff61acfccb 100644 --- a/models/contentunderstanding/tagspace/model.py +++ b/models/contentunderstanding/tagspace/model.py @@ -12,30 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.fluid as fluid -import math - -from paddlerec.core.utils import envs -from paddlerec.core.model import Model as ModelBase - import paddle.fluid as fluid import paddle.fluid.layers.nn as nn import paddle.fluid.layers.tensor as tensor import paddle.fluid.layers.control_flow as cf +from paddlerec.core.model import Model as ModelBase +from paddlerec.core.utils import envs + + class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) self.cost = None self.metrics = {} - self.vocab_text_size = 11447#envs.get_global_env("vocab_text_size", None, self._namespace) - self.vocab_tag_size = 4#envs.get_global_env("vocab_tag_size", None, self._namespace) - self.emb_dim = 10#envs.get_global_env("emb_dim", None, self._namespace) - self.hid_dim = 1000#envs.get_global_env("hid_dim", None, self._namespace) - self.win_size = 5#envs.get_global_env("win_size", None, self._namespace) - self.margin = 0.1#envs.get_global_env("margin", None, self._namespace) - self.neg_size = 3#envs.get_global_env("neg_size", None, self._namespace) - print self.emb_dim + self.vocab_text_size = envs.get_global_env("vocab_text_size", None, self._namespace) + self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, self._namespace) + self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace) + self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace) + self.win_size = envs.get_global_env("win_size", None, self._namespace) + self.margin = envs.get_global_env("margin", None, self._namespace) + self.neg_size = envs.get_global_env("neg_size", None, self._namespace) def train_net(self): """ network definition """ @@ -92,18 +89,16 @@ class Model(ModelBase): self.metrics["correct"] = correct self.metrics["cos_pos"] = cos_pos - def get_cost_op(self): + def get_avg_cost(self): return self.cost def get_metrics(self): return self.metrics def optimizer(self): - learning_rate = 0.01#envs.get_global_env("hyper_parameters.base_lr", None, self._namespace) + learning_rate = envs.get_global_env("hyper_parameters.base_lr", None, self._namespace) sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate) - #sgd_optimizer.minimize(avg_cost) return sgd_optimizer - def infer_net(self, parameter_list): self.train_net() diff --git a/models/contentunderstanding/tagspace/reader.py b/models/contentunderstanding/tagspace/reader.py index fb973f7eb92eb9d1195a8c9e9aeaa3be52b2dc17..0f63b85fd1a322b55c6d0e451fe61ff90c82eaa5 100644 --- a/models/contentunderstanding/tagspace/reader.py +++ b/models/contentunderstanding/tagspace/reader.py @@ -12,26 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re + import sys -import collections -import os -import six -import time + import numpy as np -import paddle.fluid as fluid -import paddle -import csv -import io from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs + class TrainReader(Reader): def init(self): pass - def _process_line(self, l): + def _process_line(self, l): tag_size = 4 neg_size = 3 l = l.strip().split(",") @@ -54,10 +47,7 @@ class TrainReader(Reader): neg_index = rand_i neg_tag.append(neg_index) sum_n += 1 - # if n > 0 and len(text) > n: - # #yield None - # return None, None, None - return text, pos_tag, neg_tag + return text, pos_tag, neg_tag def generate_sample(self, line): def data_iter(): @@ -66,4 +56,5 @@ class TrainReader(Reader): yield None return yield [('text', text), ('pos_tag', pos_tag), ('neg_tag', neg_tag)] + return data_iter diff --git a/models/match/dssm/model.py b/models/match/dssm/model.py index ef77a8f01ebb2e0cf13551001ecb8f071f1ace2f..630fb3eeef062bdfda7720c2c54dd884ec033a71 100755 --- a/models/match/dssm/model.py +++ b/models/match/dssm/model.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math import paddle.fluid as fluid from paddlerec.core.utils import envs @@ -25,11 +24,12 @@ class Model(ModelBase): def input(self): TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace) - Neg = envs.get_global_env("hyper_parameters.NEG", None, self._namespace) + Neg = envs.get_global_env("hyper_parameters.NEG", None, self._namespace) self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) - self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i in range(Neg)] + self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i + in range(Neg)] self._data_var.append(self.query) self._data_var.append(self.doc_pos) for input in self.doc_negs: @@ -38,40 +38,40 @@ class Model(ModelBase): if self._platform != "LINUX": self._data_loader = fluid.io.DataLoader.from_generator( feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False) - def net(self, is_infer=False): - hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace) + hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace) hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None, self._namespace) - + def fc(data, hidden_layers, hidden_acts, names): fc_inputs = [data] - for i in range(len(hidden_layers)): - xavier=fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i]) - out = fluid.layers.fc(input=fc_inputs[-1], - size=hidden_layers[i], - act=hidden_acts[i], - param_attr=xavier, - bias_attr=xavier, - name=names[i]) - fc_inputs.append(out) - return fc_inputs[-1] - - query_fc = fc(self.query, hidden_layers, hidden_acts, ['query_l1', 'query_l2', 'query_l3']) - doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts, ['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3']) - self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc) + for i in range(len(hidden_layers)): + xavier = fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i]) + out = fluid.layers.fc(input=fc_inputs[-1], + size=hidden_layers[i], + act=hidden_acts[i], + param_attr=xavier, + bias_attr=xavier, + name=names[i]) + fc_inputs.append(out) + return fc_inputs[-1] + + query_fc = fc(self.query, hidden_layers, hidden_acts, ['query_l1', 'query_l2', 'query_l3']) + doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts, ['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3']) + self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc) if is_infer: return R_Q_D_ns = [] - for i, doc_neg in enumerate(self.doc_negs): - doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, ['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)]) + for i, doc_neg in enumerate(self.doc_negs): + doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, + ['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)]) R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i)) concat_Rs = fluid.layers.concat(input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1) - prob = fluid.layers.softmax(concat_Rs, axis=1) - - hit_prob = fluid.layers.slice(prob, axes=[0,1], starts=[0,0], ends=[4, 1]) + prob = fluid.layers.softmax(concat_Rs, axis=1) + + hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[4, 1]) loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob)) self.avg_cost = fluid.layers.mean(x=loss) @@ -101,10 +101,10 @@ class Model(ModelBase): self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) self._infer_data_var = [self.query, self.doc_pos] - self._infer_data_loader = fluid.io.DataLoader.from_generator( + self._infer_data_loader = fluid.io.DataLoader.from_generator( feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) - + def infer_net(self): - self.infer_input() + self.infer_input() self.net(is_infer=True) - self.infer_results() + self.infer_results() diff --git a/models/match/dssm/synthetic_evaluate_reader.py b/models/match/dssm/synthetic_evaluate_reader.py index 1be5b13d48e17dfca1a0c11736074e80aadcbdc7..97f50abf9720060b008b90c7729e93d13701bb3b 100755 --- a/models/match/dssm/synthetic_evaluate_reader.py +++ b/models/match/dssm/synthetic_evaluate_reader.py @@ -14,7 +14,6 @@ from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs class EvaluateReader(Reader): diff --git a/models/match/dssm/synthetic_reader.py b/models/match/dssm/synthetic_reader.py index c4cfaf6dab61b4122a86e946dc004847c70465fb..13f57a6663ca372bc287386dd939214f362b503d 100755 --- a/models/match/dssm/synthetic_reader.py +++ b/models/match/dssm/synthetic_reader.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs class TrainReader(Reader): @@ -37,7 +37,7 @@ class TrainReader(Reader): neg_docs = [] for i in range(len(features) - 2): feature_names.append('doc_neg_' + str(i)) - neg_docs.append(map(float, features[i+2].split(','))) + neg_docs.append(map(float, features[i + 2].split(','))) yield zip(feature_names, [query] + [pos_doc] + neg_docs) diff --git a/models/match/multiview-simnet/data_process.sh b/models/match/multiview-simnet/data_process.sh index 15c6c908477cd3ba6a72a65bad039bb10295bd9c..91548c5863063f7a23cb2f713a3754b121b235b5 100755 --- a/models/match/multiview-simnet/data_process.sh +++ b/models/match/multiview-simnet/data_process.sh @@ -1,5 +1,20 @@ #! /bin/bash +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + set -e echo "begin to prepare data" diff --git a/models/match/multiview-simnet/evaluate_reader.py b/models/match/multiview-simnet/evaluate_reader.py index b6aa5799c35845dfeea1c1fadca6e2a2e3164a2a..e0f8f9e43de80d003834056ea417914f1d10e898 100755 --- a/models/match/multiview-simnet/evaluate_reader.py +++ b/models/match/multiview-simnet/evaluate_reader.py @@ -11,18 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import io -import copy -import random + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs class EvaluateReader(Reader): def init(self): - self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model") - self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model") + self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model") + self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model") self.all_slots = [] for i in range(self.query_slots): @@ -52,6 +49,7 @@ class EvaluateReader(Reader): if visit: self._all_slots_dict[slot][0] = False else: - output[index][1].append(padding) + output[index][1].append(padding) yield output + return data_iter diff --git a/models/match/multiview-simnet/generate_synthetic_data.py b/models/match/multiview-simnet/generate_synthetic_data.py index 98c93c5d831ef18f8b66352d477ebbfd48c47fae..d453e031cdca9be29892b913ea5f2636a6c05f5e 100755 --- a/models/match/multiview-simnet/generate_synthetic_data.py +++ b/models/match/multiview-simnet/generate_synthetic_data.py @@ -14,10 +14,12 @@ import random + class Dataset: def __init__(self): pass + class SyntheticDataset(Dataset): def __init__(self, sparse_feature_dim, query_slot_num, title_slot_num, dataset_size=10000): # ids are randomly generated @@ -39,7 +41,7 @@ class SyntheticDataset(Dataset): for i in range(self.query_slot_num): qslot = generate_ids(self.ids_per_slot, self.sparse_feature_dim) - qslot = [str(fea) + ':' + str(i) for fea in qslot] + qslot = [str(fea) + ':' + str(i) for fea in qslot] query_slots += qslot for i in range(self.title_slot_num): pt_slot = generate_ids(self.ids_per_slot, @@ -50,7 +52,8 @@ class SyntheticDataset(Dataset): for i in range(self.title_slot_num): nt_slot = generate_ids(self.ids_per_slot, self.sparse_feature_dim) - nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in nt_slot] + nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in + nt_slot] neg_title_slots += nt_slot yield query_slots + pos_title_slots + neg_title_slots else: @@ -67,6 +70,7 @@ class SyntheticDataset(Dataset): def test(self): return self._reader_creator(False) + if __name__ == '__main__': sparse_feature_dim = 1000001 query_slots = 1 @@ -75,7 +79,7 @@ if __name__ == '__main__': dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots, dataset_size) train_reader = dataset.train() test_reader = dataset.test() - + with open("data/train/train.txt", 'w') as fout: for data in train_reader(): fout.write(' '.join(data)) diff --git a/models/match/multiview-simnet/model.py b/models/match/multiview-simnet/model.py index aa8d8db676155a3839496aa8096acc35cb784e90..5ba9fb5d05b27339d924bfe42c0e6ba0c2c68da3 100755 --- a/models/match/multiview-simnet/model.py +++ b/models/match/multiview-simnet/model.py @@ -12,16 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import math import paddle.fluid as fluid -import paddle.fluid.layers as layers import paddle.fluid.layers.tensor as tensor import paddle.fluid.layers.control_flow as cf from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase + class BowEncoder(object): """ bow-encoder """ @@ -97,13 +95,14 @@ class SimpleEncoderFactory(object): rnn_encode = GrnnEncoder(hidden_size=enc_hid_size) return rnn_encode + class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) self.init_config() - + def init_config(self): - self._fetch_interval = 1 + self._fetch_interval = 1 query_encoder = envs.get_global_env("hyper_parameters.query_encoder", None, self._namespace) title_encoder = envs.get_global_env("hyper_parameters.title_encoder", None, self._namespace) query_encode_dim = envs.get_global_env("hyper_parameters.query_encode_dim", None, self._namespace) @@ -115,19 +114,19 @@ class Model(ModelBase): factory.create(query_encoder, query_encode_dim) for i in range(query_slots) ] - self.title_encoders = [ + self.title_encoders = [ factory.create(title_encoder, title_encode_dim) for i in range(title_slots) ] - self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) - self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim", None, self._namespace) - self.emb_shape = [self.emb_size, self.emb_dim] - self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace) - self.margin = 0.1 + self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) + self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim", None, self._namespace) + self.emb_shape = [self.emb_size, self.emb_dim] + self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace) + self.margin = 0.1 def input(self, is_train=True): - self.q_slots = [ + self.q_slots = [ fluid.data( name="%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.query_encoders)) @@ -138,22 +137,23 @@ class Model(ModelBase): for i in range(len(self.title_encoders)) ] - if is_train == False: - return self.q_slots + self.pt_slots + if is_train == False: + return self.q_slots + self.pt_slots self.nt_slots = [ fluid.data( - name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1, dtype='int64') + name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1, + dtype='int64') for i in range(len(self.title_encoders)) ] return self.q_slots + self.pt_slots + self.nt_slots - + def train_input(self): res = self.input() self._data_var = res - use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace) + use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace) if self._platform != "LINUX" or use_dataloader: self._data_loader = fluid.io.DataLoader.from_generator( @@ -161,15 +161,15 @@ class Model(ModelBase): def get_acc(self, x, y): less = tensor.cast(cf.less_than(x, y), dtype='float32') - label_ones = fluid.layers.fill_constant_batch_size_like( + label_ones = fluid.layers.fill_constant_batch_size_like( input=x, dtype='float32', shape=[-1, 1], value=1.0) correct = fluid.layers.reduce_sum(less) - total = fluid.layers.reduce_sum(label_ones) + total = fluid.layers.reduce_sum(label_ones) acc = fluid.layers.elementwise_div(correct, total) - return acc + return acc def net(self): - q_embs = [ + q_embs = [ fluid.embedding( input=query, size=self.emb_shape, param_attr="emb") for query in self.q_slots @@ -184,8 +184,8 @@ class Model(ModelBase): input=title, size=self.emb_shape, param_attr="emb") for title in self.nt_slots ] - - # encode each embedding field with encoder + + # encode each embedding field with encoder q_encodes = [ self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs) ] @@ -201,7 +201,7 @@ class Model(ModelBase): pt_concat = fluid.layers.concat(pt_encodes) nt_concat = fluid.layers.concat(nt_encodes) - # projection of hidden layer + # projection of hidden layer q_hid = fluid.layers.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w', @@ -219,7 +219,7 @@ class Model(ModelBase): cos_pos = fluid.layers.cos_sim(q_hid, pt_hid) cos_neg = fluid.layers.cos_sim(q_hid, nt_hid) - # pairwise hinge_loss + # pairwise hinge_loss loss_part1 = fluid.layers.elementwise_sub( tensor.fill_constant_batch_size_like( input=cos_pos, @@ -236,7 +236,7 @@ class Model(ModelBase): loss_part2) self.avg_cost = fluid.layers.mean(loss_part3) - self.acc = self.get_acc(cos_neg, cos_pos) + self.acc = self.get_acc(cos_neg, cos_pos) def avg_loss(self): self._cost = self.avg_cost @@ -253,19 +253,19 @@ class Model(ModelBase): def optimizer(self): learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) - optimizer = fluid.optimizer.Adam(learning_rate=learning_rate) - return optimizer + optimizer = fluid.optimizer.Adam(learning_rate=learning_rate) + return optimizer def infer_input(self): res = self.input(is_train=False) - self._infer_data_var = res + self._infer_data_var = res self._infer_data_loader = fluid.io.DataLoader.from_generator( feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) - + def infer_net(self): - self.infer_input() - # lookup embedding for each slot + self.infer_input() + # lookup embedding for each slot q_embs = [ fluid.embedding( input=query, size=self.emb_shape, param_attr="emb") @@ -276,14 +276,14 @@ class Model(ModelBase): input=title, size=self.emb_shape, param_attr="emb") for title in self.pt_slots ] - # encode each embedding field with encoder + # encode each embedding field with encoder q_encodes = [ self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs) ] pt_encodes = [ self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs) ] - # concat multi view for query, pos_title, neg_title + # concat multi view for query, pos_title, neg_title q_concat = fluid.layers.concat(q_encodes) pt_concat = fluid.layers.concat(pt_encodes) # projection of hidden layer diff --git a/models/match/multiview-simnet/reader.py b/models/match/multiview-simnet/reader.py index ea41fa88e14b6202f007e9a07122af0eadadd1f6..43cd1a629a7540e727e423a98d497964203134ac 100755 --- a/models/match/multiview-simnet/reader.py +++ b/models/match/multiview-simnet/reader.py @@ -11,18 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import io -import copy -import random + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs class TrainReader(Reader): def init(self): - self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model") - self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model") + self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model") + self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model") self.all_slots = [] for i in range(self.query_slots): @@ -55,6 +52,7 @@ class TrainReader(Reader): if visit: self._all_slots_dict[slot][0] = False else: - output[index][1].append(padding) + output[index][1].append(padding) yield output + return data_iter diff --git a/models/multitask/esmm/esmm_infer_reader.py b/models/multitask/esmm/esmm_infer_reader.py index 6e94a1eed07bab82fa80ece0041f9b1e94bb531d..8ca9eca67fdbb9e11f39db34b5dd9cfae518773b 100644 --- a/models/multitask/esmm/esmm_infer_reader.py +++ b/models/multitask/esmm/esmm_infer_reader.py @@ -13,19 +13,19 @@ # limitations under the License. from __future__ import print_function -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs from collections import defaultdict -import numpy as np + +from paddlerec.core.reader import Reader class EvaluateReader(Reader): def init(self): - all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', '129', + all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', + '129', '205', '206', '207', '210', '216', '508', '509', '702', '853', '301'] self.all_field_id_dict = defaultdict(int) - for i,field_id in enumerate(all_field_id): - self.all_field_id_dict[field_id] = [False,i] + for i, field_id in enumerate(all_field_id): + self.all_field_id_dict[field_id] = [False, i] def generate_sample(self, line): """ @@ -39,25 +39,26 @@ class EvaluateReader(Reader): features = line.strip().split(',') ctr = int(features[1]) cvr = int(features[2]) - + padding = 0 - output = [(field_id,[]) for field_id in self.all_field_id_dict] + output = [(field_id, []) for field_id in self.all_field_id_dict] for elem in features[4:]: - field_id,feat_id = elem.strip().split(':') + field_id, feat_id = elem.strip().split(':') if field_id not in self.all_field_id_dict: continue self.all_field_id_dict[field_id][0] = True index = self.all_field_id_dict[field_id][1] - output[index][1].append(int(feat_id)) - + output[index][1].append(int(feat_id)) + for field_id in self.all_field_id_dict: - visited,index = self.all_field_id_dict[field_id] + visited, index = self.all_field_id_dict[field_id] if visited: self.all_field_id_dict[field_id][0] = False else: - output[index][1].append(padding) + output[index][1].append(padding) output.append(('ctr', [ctr])) output.append(('cvr', [cvr])) yield output + return reader diff --git a/models/multitask/esmm/esmm_reader.py b/models/multitask/esmm/esmm_reader.py index f7aef643cd3db5c01c1cc3a632a99fa3fa342658..3d663038eefb4971b466336601ba436ff884e580 100644 --- a/models/multitask/esmm/esmm_reader.py +++ b/models/multitask/esmm/esmm_reader.py @@ -11,21 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs from collections import defaultdict -import numpy as np + +from paddlerec.core.reader import Reader class TrainReader(Reader): def init(self): - all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', '129', + all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', + '129', '205', '206', '207', '210', '216', '508', '509', '702', '853', '301'] self.all_field_id_dict = defaultdict(int) - for i,field_id in enumerate(all_field_id): - self.all_field_id_dict[field_id] = [False,i] + for i, field_id in enumerate(all_field_id): + self.all_field_id_dict[field_id] = [False, i] def generate_sample(self, line): """ @@ -37,30 +38,31 @@ class TrainReader(Reader): This function needs to be implemented by the user, based on data format """ features = line.strip().split(',') - #ctr = list(map(int, features[1])) - #cvr = list(map(int, features[2])) + # ctr = list(map(int, features[1])) + # cvr = list(map(int, features[2])) ctr = int(features[1]) cvr = int(features[2]) - + padding = 0 - output = [(field_id,[]) for field_id in self.all_field_id_dict] + output = [(field_id, []) for field_id in self.all_field_id_dict] for elem in features[4:]: - field_id,feat_id = elem.strip().split(':') + field_id, feat_id = elem.strip().split(':') if field_id not in self.all_field_id_dict: continue self.all_field_id_dict[field_id][0] = True index = self.all_field_id_dict[field_id][1] - #feat_id = list(map(int, feat_id)) - output[index][1].append(int(feat_id)) - + # feat_id = list(map(int, feat_id)) + output[index][1].append(int(feat_id)) + for field_id in self.all_field_id_dict: - visited,index = self.all_field_id_dict[field_id] + visited, index = self.all_field_id_dict[field_id] if visited: self.all_field_id_dict[field_id][0] = False else: - output[index][1].append(padding) + output[index][1].append(padding) output.append(('ctr', [ctr])) output.append(('cvr', [cvr])) yield output + return reader diff --git a/models/multitask/esmm/model.py b/models/multitask/esmm/model.py index 1641f72d1e4eab39cfe7ce1aa5055c25d139fb16..8a8a203a87504cff310c0a799df40e937e2bbde8 100644 --- a/models/multitask/esmm/model.py +++ b/models/multitask/esmm/model.py @@ -12,83 +12,84 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math +import numpy as np import paddle.fluid as fluid from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase -import numpy as np class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) - def fc(self,tag, data, out_dim, active='prelu'): - + def fc(self, tag, data, out_dim, active='prelu'): + init_stddev = 1.0 - scales = 1.0 / np.sqrt(data.shape[1]) - + scales = 1.0 / np.sqrt(data.shape[1]) + p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag, - initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=init_stddev * scales)) - + initializer=fluid.initializer.NormalInitializer(loc=0.0, + scale=init_stddev * scales)) + b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1)) - + out = fluid.layers.fc(input=data, - size=out_dim, - act=active, - param_attr=p_attr, - bias_attr =b_attr, - name=tag) + size=out_dim, + act=active, + param_attr=p_attr, + bias_attr=b_attr, + name=tag) return out - + def input_data(self): sparse_input_ids = [ - fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1) for i in range(0,23) + fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1) for i in range(0, 23) ] label_ctr = fluid.data(name="ctr", shape=[-1, 1], dtype="int64") label_cvr = fluid.data(name="cvr", shape=[-1, 1], dtype="int64") inputs = sparse_input_ids + [label_ctr] + [label_cvr] self._data_var.extend(inputs) - + return inputs - + def net(self, inputs, is_infer=False): - + vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace) embed_size = envs.get_global_env("hyper_parameters.embed_size", None, self._namespace) emb = [] for data in inputs[0:-2]: feat_emb = fluid.embedding(input=data, - size=[vocab_size, embed_size], - param_attr=fluid.ParamAttr(name='dis_emb', - learning_rate=5, - initializer=fluid.initializer.Xavier(fan_in=embed_size,fan_out=embed_size) - ), - is_sparse=True) - field_emb = fluid.layers.sequence_pool(input=feat_emb,pool_type='sum') + size=[vocab_size, embed_size], + param_attr=fluid.ParamAttr(name='dis_emb', + learning_rate=5, + initializer=fluid.initializer.Xavier( + fan_in=embed_size, fan_out=embed_size) + ), + is_sparse=True) + field_emb = fluid.layers.sequence_pool(input=feat_emb, pool_type='sum') emb.append(field_emb) concat_emb = fluid.layers.concat(emb, axis=1) - + # ctr active = 'relu' ctr_fc1 = self.fc('ctr_fc1', concat_emb, 200, active) ctr_fc2 = self.fc('ctr_fc2', ctr_fc1, 80, active) ctr_out = self.fc('ctr_out', ctr_fc2, 2, 'softmax') - + # cvr cvr_fc1 = self.fc('cvr_fc1', concat_emb, 200, active) cvr_fc2 = self.fc('cvr_fc2', cvr_fc1, 80, active) - cvr_out = self.fc('cvr_out', cvr_fc2, 2,'softmax') - + cvr_out = self.fc('cvr_out', cvr_fc2, 2, 'softmax') + ctr_clk = inputs[-2] ctcvr_buy = inputs[-1] - + ctr_prop_one = fluid.layers.slice(ctr_out, axes=[1], starts=[1], ends=[2]) cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2]) - + ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one) - ctcvr_prop = fluid.layers.concat(input=[1-ctcvr_prop_one,ctcvr_prop_one], axis = 1) + ctcvr_prop = fluid.layers.concat(input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1) auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk) auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy) @@ -98,27 +99,23 @@ class Model(ModelBase): self._infer_results["AUC_ctcvr"] = auc_ctcvr return - loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk) loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy) cost = loss_ctr + loss_ctcvr avg_cost = fluid.layers.mean(cost) - self._cost = avg_cost self._metrics["AUC_ctr"] = auc_ctr self._metrics["BATCH_AUC_ctr"] = batch_auc_ctr self._metrics["AUC_ctcvr"] = auc_ctcvr self._metrics["BATCH_AUC_ctcvr"] = batch_auc_ctcvr - def train_net(self): input_data = self.input_data() self.net(input_data) - def infer_net(self): self._infer_data_var = self.input_data() self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) self.net(self._infer_data_var, is_infer=True) diff --git a/models/multitask/mmoe/census_infer_reader.py b/models/multitask/mmoe/census_infer_reader.py index c25ccea8c4416bbfe64d3cdda59f5ee13b0cfac1..c62de8e69ce6ccfbb4df1e1252d9630a84fc56b3 100644 --- a/models/multitask/mmoe/census_infer_reader.py +++ b/models/multitask/mmoe/census_infer_reader.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs -import numpy as np class EvaluateReader(Reader): diff --git a/models/multitask/mmoe/census_reader.py b/models/multitask/mmoe/census_reader.py index 30997c632302c2a6e963099c9e2679978c9d9b61..211e566882e5d8a7f50f22b0a1628307777099c8 100644 --- a/models/multitask/mmoe/census_reader.py +++ b/models/multitask/mmoe/census_reader.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs -import numpy as np class TrainReader(Reader): @@ -44,8 +43,8 @@ class TrainReader(Reader): label_marital = [1, 0] elif int(l[0]) == 1: label_marital = [0, 1] - #label_income = np.array(label_income) - #label_marital = np.array(label_marital) + # label_income = np.array(label_income) + # label_marital = np.array(label_marital) feature_name = ["input", "label_income", "label_marital"] yield zip(feature_name, [data] + [label_income] + [label_marital]) diff --git a/models/multitask/mmoe/model.py b/models/multitask/mmoe/model.py index 753bacae77f089dc709e34812ef109ac06aebcfb..525e9d5cc0086757901262253cf0f23ee72f314c 100644 --- a/models/multitask/mmoe/model.py +++ b/models/multitask/mmoe/model.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math import paddle.fluid as fluid from paddlerec.core.utils import envs @@ -37,22 +36,21 @@ class Model(ModelBase): if is_infer: self._infer_data_var = [input_data, label_income, label_marital] self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) - + feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + self._data_var.extend([input_data, label_income, label_marital]) # f_{i}(x) = activation(W_{i} * x + b), where activation is ReLU according to the paper expert_outputs = [] for i in range(0, expert_num): expert_output = fluid.layers.fc(input=input_data, - size=expert_size, - act='relu', - bias_attr=fluid.ParamAttr(learning_rate=1.0), - name='expert_' + str(i)) + size=expert_size, + act='relu', + bias_attr=fluid.ParamAttr(learning_rate=1.0), + name='expert_' + str(i)) expert_outputs.append(expert_output) expert_concat = fluid.layers.concat(expert_outputs, axis=1) - expert_concat = fluid.layers.reshape(expert_concat,[-1, expert_num, expert_size]) - - + expert_concat = fluid.layers.reshape(expert_concat, [-1, expert_num, expert_size]) + # g^{k}(x) = activation(W_{gk} * x + b), where activation is softmax according to the paper output_layers = [] for i in range(0, gate_num): @@ -62,52 +60,53 @@ class Model(ModelBase): bias_attr=fluid.ParamAttr(learning_rate=1.0), name='gate_' + str(i)) # f^{k}(x) = sum_{i=1}^{n}(g^{k}(x)_{i} * f_{i}(x)) - cur_gate_expert = fluid.layers.elementwise_mul(expert_concat, cur_gate, axis=0) + cur_gate_expert = fluid.layers.elementwise_mul(expert_concat, cur_gate, axis=0) cur_gate_expert = fluid.layers.reduce_sum(cur_gate_expert, dim=1) # Build tower layer - cur_tower = fluid.layers.fc(input=cur_gate_expert, - size=tower_size, - act='relu', - name='task_layer_' + str(i)) - out = fluid.layers.fc(input=cur_tower, - size=2, - act='softmax', - name='out_' + str(i)) - + cur_tower = fluid.layers.fc(input=cur_gate_expert, + size=tower_size, + act='relu', + name='task_layer_' + str(i)) + out = fluid.layers.fc(input=cur_tower, + size=2, + act='softmax', + name='out_' + str(i)) + output_layers.append(out) pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15) pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15) - label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2]) label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2]) - - auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, label=fluid.layers.cast(x=label_income_1, dtype='int64')) - auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital, label=fluid.layers.cast(x=label_marital_1, dtype='int64')) + + auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, + label=fluid.layers.cast(x=label_income_1, + dtype='int64')) + auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital, + label=fluid.layers.cast(x=label_marital_1, + dtype='int64')) if is_infer: self._infer_results["AUC_income"] = auc_income self._infer_results["AUC_marital"] = auc_marital return - cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income,soft_label = True) - cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital,soft_label = True) - + cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True) + cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True) + avg_cost_income = fluid.layers.mean(x=cost_income) avg_cost_marital = fluid.layers.mean(x=cost_marital) - - cost = avg_cost_income + avg_cost_marital - + + cost = avg_cost_income + avg_cost_marital + self._cost = cost self._metrics["AUC_income"] = auc_income self._metrics["BATCH_AUC_income"] = batch_auc_1 self._metrics["AUC_marital"] = auc_marital self._metrics["BATCH_AUC_marital"] = batch_auc_2 - def train_net(self): self.MMOE() - def infer_net(self): self.MMOE(is_infer=True) diff --git a/models/multitask/share-bottom/census_infer_reader.py b/models/multitask/share-bottom/census_infer_reader.py index c25ccea8c4416bbfe64d3cdda59f5ee13b0cfac1..c62de8e69ce6ccfbb4df1e1252d9630a84fc56b3 100644 --- a/models/multitask/share-bottom/census_infer_reader.py +++ b/models/multitask/share-bottom/census_infer_reader.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs -import numpy as np class EvaluateReader(Reader): diff --git a/models/multitask/share-bottom/census_reader.py b/models/multitask/share-bottom/census_reader.py index 30997c632302c2a6e963099c9e2679978c9d9b61..211e566882e5d8a7f50f22b0a1628307777099c8 100644 --- a/models/multitask/share-bottom/census_reader.py +++ b/models/multitask/share-bottom/census_reader.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs -import numpy as np class TrainReader(Reader): @@ -44,8 +43,8 @@ class TrainReader(Reader): label_marital = [1, 0] elif int(l[0]) == 1: label_marital = [0, 1] - #label_income = np.array(label_income) - #label_marital = np.array(label_marital) + # label_income = np.array(label_income) + # label_marital = np.array(label_marital) feature_name = ["input", "label_income", "label_marital"] yield zip(feature_name, [data] + [label_income] + [label_marital]) diff --git a/models/multitask/share-bottom/model.py b/models/multitask/share-bottom/model.py index a91729953421704986725ccd3540fa2fc29e27e4..d570ba77067985b518247c8f6bba16a6431e1f9c 100644 --- a/models/multitask/share-bottom/model.py +++ b/models/multitask/share-bottom/model.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math import paddle.fluid as fluid from paddlerec.core.utils import envs @@ -33,65 +32,65 @@ class Model(ModelBase): input_data = fluid.data(name="input", shape=[-1, feature_size], dtype="float32") label_income = fluid.data(name="label_income", shape=[-1, 2], dtype="float32", lod_level=0) label_marital = fluid.data(name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0) - + if is_infer: self._infer_data_var = [input_data, label_income, label_marital] self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) self._data_var.extend([input_data, label_income, label_marital]) bottom_output = fluid.layers.fc(input=input_data, - size=bottom_size, - act='relu', - bias_attr=fluid.ParamAttr(learning_rate=1.0), - name='bottom_output') - - + size=bottom_size, + act='relu', + bias_attr=fluid.ParamAttr(learning_rate=1.0), + name='bottom_output') + # Build tower layer from bottom layer output_layers = [] - for index in range(tower_nums): + for index in range(tower_nums): tower_layer = fluid.layers.fc(input=bottom_output, - size=tower_size, - act='relu', - name='task_layer_' + str(index)) + size=tower_size, + act='relu', + name='task_layer_' + str(index)) output_layer = fluid.layers.fc(input=tower_layer, - size=2, - act='softmax', - name='output_layer_' + str(index)) + size=2, + act='softmax', + name='output_layer_' + str(index)) output_layers.append(output_layer) - pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15) pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15) label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2]) label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2]) - - auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, label=fluid.layers.cast(x=label_income_1, dtype='int64')) - auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital, label=fluid.layers.cast(x=label_marital_1, dtype='int64')) + + auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, + label=fluid.layers.cast(x=label_income_1, + dtype='int64')) + auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital, + label=fluid.layers.cast(x=label_marital_1, + dtype='int64')) if is_infer: self._infer_results["AUC_income"] = auc_income self._infer_results["AUC_marital"] = auc_marital return - cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income,soft_label = True) - cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital,soft_label = True) + cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True) + cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True) cost = fluid.layers.elementwise_add(cost_income, cost_marital, axis=1) - - avg_cost = fluid.layers.mean(x=cost) - + + avg_cost = fluid.layers.mean(x=cost) + self._cost = avg_cost self._metrics["AUC_income"] = auc_income self._metrics["BATCH_AUC_income"] = batch_auc_1 self._metrics["AUC_marital"] = auc_marital self._metrics["BATCH_AUC_marital"] = batch_auc_2 - def train_net(self): self.model() - def infer_net(self): self.model(is_infer=True) diff --git a/models/rank/criteo_reader.py b/models/rank/criteo_reader.py index 3d8fcf96a4be04f60fb9928e4db2190b306813ff..75994fb43f6ee3a72ab2aae25c36e0591c530fee 100755 --- a/models/rank/criteo_reader.py +++ b/models/rank/criteo_reader.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function from paddlerec.core.reader import Reader diff --git a/models/rank/dcn/criteo_reader.py b/models/rank/dcn/criteo_reader.py index a526d0fcbf2ce303dcc537b1ed8a6b0f50b73506..f136f6d933ed510e33cb645b365e7b68d1206237 100755 --- a/models/rank/dcn/criteo_reader.py +++ b/models/rank/dcn/criteo_reader.py @@ -11,18 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function import math -import sys +import os -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs try: import cPickle as pickle except ImportError: import pickle -from collections import Counter -import os + +from paddlerec.core.reader import Reader +from paddlerec.core.utils import envs + class TrainReader(Reader): def init(self): @@ -45,7 +46,7 @@ class TrainReader(Reader): self.label_feat_names = target + dense_feat_names + sparse_feat_names self.cat_feat_idx_dict_list = [{} for _ in range(26)] - + # TODO: set vocabulary dictionary vocab_dir = envs.get_global_env("feat_dict_name", None, "train.reader") for i in range(26): @@ -53,7 +54,7 @@ class TrainReader(Reader): for line in open( os.path.join(vocab_dir, 'C' + str(i + 1) + '.txt')): self.cat_feat_idx_dict_list[i][line.strip()] = lookup_idx - lookup_idx += 1 + lookup_idx += 1 def _process_line(self, line): features = line.rstrip('\n').split('\t') @@ -71,20 +72,21 @@ class TrainReader(Reader): if idx == 2 else math.log(1 + float(features[idx]))) for idx in self.cat_idx_: if features[idx] == '' or features[ - idx] not in self.cat_feat_idx_dict_list[idx - 14]: + idx] not in self.cat_feat_idx_dict_list[idx - 14]: label_feat_list[idx].append(0) else: label_feat_list[idx].append(self.cat_feat_idx_dict_list[ - idx - 14][features[idx]]) + idx - 14][features[idx]]) label_feat_list[0].append(int(features[0])) return label_feat_list - + def generate_sample(self, line): """ Read the data line by line and process it as a dictionary """ + def data_iter(): label_feat_list = self._process_line(line) yield list(zip(self.label_feat_names, label_feat_list)) - return data_iter \ No newline at end of file + return data_iter diff --git a/models/rank/dcn/model.py b/models/rank/dcn/model.py index 7fbb3c6a2a2120159e9a0557a6583731d1d549eb..23968503147c1effd6b5d492b85f0536f0c45951 100755 --- a/models/rank/dcn/model.py +++ b/models/rank/dcn/model.py @@ -12,17 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +from collections import OrderedDict + import paddle.fluid as fluid -import math from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase -from collections import OrderedDict + class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) - + def init_network(self): self.cross_num = envs.get_global_env("hyper_parameters.cross_num", None, self._namespace) self.dnn_hidden_units = envs.get_global_env("hyper_parameters.dnn_hidden_units", None, self._namespace) @@ -49,7 +50,7 @@ class Model(ModelBase): self.net_input = None self.loss = None - + def _create_embedding_input(self, data_dict): # sparse embedding sparse_emb_dict = OrderedDict((name, fluid.embedding( @@ -77,7 +78,7 @@ class Model(ModelBase): net_input = fluid.layers.concat([dense_input, sparse_input], axis=-1) return net_input - + def _deep_net(self, input, hidden_units, use_bn=False, is_test=False): for units in hidden_units: input = fluid.layers.fc(input=input, size=units) @@ -94,7 +95,7 @@ class Model(ModelBase): [input_dim], dtype='float32', name=prefix + "_b") xw = fluid.layers.reduce_sum(x * w, dim=1, keep_dim=True) # (N, 1) return x0 * xw + b + x, w - + def _cross_net(self, input, num_corss_layers): x = x0 = input l2_reg_cross_list = [] @@ -105,10 +106,10 @@ class Model(ModelBase): fluid.layers.concat( l2_reg_cross_list, axis=-1)) return x, l2_reg_cross_loss - + def _l2_loss(self, w): return fluid.layers.reduce_sum(fluid.layers.square(w)) - + def train_net(self): self.init_network() self.target_input = fluid.data( @@ -117,14 +118,14 @@ class Model(ModelBase): for feat_name in self.feat_dims_dict: data_dict[feat_name] = fluid.data( name=feat_name, shape=[None, 1], dtype='float32') - + self.net_input = self._create_embedding_input(data_dict) - + deep_out = self._deep_net(self.net_input, self.dnn_hidden_units, self.dnn_use_bn, False) cross_out, l2_reg_cross_loss = self._cross_net(self.net_input, - self.cross_num) - + self.cross_num) + last_out = fluid.layers.concat([deep_out, cross_out], axis=-1) logit = fluid.layers.fc(last_out, 1) @@ -140,7 +141,6 @@ class Model(ModelBase): input=prob_2d, label=label_int, slide_steps=0) self._metrics["AUC"] = auc_var self._metrics["BATCH_AUC"] = batch_auc_var - # logloss logloss = fluid.layers.log_loss(self.prob, self.target_input) diff --git a/models/rank/deepfm/criteo_reader.py b/models/rank/deepfm/criteo_reader.py index ae942e0a82eec7d90b0dab618310dc900985a4ee..7ed215bd763283076c66f6bd6ef0e66e132ef5d4 100755 --- a/models/rank/deepfm/criteo_reader.py +++ b/models/rank/deepfm/criteo_reader.py @@ -11,15 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs try: import cPickle as pickle except ImportError: import pickle +from paddlerec.core.reader import Reader +from paddlerec.core.utils import envs + + class TrainReader(Reader): def init(self): self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] @@ -35,7 +38,7 @@ class TrainReader(Reader): self.categorical_range_ = range(14, 40) # load preprocessed feature dict self.feat_dict_name = envs.get_global_env("feat_dict_name", None, "train.reader") - self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb')) + self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb')) def _process_line(self, line): features = line.rstrip('\n').split('\t') @@ -59,13 +62,14 @@ class TrainReader(Reader): feat_value.append(1.0) label = [int(features[0])] return feat_idx, feat_value, label - + def generate_sample(self, line): """ Read the data line by line and process it as a dictionary """ + def data_iter(): feat_idx, feat_value, label = self._process_line(line) yield [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)] - return data_iter \ No newline at end of file + return data_iter diff --git a/models/rank/deepfm/model.py b/models/rank/deepfm/model.py index 8636078d87e65eb3c7acacf528e0592d10506a45..3c5b128044ef050d745df9380cade04f1a88bf5c 100755 --- a/models/rank/deepfm/model.py +++ b/models/rank/deepfm/model.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.fluid as fluid import math +import paddle.fluid as fluid + from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase @@ -28,26 +29,27 @@ class Model(ModelBase): is_distributed = True if envs.get_trainer() == "CtrTrainer" else False sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace) sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) - + # ------------------------- network input -------------------------- - + num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace) - raw_feat_idx = fluid.data(name='feat_idx', shape=[None, num_field], dtype='int64') # None * num_field(defalut:39) - raw_feat_value = fluid.data(name='feat_value', shape=[None, num_field], dtype='float32') # None * num_field + raw_feat_idx = fluid.data(name='feat_idx', shape=[None, num_field], + dtype='int64') # None * num_field(defalut:39) + raw_feat_value = fluid.data(name='feat_value', shape=[None, num_field], dtype='float32') # None * num_field self.label = fluid.data(name='label', shape=[None, 1], dtype='float32') # None * 1 - feat_idx = fluid.layers.reshape(raw_feat_idx,[-1, 1]) # (None * num_field) * 1 + feat_idx = fluid.layers.reshape(raw_feat_idx, [-1, 1]) # (None * num_field) * 1 feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1]) # None * num_field * 1 - + # ------------------------- set _data_var -------------------------- - + self._data_var.append(raw_feat_idx) self._data_var.append(raw_feat_value) self._data_var.append(self.label) if self._platform != "LINUX": self._data_loader = fluid.io.DataLoader.from_generator( feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False) - - #------------------------- first order term -------------------------- + + # ------------------------- first order term -------------------------- reg = envs.get_global_env("hyper_parameters.reg", 1e-4, self._namespace) first_weights_re = fluid.embedding( @@ -65,7 +67,7 @@ class Model(ModelBase): first_weights_re, shape=[-1, num_field, 1]) # None * num_field * 1 y_first_order = fluid.layers.reduce_sum((first_weights * feat_value), 1) - #------------------------- second order term -------------------------- + # ------------------------- second order term -------------------------- feat_embeddings_re = fluid.embedding( input=feat_idx, @@ -80,12 +82,12 @@ class Model(ModelBase): feat_embeddings = fluid.layers.reshape( feat_embeddings_re, shape=[-1, num_field, - sparse_feature_dim]) # None * num_field * embedding_size + sparse_feature_dim]) # None * num_field * embedding_size feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size - + # sum_square part summed_features_emb = fluid.layers.reduce_sum(feat_embeddings, - 1) # None * embedding_size + 1) # None * embedding_size summed_features_emb_square = fluid.layers.square( summed_features_emb) # None * embedding_size @@ -99,13 +101,12 @@ class Model(ModelBase): summed_features_emb_square - squared_sum_features_emb, 1, keep_dim=True) # None * 1 - - #------------------------- DNN -------------------------- + # ------------------------- DNN -------------------------- layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace) act = envs.get_global_env("hyper_parameters.act", None, self._namespace) y_dnn = fluid.layers.reshape(feat_embeddings, - [-1, num_field * sparse_feature_dim]) + [-1, num_field * sparse_feature_dim]) for s in layer_sizes: y_dnn = fluid.layers.fc( input=y_dnn, @@ -127,28 +128,28 @@ class Model(ModelBase): bias_attr=fluid.ParamAttr( initializer=fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=init_value_))) - - #------------------------- DeepFM -------------------------- + + # ------------------------- DeepFM -------------------------- self.predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn) - + def train_net(self): self.deepfm_net() - - #------------------------- Cost(logloss) -------------------------- + + # ------------------------- Cost(logloss) -------------------------- cost = fluid.layers.log_loss(input=self.predict, label=self.label) avg_cost = fluid.layers.reduce_sum(cost) - + self._cost = avg_cost - #------------------------- Metric(Auc) -------------------------- - + # ------------------------- Metric(Auc) -------------------------- + predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1) label_int = fluid.layers.cast(self.label, 'int64') auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d, - label=label_int, - slide_steps=0) + label=label_int, + slide_steps=0) self._metrics["AUC"] = auc_var self._metrics["BATCH_AUC"] = batch_auc_var @@ -158,4 +159,4 @@ class Model(ModelBase): return optimizer def infer_net(self, parameter_list): - self.deepfm_net() \ No newline at end of file + self.deepfm_net() diff --git a/models/rank/din/model.py b/models/rank/din/model.py index e218bb7f5b20b549fab42e74e75ecadbae8499c3..2abc658b6d5cb58aaff222e1121d2c4282bcd65f 100755 --- a/models/rank/din/model.py +++ b/models/rank/din/model.py @@ -13,7 +13,6 @@ # limitations under the License. import paddle.fluid as fluid -import math from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase diff --git a/models/rank/din/reader.py b/models/rank/din/reader.py index 696e3d2502d1dfa5b53b70ca9f5a497f18a1ae5d..39ed690fdc6fd35d50ebdcb46b5becc5ae399b62 100755 --- a/models/rank/din/reader.py +++ b/models/rank/din/reader.py @@ -13,24 +13,28 @@ # limitations under the License. from __future__ import print_function -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs -import numpy as np import os import random + try: import cPickle as pickle except ImportError: import pickle +import numpy as np + +from paddlerec.core.reader import Reader +from paddlerec.core.utils import envs + + class TrainReader(Reader): def init(self): self.train_data_path = envs.get_global_env("train_data_path", None, "train.reader") self.res = [] self.max_len = 0 - + data_file_list = os.listdir(self.train_data_path) - for i in range(0, len(data_file_list)): + for i in range(0, len(data_file_list)): train_data_file = os.path.join(self.train_data_path, data_file_list[i]) with open(train_data_file, "r") as fin: for line in fin: @@ -43,9 +47,6 @@ class TrainReader(Reader): self.batch_size = envs.get_global_env("batch_size", 32, "train.reader") self.group_size = self.batch_size * 20 - - - def _process_line(self, line): line = line.strip().split(';') hist = line[0].split() @@ -54,22 +55,22 @@ class TrainReader(Reader): cate = [int(i) for i in cate] return [hist, cate, [int(line[2])], [int(line[3])], [float(line[4])]] - def generate_sample(self, line): """ Read the data line by line and process it as a dictionary """ + def data_iter(): - #feat_idx, feat_value, label = self._process_line(line) + # feat_idx, feat_value, label = self._process_line(line) yield self._process_line(line) return data_iter - + def pad_batch_data(self, input, max_len): res = np.array([x + [0] * (max_len - len(x)) for x in input]) res = res.astype("int64").reshape([-1, max_len]) return res - + def make_data(self, b): max_len = max(len(x[0]) for x in b) item = self.pad_batch_data([x[0] for x in b], max_len) @@ -77,7 +78,7 @@ class TrainReader(Reader): len_array = [len(x[0]) for x in b] mask = np.array( [[0] * x + [-1e9] * (max_len - x) for x in len_array]).reshape( - [-1, max_len, 1]) + [-1, max_len, 1]) target_item_seq = np.array( [[x[2]] * max_len for x in b]).astype("int64").reshape([-1, max_len]) target_cat_seq = np.array( @@ -89,7 +90,7 @@ class TrainReader(Reader): target_item_seq[i], target_cat_seq[i] ]) return res - + def batch_reader(self, reader, batch_size, group_size): def batch_reader(): bg = [] @@ -111,7 +112,7 @@ class TrainReader(Reader): yield self.make_data(b) return batch_reader - + def base_read(self, file_dir): res = [] for train_file in file_dir: @@ -122,10 +123,8 @@ class TrainReader(Reader): cate = line[1].split() res.append([hist, cate, line[2], line[3], float(line[4])]) return res - + def generate_batch_from_trainfiles(self, files): data_set = self.base_read(files) random.shuffle(data_set) return self.batch_reader(data_set, self.batch_size, self.batch_size * 20) - - diff --git a/models/rank/dnn/model.py b/models/rank/dnn/model.py index 7a5ed0d421d851852bbe80ae9461c4dc6bbc6627..67bf61a04f0e93067f7a23fa3699ad08e6ce137f 100755 --- a/models/rank/dnn/model.py +++ b/models/rank/dnn/model.py @@ -13,6 +13,7 @@ # limitations under the License. import math + import paddle.fluid as fluid from paddlerec.core.utils import envs diff --git a/models/rank/wide_deep/model.py b/models/rank/wide_deep/model.py index 8e39cafc5f59a9c21486f826566dcda3951ec51a..2e932023b157d4a20f13b5e6637960e399a7d235 100755 --- a/models/rank/wide_deep/model.py +++ b/models/rank/wide_deep/model.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.fluid as fluid import math +import paddle.fluid as fluid + from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase @@ -22,32 +23,39 @@ from paddlerec.core.model import Model as ModelBase class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) - + def wide_part(self, data): out = fluid.layers.fc(input=data, - size=1, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1])), - regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)), - act=None, - name='wide') + size=1, + param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, + scale=1.0 / math.sqrt( + data.shape[ + 1])), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4)), + act=None, + name='wide') return out - + def fc(self, data, hidden_units, active, tag): output = fluid.layers.fc(input=data, - size=hidden_units, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))), - act=active, - name=tag) - + size=hidden_units, + param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, + scale=1.0 / math.sqrt( + data.shape[ + 1]))), + act=active, + name=tag) + return output - + def deep_part(self, data, hidden1_units, hidden2_units, hidden3_units): l1 = self.fc(data, hidden1_units, 'relu', 'l1') l2 = self.fc(l1, hidden2_units, 'relu', 'l2') l3 = self.fc(l2, hidden3_units, 'relu', 'l3') - + return l3 - + def train_net(self): wide_input = fluid.data(name='wide_input', shape=[None, 8], dtype='float32') deep_input = fluid.data(name='deep_input', shape=[None, 58], dtype='float32') @@ -61,31 +69,33 @@ class Model(ModelBase): hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units", 25, self._namespace) wide_output = self.wide_part(wide_input) deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units, hidden3_units) - + wide_model = fluid.layers.fc(input=wide_output, - size=1, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)), - act=None, - name='w_wide') - + size=1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)), + act=None, + name='w_wide') + deep_model = fluid.layers.fc(input=deep_output, - size=1, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)), - act=None, - name='w_deep') - + size=1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)), + act=None, + name='w_deep') + prediction = fluid.layers.elementwise_add(wide_model, deep_model) pred = fluid.layers.sigmoid(fluid.layers.clip(prediction, min=-15.0, max=15.0), name="prediction") num_seqs = fluid.layers.create_tensor(dtype='int64') acc = fluid.layers.accuracy(input=pred, label=fluid.layers.cast(x=label, dtype='int64'), total=num_seqs) auc_var, batch_auc, auc_states = fluid.layers.auc(input=pred, label=fluid.layers.cast(x=label, dtype='int64')) - + self._metrics["AUC"] = auc_var self._metrics["BATCH_AUC"] = batch_auc self._metrics["ACC"] = acc - cost = fluid.layers.sigmoid_cross_entropy_with_logits(x=prediction, label=label) + cost = fluid.layers.sigmoid_cross_entropy_with_logits(x=prediction, label=label) avg_cost = fluid.layers.mean(cost) self._cost = avg_cost @@ -95,4 +105,4 @@ class Model(ModelBase): return optimizer def infer_net(self, parameter_list): - self.deepfm_net() \ No newline at end of file + self.deepfm_net() diff --git a/models/rank/wide_deep/reader.py b/models/rank/wide_deep/reader.py index 945dd4a5084ea0eb0ef4b9cd21d640d95f29c5e1..f783ad2fb7f386e48af569062286a9afaa9dbe90 100755 --- a/models/rank/wide_deep/reader.py +++ b/models/rank/wide_deep/reader.py @@ -11,15 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs try: import cPickle as pickle except ImportError: import pickle +from paddlerec.core.reader import Reader + + class TrainReader(Reader): def init(self): pass @@ -28,16 +30,17 @@ class TrainReader(Reader): line = line.strip().split(',') features = list(map(float, line)) wide_feat = features[0:8] - deep_feat = features[8:58+8] + deep_feat = features[8:58 + 8] label = features[-1] return wide_feat, deep_feat, [label] - + def generate_sample(self, line): """ Read the data line by line and process it as a dictionary """ + def data_iter(): wide_feat, deep_deat, label = self._process_line(line) yield [('wide_input', wide_feat), ('deep_input', deep_deat), ('label', label)] - return data_iter \ No newline at end of file + return data_iter diff --git a/models/rank/xdeepfm/criteo_reader.py b/models/rank/xdeepfm/criteo_reader.py index ee2dac5da5739cfbf0b832ba5d09ecd9d878b442..fe4542d5d46227a0b6eaa4a96737216bd68628a4 100755 --- a/models/rank/xdeepfm/criteo_reader.py +++ b/models/rank/xdeepfm/criteo_reader.py @@ -11,19 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs try: import cPickle as pickle except ImportError: import pickle -class TrainReader(Reader): +from paddlerec.core.reader import Reader + + +class TrainReader(Reader): def init(self): pass - + def _process_line(self, line): features = line.strip('\n').split('\t') feat_idx = [] @@ -33,11 +35,11 @@ class TrainReader(Reader): feat_value.append(1.0) label = [int(features[0])] return feat_idx, feat_value, label - + def generate_sample(self, line): def data_iter(): feat_idx, feat_value, label = self._process_line(line) yield [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)] - return data_iter \ No newline at end of file + return data_iter diff --git a/models/rank/xdeepfm/model.py b/models/rank/xdeepfm/model.py index d92ce81e3ec755b5d5200210ca4009bbf52a65eb..213c29b5acdcdb0dd26303b52858e1a8e28073e7 100755 --- a/models/rank/xdeepfm/model.py +++ b/models/rank/xdeepfm/model.py @@ -13,7 +13,6 @@ # limitations under the License. import paddle.fluid as fluid -import math from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase @@ -27,13 +26,13 @@ class Model(ModelBase): init_value_ = 0.1 initer = fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=init_value_) - + is_distributed = True if envs.get_trainer() == "CtrTrainer" else False sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace) sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) - + # ------------------------- network input -------------------------- - + num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace) raw_feat_idx = fluid.data(name='feat_idx', shape=[None, num_field], dtype='int64') raw_feat_value = fluid.data(name='feat_value', shape=[None, num_field], dtype='float32') @@ -52,16 +51,16 @@ class Model(ModelBase): feat_embeddings, [-1, num_field, sparse_feature_dim]) # None * num_field * embedding_size feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size - + # ------------------------- set _data_var -------------------------- - + self._data_var.append(raw_feat_idx) self._data_var.append(raw_feat_value) self._data_var.append(self.label) if self._platform != "LINUX": self._data_loader = fluid.io.DataLoader.from_generator( feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False) - + # -------------------- linear -------------------- weights_linear = fluid.embedding( @@ -79,7 +78,7 @@ class Model(ModelBase): default_initializer=fluid.initializer.ConstantInitializer(value=0)) y_linear = fluid.layers.reduce_sum( (weights_linear * feat_value), 1) + b_linear - + # -------------------- CIN -------------------- layer_sizes_cin = envs.get_global_env("hyper_parameters.layer_sizes_cin", None, self._namespace) @@ -90,7 +89,7 @@ class Model(ModelBase): X_0 = fluid.layers.reshape( fluid.layers.transpose(Xs[0], [0, 2, 1]), [-1, sparse_feature_dim, num_field, - 1]) # None, embedding_size, num_field, 1 + 1]) # None, embedding_size, num_field, 1 X_k = fluid.layers.reshape( fluid.layers.transpose(Xs[-1], [0, 2, 1]), [-1, sparse_feature_dim, 1, last_s]) # None, embedding_size, 1, last_s @@ -136,7 +135,7 @@ class Model(ModelBase): layer_sizes_dnn = envs.get_global_env("hyper_parameters.layer_sizes_dnn", None, self._namespace) act = envs.get_global_env("hyper_parameters.act", None, self._namespace) y_dnn = fluid.layers.reshape(feat_embeddings, - [-1, num_field * sparse_feature_dim]) + [-1, num_field * sparse_feature_dim]) for s in layer_sizes_dnn: y_dnn = fluid.layers.fc(input=y_dnn, size=s, @@ -152,7 +151,7 @@ class Model(ModelBase): # ------------------- xDeepFM ------------------ self.predict = fluid.layers.sigmoid(y_linear + y_cin + y_dnn) - + def train_net(self): self.xdeepfm_net() @@ -164,15 +163,15 @@ class Model(ModelBase): predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1) label_int = fluid.layers.cast(self.label, 'int64') auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d, - label=label_int, - slide_steps=0) + label=label_int, + slide_steps=0) self._metrics["AUC"] = auc_var self._metrics["BATCH_AUC"] = batch_auc_var - + def optimizer(self): learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) return optimizer def infer_net(self, parameter_list): - self.xdeepfm_net() \ No newline at end of file + self.xdeepfm_net() diff --git a/models/recall/gnn/data_process.sh b/models/recall/gnn/data_process.sh index 9aa009f03cf2595ea0cb54e691800486f26a21bf..38877b6906ecd65ef190aae5f1dcf5a74cece6d0 100755 --- a/models/recall/gnn/data_process.sh +++ b/models/recall/gnn/data_process.sh @@ -1,5 +1,19 @@ #! /bin/bash +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -e echo "begin to download data" diff --git a/models/recall/gnn/evaluate_reader.py b/models/recall/gnn/evaluate_reader.py index 3f6287767dbaad7dc9a2539347ad73b16a9f87de..904140c2febf5164592348d0b4e8f90f197bbf06 100755 --- a/models/recall/gnn/evaluate_reader.py +++ b/models/recall/gnn/evaluate_reader.py @@ -11,10 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import io + import copy import random + +import numpy as np + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs @@ -22,17 +24,17 @@ from paddlerec.core.utils import envs class EvaluateReader(Reader): def init(self): self.batch_size = envs.get_global_env("batch_size", None, "evaluate.reader") - + self.input = [] self.length = None def base_read(self, files): res = [] for f in files: - with open(f, "r") as fin: + with open(f, "r") as fin: for line in fin: - line = line.strip().split('\t') - res.append(tuple([map(int, line[0].split(',')), int(line[1])])) + line = line.strip().split('\t') + res.append(tuple([map(int, line[0].split(',')), int(line[1])])) return res def make_data(self, cur_batch, batch_size): @@ -120,10 +122,11 @@ class EvaluateReader(Reader): else: # Due to fixed batch_size, discard the remaining ins return - #cur_batch = remain_data[i:] - #yield self.make_data(cur_batch, group_remain % batch_size) + # cur_batch = remain_data[i:] + # yield self.make_data(cur_batch, group_remain % batch_size) + return _reader - + def generate_batch_from_trainfiles(self, files): self.input = self.base_read(files) self.length = len(self.input) @@ -132,4 +135,5 @@ class EvaluateReader(Reader): def generate_sample(self, line): def data_iter(): yield [] + return data_iter diff --git a/models/recall/gnn/model.py b/models/recall/gnn/model.py index ac628ec6209e9489ac20550a7837b60c9e8e3771..b98625a6afc094e106b26d1e2b31a8712a9d7b94 100755 --- a/models/recall/gnn/model.py +++ b/models/recall/gnn/model.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import math +import numpy as np + import paddle.fluid as fluid import paddle.fluid.layers as layers @@ -25,19 +26,19 @@ class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) self.init_config() - + def init_config(self): self._fetch_interval = 1 - self.items_num, self.ins_num = self.config_read(envs.get_global_env("hyper_parameters.config_path", None, self._namespace)) + self.items_num, self.ins_num = self.config_read( + envs.get_global_env("hyper_parameters.config_path", None, self._namespace)) self.train_batch_size = envs.get_global_env("batch_size", None, "train.reader") self.evaluate_batch_size = envs.get_global_env("batch_size", None, "evaluate.reader") self.hidden_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) self.step = envs.get_global_env("hyper_parameters.gnn_propogation_steps", None, self._namespace) - def config_read(self, config_path=None): - if config_path is None: - raise ValueError("please set train.model.hyper_parameters.config_path at first") + if config_path is None: + raise ValueError("please set train.model.hyper_parameters.config_path at first") with open(config_path, "r") as fin: item_nums = int(fin.readline().strip()) ins_nums = int(fin.readline().strip()) @@ -47,49 +48,49 @@ class Model(ModelBase): self.items = fluid.data( name="items", shape=[bs, -1], - dtype="int64") #[batch_size, uniq_max] + dtype="int64") # [batch_size, uniq_max] self.seq_index = fluid.data( name="seq_index", shape=[bs, -1, 2], - dtype="int32") #[batch_size, seq_max, 2] + dtype="int32") # [batch_size, seq_max, 2] self.last_index = fluid.data( name="last_index", shape=[bs, 2], - dtype="int32") #[batch_size, 2] + dtype="int32") # [batch_size, 2] self.adj_in = fluid.data( name="adj_in", shape=[bs, -1, -1], - dtype="float32") #[batch_size, seq_max, seq_max] + dtype="float32") # [batch_size, seq_max, seq_max] self.adj_out = fluid.data( name="adj_out", shape=[bs, -1, -1], - dtype="float32") #[batch_size, seq_max, seq_max] + dtype="float32") # [batch_size, seq_max, seq_max] self.mask = fluid.data( name="mask", shape=[bs, -1, 1], - dtype="float32") #[batch_size, seq_max, 1] + dtype="float32") # [batch_size, seq_max, 1] self.label = fluid.data( name="label", shape=[bs, 1], - dtype="int64") #[batch_size, 1] + dtype="int64") # [batch_size, 1] res = [self.items, self.seq_index, self.last_index, self.adj_in, self.adj_out, self.mask, self.label] return res - + def train_input(self): res = self.input(self.train_batch_size) self._data_var = res - use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace) + use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace) if self._platform != "LINUX" or use_dataloader: self._data_loader = fluid.io.DataLoader.from_generator( feed_list=self._data_var, capacity=256, use_double_buffer=False, iterable=False) def net(self, items_num, hidden_size, step, bs): - stdv = 1.0 / math.sqrt(hidden_size) + stdv = 1.0 / math.sqrt(hidden_size) - def embedding_layer(input, table_name, emb_dim, initializer_instance=None): + def embedding_layer(input, table_name, emb_dim, initializer_instance=None): emb = fluid.embedding( input=input, size=[items_num, emb_dim], @@ -97,10 +98,10 @@ class Model(ModelBase): name=table_name, initializer=initializer_instance), ) - return emb - - sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv) - items_emb = embedding_layer(self.items, "emb", hidden_size, sparse_initializer) + return emb + + sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv) + items_emb = embedding_layer(self.items, "emb", hidden_size, sparse_initializer) pre_state = items_emb for i in range(step): pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size]) @@ -113,7 +114,7 @@ class Model(ModelBase): param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) #[batch_size, uniq_max, h] + low=-stdv, high=stdv))) # [batch_size, uniq_max, h] state_out = layers.fc( input=pre_state, name="state_out", @@ -123,13 +124,13 @@ class Model(ModelBase): param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) #[batch_size, uniq_max, h] - - state_adj_in = layers.matmul(self.adj_in, state_in) #[batch_size, uniq_max, h] - state_adj_out = layers.matmul(self.adj_out, state_out) #[batch_size, uniq_max, h] - + low=-stdv, high=stdv))) # [batch_size, uniq_max, h] + + state_adj_in = layers.matmul(self.adj_in, state_in) # [batch_size, uniq_max, h] + state_adj_out = layers.matmul(self.adj_out, state_out) # [batch_size, uniq_max, h] + gru_input = layers.concat([state_adj_in, state_adj_out], axis=2) - + gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2]) gru_fc = layers.fc( input=gru_input, @@ -140,11 +141,11 @@ class Model(ModelBase): input=gru_fc, hidden=layers.reshape(x=pre_state, shape=[-1, hidden_size]), size=3 * hidden_size) - + final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size]) seq = layers.gather_nd(final_state, self.seq_index) last = layers.gather_nd(final_state, self.last_index) - + seq_fc = layers.fc( input=seq, name="seq_fc", @@ -154,7 +155,7 @@ class Model(ModelBase): num_flatten_dims=2, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) #[batch_size, seq_max, h] + low=-stdv, high=stdv))) # [batch_size, seq_max, h] last_fc = layers.fc( input=last, name="last_fc", @@ -164,22 +165,22 @@ class Model(ModelBase): num_flatten_dims=1, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) #[bathc_size, h] - + low=-stdv, high=stdv))) # [bathc_size, h] + seq_fc_t = layers.transpose( - seq_fc, perm=[1, 0, 2]) #[seq_max, batch_size, h] + seq_fc, perm=[1, 0, 2]) # [seq_max, batch_size, h] add = layers.elementwise_add( - seq_fc_t, last_fc) #[seq_max, batch_size, h] + seq_fc_t, last_fc) # [seq_max, batch_size, h] b = layers.create_parameter( shape=[hidden_size], dtype='float32', - default_initializer=fluid.initializer.Constant(value=0.0)) #[h] - add = layers.elementwise_add(add, b) #[seq_max, batch_size, h] - - add_sigmoid = layers.sigmoid(add) #[seq_max, batch_size, h] + default_initializer=fluid.initializer.Constant(value=0.0)) # [h] + add = layers.elementwise_add(add, b) # [seq_max, batch_size, h] + + add_sigmoid = layers.sigmoid(add) # [seq_max, batch_size, h] add_sigmoid = layers.transpose( - add_sigmoid, perm=[1, 0, 2]) #[batch_size, seq_max, h] - + add_sigmoid, perm=[1, 0, 2]) # [batch_size, seq_max, h] + weight = layers.fc( input=add_sigmoid, name="weight_fc", @@ -189,13 +190,13 @@ class Model(ModelBase): bias_attr=False, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) #[batch_size, seq_max, 1] + low=-stdv, high=stdv))) # [batch_size, seq_max, 1] weight *= self.mask - weight_mask = layers.elementwise_mul(seq, weight, axis=0) #[batch_size, seq_max, h] - global_attention = layers.reduce_sum(weight_mask, dim=1) #[batch_size, h] - + weight_mask = layers.elementwise_mul(seq, weight, axis=0) # [batch_size, seq_max, h] + global_attention = layers.reduce_sum(weight_mask, dim=1) # [batch_size, h] + final_attention = layers.concat( - [global_attention, last], axis=1) #[batch_size, 2*h] + [global_attention, last], axis=1) # [batch_size, 2*h] final_attention_fc = layers.fc( input=final_attention, name="final_attention_fc", @@ -203,14 +204,14 @@ class Model(ModelBase): bias_attr=False, act=None, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) #[batch_size, h] - - # all_vocab = layers.create_global_var( - # shape=[items_num - 1], - # value=0, - # dtype="int64", - # persistable=True, - # name="all_vocab") + low=-stdv, high=stdv))) # [batch_size, h] + + # all_vocab = layers.create_global_var( + # shape=[items_num - 1], + # value=0, + # dtype="int64", + # persistable=True, + # name="all_vocab") all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32') all_vocab = fluid.layers.cast(x=fluid.layers.assign(all_vocab), dtype='int64') @@ -220,13 +221,13 @@ class Model(ModelBase): name="emb", initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), - size=[items_num, hidden_size]) #[all_vocab, h] - + size=[items_num, hidden_size]) # [all_vocab, h] + logits = layers.matmul( x=final_attention_fc, y=all_emb, - transpose_y=True) #[batch_size, all_vocab] + transpose_y=True) # [batch_size, all_vocab] softmax = layers.softmax_with_cross_entropy( - logits=logits, label=self.label) #[batch_size, 1] + logits=logits, label=self.label) # [batch_size, 1] self.loss = layers.reduce_mean(softmax) # [1] self.acc = layers.accuracy(input=logits, label=self.label, k=20) @@ -249,7 +250,7 @@ class Model(ModelBase): decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None, self._namespace) decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None, self._namespace) l2 = envs.get_global_env("hyper_parameters.l2", None, self._namespace) - optimizer = fluid.optimizer.Adam( + optimizer = fluid.optimizer.Adam( learning_rate=fluid.layers.exponential_decay( learning_rate=learning_rate, decay_steps=decay_steps * step_per_epoch, @@ -257,18 +258,18 @@ class Model(ModelBase): regularization=fluid.regularizer.L2DecayRegularizer( regularization_coeff=l2)) - return optimizer + return optimizer def infer_input(self): self._reader_namespace = "evaluate.reader" res = self.input(self.evaluate_batch_size) - self._infer_data_var = res + self._infer_data_var = res self._infer_data_loader = fluid.io.DataLoader.from_generator( feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) - + def infer_net(self): - self.infer_input() - self.net(self.items_num, self.hidden_size, self.step, self.evaluate_batch_size) + self.infer_input() + self.net(self.items_num, self.hidden_size, self.step, self.evaluate_batch_size) self._infer_results['acc'] = self.acc - self._infer_results['loss'] = self.loss + self._infer_results['loss'] = self.loss diff --git a/models/recall/gnn/reader.py b/models/recall/gnn/reader.py index a2b5eac10015c35fe124a17ba9aca8a52543f8a8..cffb45115ed6a3dd6232b34db8758ad6a20447e2 100755 --- a/models/recall/gnn/reader.py +++ b/models/recall/gnn/reader.py @@ -11,10 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import io + import copy import random + +import numpy as np + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs @@ -22,17 +24,17 @@ from paddlerec.core.utils import envs class TrainReader(Reader): def init(self): self.batch_size = envs.get_global_env("batch_size", None, "train.reader") - + self.input = [] self.length = None def base_read(self, files): res = [] for f in files: - with open(f, "r") as fin: + with open(f, "r") as fin: for line in fin: - line = line.strip().split('\t') - res.append(tuple([map(int, line[0].split(',')), int(line[1])])) + line = line.strip().split('\t') + res.append(tuple([map(int, line[0].split(',')), int(line[1])])) return res def make_data(self, cur_batch, batch_size): @@ -120,10 +122,11 @@ class TrainReader(Reader): else: # Due to fixed batch_size, discard the remaining ins return - #cur_batch = remain_data[i:] - #yield self.make_data(cur_batch, group_remain % batch_size) + # cur_batch = remain_data[i:] + # yield self.make_data(cur_batch, group_remain % batch_size) + return _reader - + def generate_batch_from_trainfiles(self, files): self.input = self.base_read(files) self.length = len(self.input) @@ -132,4 +135,5 @@ class TrainReader(Reader): def generate_sample(self, line): def data_iter(): yield [] + return data_iter diff --git a/models/recall/gru4rec/hdfs.log b/models/recall/gru4rec/hdfs.log deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/models/recall/gru4rec/model.py b/models/recall/gru4rec/model.py index d77f3c254f4f20b12854dbf05af35c64613c4b84..b79c7642201990efae56a640954154404bf2e606 100644 --- a/models/recall/gru4rec/model.py +++ b/models/recall/gru4rec/model.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math import paddle.fluid as fluid from paddlerec.core.utils import envs @@ -87,10 +86,8 @@ class Model(ModelBase): self._metrics["cost"] = avg_cost self._metrics["acc"] = acc - def train_net(self): self.all_vocab_network() - def infer_net(self): self.all_vocab_network(is_infer=True) diff --git a/models/recall/gru4rec/rsc15_infer_reader.py b/models/recall/gru4rec/rsc15_infer_reader.py index 829726e3d8861292266c25dfba4298a1ee2f502a..b58532a471f4b70eedfebeeadb35df20b4c40e72 100644 --- a/models/recall/gru4rec/rsc15_infer_reader.py +++ b/models/recall/gru4rec/rsc15_infer_reader.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs class EvaluateReader(Reader): diff --git a/models/recall/gru4rec/rsc15_reader.py b/models/recall/gru4rec/rsc15_reader.py index eeb2441dd697bce4b3d3dc51bc64f1e2bfc0d2ab..4fe9433a65e1ceec508891eecfaaa5e464bc9e24 100644 --- a/models/recall/gru4rec/rsc15_reader.py +++ b/models/recall/gru4rec/rsc15_reader.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs class TrainReader(Reader): diff --git a/models/recall/ssr/model.py b/models/recall/ssr/model.py index 3f93c8a12d6432634d1a4bf5d5835cef91cd70b7..2c4b7f190088cd7681720f83e3a53730b790d462 100644 --- a/models/recall/ssr/model.py +++ b/models/recall/ssr/model.py @@ -12,15 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math import paddle.fluid as fluid - -from paddlerec.core.utils import envs -from paddlerec.core.model import Model as ModelBase import paddle.fluid.layers.tensor as tensor -import paddle.fluid.layers.io as io import paddle.fluid.layers.control_flow as cf +from paddlerec.core.utils import envs +from paddlerec.core.model import Model as ModelBase class BowEncoder(object): @@ -54,6 +51,7 @@ class GrnnEncoder(object): bias_attr=self.param_name + ".bias") return fluid.layers.sequence_pool(input=gru_h, pool_type='max') + class PairwiseHingeLoss(object): def __init__(self, margin=0.8): self.margin = margin @@ -70,6 +68,7 @@ class PairwiseHingeLoss(object): loss_part2) return loss_part3 + class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) @@ -80,7 +79,6 @@ class Model(ModelBase): return correct def train(self): - vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace) emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace) hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace) @@ -124,16 +122,14 @@ class Model(ModelBase): hinge_loss = self.pairwise_hinge_loss.forward(cos_pos, cos_neg) avg_cost = fluid.layers.mean(hinge_loss) correct = self.get_correct(cos_neg, cos_pos) - + self._cost = avg_cost self._metrics["correct"] = correct self._metrics["hinge_loss"] = hinge_loss - def train_net(self): self.train() - def infer(self): vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace) emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace) @@ -146,7 +142,7 @@ class Model(ModelBase): pos_label = fluid.data(name="pos_label", shape=[None, 1], dtype="int64") self._infer_data_var = [user_data, all_item_data, pos_label] self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) user_emb = fluid.embedding( input=user_data, size=[vocab_size, emb_dim], param_attr="emb.item") @@ -173,6 +169,5 @@ class Model(ModelBase): self._infer_results['recall20'] = acc - def infer_net(self): self.infer() diff --git a/models/recall/ssr/ssr_infer_reader.py b/models/recall/ssr/ssr_infer_reader.py index b9e5f7263162551a7ae8cecdf86dc9b5423b53b0..18f3fc2f37236907801fb00047fd3b6da5b5fa8c 100644 --- a/models/recall/ssr/ssr_infer_reader.py +++ b/models/recall/ssr/ssr_infer_reader.py @@ -11,19 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function +import numpy as np + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs -import random -import numpy as np class EvaluateReader(Reader): def init(self): self.vocab_size = envs.get_global_env("vocab_size", 10, "train.model.hyper_parameters") - def generate_sample(self, line): """ Read the data line by line and process it as a dictionary @@ -39,6 +39,6 @@ class EvaluateReader(Reader): src = conv_ids[:boundary] pos_tgt = [conv_ids[boundary]] feature_name = ["user", "all_item", "p_item"] - yield zip(feature_name, [src] + [np.arange(self.vocab_size).astype("int64").tolist()]+ [pos_tgt]) + yield zip(feature_name, [src] + [np.arange(self.vocab_size).astype("int64").tolist()] + [pos_tgt]) return reader diff --git a/models/recall/ssr/ssr_reader.py b/models/recall/ssr/ssr_reader.py index e2a352650cb6d5bb09fda78d27c3925f5ce670a5..d2d35458d867bd560e7e0b751f61de83d0f822b6 100644 --- a/models/recall/ssr/ssr_reader.py +++ b/models/recall/ssr/ssr_reader.py @@ -11,12 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs import random +from paddlerec.core.reader import Reader + class TrainReader(Reader): def init(self): @@ -25,7 +26,6 @@ class TrainReader(Reader): def sample_neg_from_seq(self, seq): return seq[random.randint(0, len(seq) - 1)] - def generate_sample(self, line): """ Read the data line by line and process it as a dictionary diff --git a/models/recall/word2vec/model.py b/models/recall/word2vec/model.py index ea8e0f5e81e5554668ba9ad81cc7f80718a0b3b9..bf09a04648a71a6618b99ef7de7d7244aaecbdba 100755 --- a/models/recall/word2vec/model.py +++ b/models/recall/word2vec/model.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math import numpy as np import paddle.fluid as fluid diff --git a/models/recall/word2vec/prepare_data.sh b/models/recall/word2vec/prepare_data.sh index 743ae99871ba1cc2d7309cda11117446613eb0e5..8b78eeedd94f088e206e35729a6b35d349b99039 100755 --- a/models/recall/word2vec/prepare_data.sh +++ b/models/recall/word2vec/prepare_data.sh @@ -1,5 +1,20 @@ #! /bin/bash +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + # download train_data mkdir raw_data wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar diff --git a/models/recall/word2vec/preprocess.py b/models/recall/word2vec/preprocess.py index 592f9755b761cf3d98c91f982de18de403c74298..9c9934e40589bdc700b7df5dc432d9b6dc92a8cc 100755 --- a/models/recall/word2vec/preprocess.py +++ b/models/recall/word2vec/preprocess.py @@ -13,13 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import io +import math import os import random import re import six + import argparse -import io -import math + prog = re.compile("[^a-z ]", flags=0) @@ -73,7 +75,7 @@ def parse_args(): def text_strip(text): - #English Preprocess Rule + # English Preprocess Rule return prog.sub("", text.lower()) @@ -115,7 +117,7 @@ def filter_corpus(args): word_all_count = 0 id_counts = [] word_id = 0 - #read dict + # read dict with io.open(args.dict_path, 'r', encoding='utf-8') as f: for line in f: word, count = line.split()[0], int(line.split()[1]) @@ -125,13 +127,13 @@ def filter_corpus(args): id_counts.append(count) word_all_count += count - #write word2id file + # write word2id file print("write word2id file to : " + args.dict_path + "_word_to_id_") with io.open( args.dict_path + "_word_to_id_", 'w+', encoding='utf-8') as fid: for k, v in word_to_id_.items(): fid.write(k + " " + str(v) + '\n') - #filter corpus and convert id + # filter corpus and convert id if not os.path.exists(args.output_corpus_dir): os.makedirs(args.output_corpus_dir) for file in os.listdir(args.input_corpus_dir): @@ -152,9 +154,9 @@ def filter_corpus(args): count_w = id_counts[idx] corpus_size = word_all_count keep_prob = ( - math.sqrt(count_w / - (args.downsample * corpus_size)) + 1 - ) * (args.downsample * corpus_size) / count_w + math.sqrt(count_w / + (args.downsample * corpus_size)) + 1 + ) * (args.downsample * corpus_size) / count_w r_value = random.random() if r_value > keep_prob: continue @@ -200,7 +202,7 @@ def build_dict(args): for item in item_to_remove: unk_sum += word_count[item] del word_count[item] - #sort by count + # sort by count word_count[native_to_unicode('')] = unk_sum word_count = sorted( word_count.items(), key=lambda word_count: -word_count[1]) @@ -222,17 +224,18 @@ def data_split(args): for file_ in files: with open(os.path.join(raw_data_dir, file_), 'r') as f: contents.extend(f.readlines()) - + num = int(args.file_nums) lines_per_file = len(contents) / num print("contents: ", str(len(contents))) print("lines_per_file: ", str(lines_per_file)) - - for i in range(1, num+1): + + for i in range(1, num + 1): with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout: - data = contents[(i-1)*lines_per_file:min(i*lines_per_file,len(contents))] + data = contents[(i - 1) * lines_per_file:min(i * lines_per_file, len(contents))] for line in data: - fout.write(line) + fout.write(line) + if __name__ == "__main__": args = parse_args() diff --git a/models/recall/word2vec/w2v_evaluate_reader.py b/models/recall/word2vec/w2v_evaluate_reader.py index 7ab498869e9826ff82ead4be20f5a3abdc96c7bd..04be9d41b2cd1ec51768696817a57c38dd958a44 100755 --- a/models/recall/word2vec/w2v_evaluate_reader.py +++ b/models/recall/word2vec/w2v_evaluate_reader.py @@ -11,16 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np + import io + import six + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs class EvaluateReader(Reader): def init(self): - dict_path = envs.get_global_env("word_id_dict_path", None, "evaluate.reader") + dict_path = envs.get_global_env("word_id_dict_path", None, "evaluate.reader") self.word_to_id = dict() self.id_to_word = dict() with io.open(dict_path, 'r', encoding='utf-8') as f: @@ -46,19 +48,16 @@ class EvaluateReader(Reader): if isinstance(s, str): return True return False - - + def _to_unicode(self, s, ignore_errors=False): if self._is_unicode(s): return s error_mode = "ignore" if ignore_errors else "strict" return s.decode("utf-8", errors=error_mode) - - + def strip_lines(self, line, vocab): return self._replace_oov(vocab, self.native_to_unicode(line)) - - + def _replace_oov(self, original_vocab, line): """Replace out-of-vocab words with "". This maintains compatibility with published results. @@ -76,5 +75,7 @@ class EvaluateReader(Reader): def reader(): features = self.strip_lines(line.lower(), self.word_to_id) features = features.split() - yield [('analogy_a', [self.word_to_id[features[0]]]), ('analogy_b', [self.word_to_id[features[1]]]), ('analogy_c', [self.word_to_id[features[2]]]), ('analogy_d', [self.word_to_id[features[3]]])] + yield [('analogy_a', [self.word_to_id[features[0]]]), ('analogy_b', [self.word_to_id[features[1]]]), + ('analogy_c', [self.word_to_id[features[2]]]), ('analogy_d', [self.word_to_id[features[3]]])] + return reader diff --git a/models/recall/word2vec/w2v_reader.py b/models/recall/word2vec/w2v_reader.py index 609a516a24ada32908f1748d6c4598be3d61accc..88e52b47692778feef8396dd037448a8053aa958 100755 --- a/models/recall/word2vec/w2v_reader.py +++ b/models/recall/word2vec/w2v_reader.py @@ -11,8 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np + import io + +import numpy as np + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs @@ -37,7 +40,7 @@ class NumpyRandomInt(object): class TrainReader(Reader): def init(self): - dict_path = envs.get_global_env("word_count_dict_path", None, "train.reader") + dict_path = envs.get_global_env("word_count_dict_path", None, "train.reader") self.window_size = envs.get_global_env("hyper_parameters.window_size", None, "train.model") self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None, "train.model") self.with_shuffle_batch = envs.get_global_env("hyper_parameters.with_shuffle_batch", None, "train.model") @@ -72,7 +75,7 @@ class TrainReader(Reader): start_point = 0 end_point = idx + target_window targets = words[start_point:idx] + words[idx + 1:end_point + 1] - return targets + return targets def generate_sample(self, line): def reader(): @@ -84,7 +87,7 @@ class TrainReader(Reader): output = [('input_word', [int(target_id)]), ('true_label', [int(context_id)])] if not self.with_shuffle_batch: neg_array = self.cs.searchsorted(np.random.sample(self.neg_num)) - output += [('neg_label', [int(str(i)) for i in neg_array ])] + output += [('neg_label', [int(str(i)) for i in neg_array])] yield output - return reader + return reader diff --git a/models/treebased/tdm/model.py b/models/treebased/tdm/model.py index 0fc1adf9cd04286cdb31cc3ba08715574dbee18b..fa5f225f68068f826df6fc9ef0c7c9d35dbd9b89 100755 --- a/models/treebased/tdm/model.py +++ b/models/treebased/tdm/model.py @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import paddle.fluid as fluid -import math from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase @@ -134,7 +134,7 @@ class Model(ModelBase): sample_nodes_emb = [ fluid.layers.reshape(sample_nodes_emb[i], [-1, self.neg_sampling_list[i] + - self.output_positive, self.node_emb_size] + self.output_positive, self.node_emb_size] ) for i in range(self.max_layers) ] @@ -229,7 +229,7 @@ class Model(ModelBase): act=self.act, param_attr=fluid.ParamAttr( name="trans.layer_fc.weight." + str(i)), - bias_attr=fluid.ParamAttr(name="trans.layer_fc.bias."+str(i)), + bias_attr=fluid.ParamAttr(name="trans.layer_fc.bias." + str(i)), ) for i in range(self.max_layers) ] @@ -268,8 +268,8 @@ class Model(ModelBase): num_flatten_dims=2, act=self.act, param_attr=fluid.ParamAttr( - name="cls.concat_fc.weight."+str(i)), - bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias."+str(i)) + name="cls.concat_fc.weight." + str(i)), + bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i)) ) for i in range(self.max_layers) ] @@ -348,7 +348,7 @@ class Model(ModelBase): current_layer_node_num = self.first_layer_node.shape[1] else: current_layer_node_num = current_layer_node.shape[1] * \ - current_layer_node.shape[2] + current_layer_node.shape[2] current_layer_node = fluid.layers.reshape( current_layer_node, [-1, current_layer_node_num]) @@ -458,7 +458,7 @@ class Model(ModelBase): param_attr=fluid.ParamAttr( name="trans.layer_fc.weight." + str(layer_idx)), bias_attr=fluid.ParamAttr( - name="trans.layer_fc.bias."+str(layer_idx)), + name="trans.layer_fc.bias." + str(layer_idx)), ) return input_layer_fc_out @@ -479,6 +479,6 @@ class Model(ModelBase): num_flatten_dims=2, act=self.act, param_attr=fluid.ParamAttr( - name="cls.concat_fc.weight."+str(layer_idx)), - bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias."+str(layer_idx))) + name="cls.concat_fc.weight." + str(layer_idx)), + bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(layer_idx))) return hidden_states_fc diff --git a/models/treebased/tdm/tdm_evaluate_reader.py b/models/treebased/tdm/tdm_evaluate_reader.py index 844e441fbda303ea4a5ab3c0f549711579dbf5d5..4e3b64770b42c05726dd3c90466d77e422e00902 100644 --- a/models/treebased/tdm/tdm_evaluate_reader.py +++ b/models/treebased/tdm/tdm_evaluate_reader.py @@ -28,6 +28,7 @@ class EvaluateReader(Reader): """ Read the data line by line and process it as a dictionary """ + def reader(): """ This function needs to be implemented by the user, based on data format diff --git a/models/treebased/tdm/tdm_reader.py b/models/treebased/tdm/tdm_reader.py index 0b8ada9ea4d695aafd38c1e87831c9939e483618..709900649a03c3439cbf474781a5c0ae7b087dd7 100755 --- a/models/treebased/tdm/tdm_reader.py +++ b/models/treebased/tdm/tdm_reader.py @@ -28,6 +28,7 @@ class TrainReader(Reader): """ Read the data line by line and process it as a dictionary """ + def reader(): """ This function needs to be implemented by the user, based on data format diff --git a/run.py b/run.py index 6c0e06f77a644a1166ee8c318e057ade2d060643..56999935f21bc1de2b2bc7b4a080da023559174a 100755 --- a/run.py +++ b/run.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import argparse import os import subprocess -import tempfile +import argparse +import tempfile import yaml from paddlerec.core.factory import TrainerFactory diff --git a/setup.py b/setup.py index f3e97f97b1b51c6344275a4aa96583fd79004efe..c655c37576e310fac825bd1cc01dfca5d051d18c 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,27 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ setup for paddle-rec. """ + import os + from setuptools import setup, find_packages -import tempfile import shutil +import tempfile + requires = [ "paddlepaddle == 1.7.2", @@ -19,7 +36,7 @@ about["__author__"] = "paddle-dev" about["__author_email__"] = "paddle-dev@baidu.com" about["__url__"] = "https://github.com/PaddlePaddle/PaddleRec" -readme = "..." +readme = "" def run_cmd(command): diff --git a/tools/tools.py b/tools/tools.py index ec85326cf71374aa4139a58ffc0cf05535543949..8508a790bc8012954b53cc43b088d2e50655647d 100644 --- a/tools/tools.py +++ b/tools/tools.py @@ -12,15 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +import functools import os -import time +import platform +import sys import shutil +import time + import requests -import sys import tarfile import zipfile -import platform -import functools lasttime = time.time() FLUSH_INTERVAL = 0.1