diff --git a/fleetrec/core/trainer/details/local_engine.py b/fleetrec/core/trainer/details/local_engine.py index 9a1c4ba0abfdab34ae96783515c197ebc7a1ccc3..1610c530fddffb4bd0e9c74dacf516591aa1e099 100644 --- a/fleetrec/core/trainer/details/local_engine.py +++ b/fleetrec/core/trainer/details/local_engine.py @@ -38,7 +38,7 @@ def start_procs(args, yaml): user_endpoints_ips = [x.split(":")[0] for x in user_endpoints.split(",")] user_endpoints_port = [x.split(":")[1] for x in user_endpoints.split(",")] - factory = "fleetrec.trainer.factory" + factory = "fleetrec.core.factory" cmd = [sys.executable, "-u", "-m", factory, yaml] for i in range(server_num): @@ -91,11 +91,11 @@ def start_procs(args, yaml): procs[i].terminate() print("all parameter server are killed", file=sys.stderr) -class Launch(): + +class Launch: def __init__(self, envs, trainer): self.envs = envs self.trainer = trainer def run(self): start_procs(self.envs, self.trainer) - diff --git a/fleetrec/examples/built_in/cluster_training_local.yaml b/fleetrec/examples/built_in/cluster_training_local.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf878e8c57cfa88f035ddd7c5d52904ad739f39c --- /dev/null +++ b/fleetrec/examples/built_in/cluster_training_local.yaml @@ -0,0 +1,10 @@ + +trainer: "LocalClusterTraining" + +pserver_num: 2 +trainer_num: 2 +start_port: 36001 +log_dirname: "logs" + +strategy: + mode: "async" diff --git a/fleetrec/examples/built_in/cluster_training_mpi.yaml b/fleetrec/examples/built_in/cluster_training_mpi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..19974f40ad5256b65bbf3c6adcb1bf6217756bd1 --- /dev/null +++ b/fleetrec/examples/built_in/cluster_training_mpi.yaml @@ -0,0 +1,10 @@ + +trainer: "MPIClusterTraining" + +pserver_num: 2 +trainer_num: 2 +start_port: 36001 +log_dirname: "logs" + +strategy: + mode: "async" diff --git a/fleetrec/examples/built_in/cluster_training_user_define.yaml b/fleetrec/examples/built_in/cluster_training_user_define.yaml new file mode 100644 index 0000000000000000000000000000000000000000..973bada54bea6c2c0e929d0dd7182a2003c84712 --- /dev/null +++ b/fleetrec/examples/built_in/cluster_training_user_define.yaml @@ -0,0 +1,2 @@ +trainer: "UserDefineTrainer" +location: "/root/FleetRec/fleetrec/examples/user_define_trainer.py" diff --git a/fleetrec/examples/ctr-dnn_train_single.yaml b/fleetrec/examples/built_in/ctr-dnn_train.yaml similarity index 88% rename from fleetrec/examples/ctr-dnn_train_single.yaml rename to fleetrec/examples/built_in/ctr-dnn_train.yaml index 50d25cad28f4b1265e27b507615aa3b5ebb3f23d..5c4af64a840e62c26a3be1b02f57b4489333536b 100644 --- a/fleetrec/examples/ctr-dnn_train_single.yaml +++ b/fleetrec/examples/built_in/ctr-dnn_train.yaml @@ -15,9 +15,10 @@ train: threads: 12 epochs: 10 - trainer: "SingleTraining" + trainer: "single_training.yaml" reader: + mode: "dataset" batch_size: 2 class: "fleetrec.models.ctr_dnn.data_generator" train_data_path: "/root/FleetRec/fleetrec/models/ctr_dnn/data/train/" @@ -29,7 +30,7 @@ train: sparse_feature_number: 1000001 sparse_feature_dim: 8 dense_input_dim: 13 - fc_sizes: [512, 256, 128, 32] + fc_sizes: [512, 256, 128, 32] learning_rate: 0.001 save: @@ -40,6 +41,8 @@ train: inference: dirname: "models_for_inference" epoch_interval: 4 + feed_varnames: ["C1", "C2", "C3"] + fetch_varnames: "predict" save_last: True evaluate: diff --git a/fleetrec/examples/train.py b/fleetrec/examples/built_in/run.py similarity index 94% rename from fleetrec/examples/train.py rename to fleetrec/examples/built_in/run.py index 7a72fb78e9653d03f459d35c9ad8f676462d1d51..a1c4281d248200f3ec21a5e8afabcb64a9d81c3a 100644 --- a/fleetrec/examples/train.py +++ b/fleetrec/examples/built_in/run.py @@ -14,7 +14,7 @@ import os -from fleetrec.trainer.factory import TrainerFactory +from fleetrec.core.factory import TrainerFactory if __name__ == "__main__": diff --git a/fleetrec/examples/built_in/single_training.yaml b/fleetrec/examples/built_in/single_training.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8fb437aeee70c6fedab5484d4afbb0b5168676fa --- /dev/null +++ b/fleetrec/examples/built_in/single_training.yaml @@ -0,0 +1,2 @@ + +trainer: "SingleTraining" \ No newline at end of file diff --git a/fleetrec/examples/user_define_trainer.py b/fleetrec/examples/built_in/user_define_trainer.py similarity index 96% rename from fleetrec/examples/user_define_trainer.py rename to fleetrec/examples/built_in/user_define_trainer.py index 549d42b84ce209688fb597026786c79f50dad1f4..70841f9a3eb5a135aec3875a088d04f1a3e4cfd0 100644 --- a/fleetrec/examples/user_define_trainer.py +++ b/fleetrec/examples/built_in/user_define_trainer.py @@ -21,6 +21,7 @@ from fleetrec.utils import envs class UserDefineTrainer(TranspileTrainer): def __init__(self, config=None): TranspileTrainer.__init__(self, config) + print("this is a demo about how to use user define trainer in fleet-rec") def processor_register(self): self.regist_context_processor('uninit', self.instance) diff --git a/fleetrec/examples/ctr-dnn_train_cluster.yaml b/fleetrec/examples/ctr-dnn_train_cluster.yaml deleted file mode 100644 index f4c8f2b1fe44ba8fac0b7842d339c7ebbbef0c2b..0000000000000000000000000000000000000000 --- a/fleetrec/examples/ctr-dnn_train_cluster.yaml +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -train: - threads: 12 - epochs: 10 - trainer: "ClusterTraining" - container: "local" - - pserver_num: 2 - trainer_num: 2 - start_port: 36001 - log_dirname: "logs" - - strategy: - mode: "async" - - reader: - mode: "dataset" - batch_size: 2 - pipe_command: "python /paddle/eleps/fleetrec/models/ctr_dnn/dataset.py" - train_data_path: "/paddle/eleps/fleetrec/models/ctr_dnn/data/train" - - model: - models: "fleetrec.models.ctr_dnn.model" - hyper_parameters: - sparse_inputs_slots: 27 - sparse_feature_number: 1000001 - sparse_feature_dim: 8 - dense_input_dim: 13 - fc_sizes: [512, 256, 128, 32] - learning_rate: 0.001 - - save: - increment: - dirname: "models_for_increment" - epoch_interval: 2 - save_last: True - inference: - dirname: "models_for_inference" - epoch_interval: 4 - feed_varnames: ["C1", "C2", "C3"] - fetch_varnames: "predict" - save_last: True - -evaluate: - batch_size: 32 - train_thread_num: 12 - reader: "reader.py" - - diff --git a/fleetrec/examples/ctr-dnn_train_userdefine.yaml b/fleetrec/examples/ctr-dnn_train_userdefine.yaml deleted file mode 100644 index 423efedcfc0467524fe73ac2de3aa1f1ff5c2a67..0000000000000000000000000000000000000000 --- a/fleetrec/examples/ctr-dnn_train_userdefine.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -train: - threads: 12 - epochs: 10 - trainer: "UserDefineTrainer" - location: "/root/FleetRec/fleetrec/examples/user_define_trainer.py" - - reader: - batch_size: 2 - class: "fleetrec.models.ctr_dnn.data_generator" - train_data_path: "/root/FleetRec/fleetrec/models/ctr_dnn/data/train/" - - model: - models: "fleetrec.models.ctr_dnn.model" - hyper_parameters: - sparse_inputs_slots: 27 - sparse_feature_number: 1000001 - sparse_feature_dim: 8 - dense_input_dim: 13 - fc_sizes: [512, 256, 128, 32] - learning_rate: 0.001 - - save: - increment: - dirname: "models_for_increment" - epoch_interval: 2 - save_last: True - inference: - dirname: "models_for_inference" - epoch_interval: 4 - save_last: True - -evaluate: - batch_size: 32 - train_thread_num: 12 - reader: "reader.py" - -