From 3ab39705eac27b0011956bb8003479db3bd50b3e Mon Sep 17 00:00:00 2001 From: An Improved PeleeNet Algorithm with Feature Pyramid Networks for Image Detection <39549453+Baibaifan@users.noreply.github.com> Date: Mon, 29 Mar 2021 20:34:16 +0800 Subject: [PATCH] adapter npu (#31926) Co-authored-by: baiyangfan --- paddle/fluid/framework/device_worker.h | 2 +- paddle/fluid/framework/device_worker_factory.cc | 2 +- paddle/fluid/framework/pipeline_trainer.cc | 6 +++++- paddle/fluid/framework/section_worker.cc | 2 +- paddle/fluid/framework/trainer.h | 2 +- paddle/fluid/operators/cast_op_npu.cc | 1 + paddle/fluid/operators/expand_op_npu.cc | 1 + paddle/fluid/operators/lookup_table_v2_op_npu.cc | 2 ++ paddle/fluid/operators/slice_op_npu.cc | 2 ++ .../fleet/meta_optimizers/sharding_optimizer.py | 4 +++- python/paddle/fluid/device_worker.py | 2 +- python/paddle/fluid/optimizer.py | 10 ++++++++-- 12 files changed, 27 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index ae0a6a2bc4..20223e503a 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -634,7 +634,7 @@ class PSGPUWorker : public HogwildWorker { }; #endif -#if defined(PADDLE_WITH_NCCL) +#if (defined PADDLE_WITH_NCCL) || (defined WITH_ASCEND_CL) class SectionWorker : public DeviceWorker { public: SectionWorker() {} diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index 109b520f5a..999502d811 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -76,7 +76,7 @@ REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker); REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker); #endif -#if defined(PADDLE_WITH_NCCL) +#if (defined PADDLE_WITH_NCCL) || (defined WITH_ASCEND_CL) REGISTER_DEVICE_WORKER_CLASS(SectionWorker); #endif } // namespace framework diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index dbcc993aee..ab14dcf17b 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_NCCL) +#if (defined PADDLE_WITH_NCCL) || (defined WITH_ASCEND_CL) #include #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" @@ -35,7 +35,11 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ParseDumpConfig(trainer_desc); const auto& section_config = section_params.section_config(); int place_id = section_config.place_id(); +#if (defined PADDLE_WITH_NCCL) place_ = platform::CUDAPlace(place_id); +#elif (defined WITH_ASCEND_CL) + place_ = platform::NPUPlace(place_id); +#endif worker_ = DeviceWorkerFactory::CreateDeviceWorker( trainer_desc.device_worker_name()); auto this_worker = diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 87bd2ebad2..3aec2defb3 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#if defined(PADDLE_WITH_NCCL) +#if (defined PADDLE_WITH_NCCL) || (defined WITH_ASCEND_CL) #include #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index ca57a89ca9..4d90fb8376 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -320,7 +320,7 @@ class PSGPUTrainer : public TrainerBase { }; #endif -#if defined(PADDLE_WITH_NCCL) +#if (defined PADDLE_WITH_NCCL) || (defined WITH_ASCEND_CL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc index 5cfb152684..282ac6c1f6 100755 --- a/paddle/fluid/operators/cast_op_npu.cc +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -83,6 +83,7 @@ REGISTER_OP_NPU_KERNEL( ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel, + ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel, diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index 1446637da6..3c06008d00 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -76,6 +76,7 @@ class ExpandNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( expand, ops::ExpandNPUKernel, + ops::ExpandNPUKernel, ops::ExpandNPUKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index fab2d7f7aa..4516aa38fb 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -82,9 +82,11 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( lookup_table_v2, ops::LookupTableV2NPUKernel, + ops::LookupTableV2NPUKernel, ops::LookupTableV2NPUKernel); REGISTER_OP_NPU_KERNEL( lookup_table_v2_grad, ops::LookupTableV2GradNPUKernel, + ops::LookupTableV2GradNPUKernel, ops::LookupTableV2GradNPUKernel); diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index e5e0dafdae..9974536da9 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -124,11 +124,13 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( slice, ops::SliceNPUKernel, + ops::SliceNPUKernel, ops::SliceNPUKernel); REGISTER_OP_NPU_KERNEL( slice_grad, ops::SliceGradNPUKernel, + ops::SliceGradNPUKernel, ops::SliceGradNPUKernel); diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 97febe8db2..000ce20f6d 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -103,6 +103,8 @@ class ShardingOptimizer(MetaOptimizerBase): self.pp_bz = self.user_defined_strategy.sharding_configs["pp_bz"] self.pp_allreduce_in_optimize = self.user_defined_strategy.sharding_configs[ "pp_allreduce_in_optimize"] + self.optimize_offload = self.user_defined_strategy.sharding_configs[ + "optimize_offload"] if self.inner_opt is None: raise ValueError( @@ -238,7 +240,7 @@ class ShardingOptimizer(MetaOptimizerBase): #check_allreduce_sum(main_block, self._shard, self.sharding_ring_id, # self.dp_ring_id) #check_allreduce_sum(main_block, self._shard, self.dp_ring_id) - self._wait() + # self._wait() return optimize_ops, params_grads def 
_set_up(self, params_grads): diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index 3c5906ceb9..d78eb61a80 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -424,7 +424,7 @@ class Section(DeviceWorker): # cfg.program_desc.CopyFrom(program.program._get_desc()) place = pipeline_opt["place"] place_id = pipeline_opt["place_id"] - assert isinstance(place, core.CUDAPlace) + # assert isinstance(place, core.CUDAPlace) cfg.place = cfg.CUDAPlace cfg.place_id = place_id diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 2b70b670a4..b12d970cc2 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -5272,7 +5272,10 @@ class PipelineOptimizer(object): place_list = [] for dev in device_list: dev_index = int(dev.split(":")[1]) - place_list.append(core.CUDAPlace(dev_index % 8)) + if core.is_compiled_with_cuda(): + place_list.append(core.CUDAPlace(dev_index % 1)) + elif core.is_compiled_with_npu(): + place_list.append(core.NPUPlace(dev_index % 1)) # Step6: Split startup program new_startup_program = self._split_startup_program(startup_program, @@ -5295,7 +5298,10 @@ class PipelineOptimizer(object): self._accumulate_gradients(real_block) real_block._sync_with_cpp() - place_id = int(os.getenv("FLAGS_selected_gpus", "0")) + if core.is_compiled_with_cuda(): + place_id = int(os.getenv("FLAGS_selected_gpus", "0")) + elif core.is_compiled_with_npu(): + place_id = int(os.getenv("FLAGS_selected_npus", "0")) main_program._pipeline_opt = { "trainer": "PipelineTrainer", "device_worker": "Section", -- GitLab
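
Note (appended after the patch trailer, not part of the patch itself): the substantive change above is that the pipeline trainer and SectionWorker no longer assume a CUDA build. The worker place and place_id are now chosen from the compile-time backend: CUDAPlace with FLAGS_selected_gpus under PADDLE_WITH_NCCL, NPUPlace with FLAGS_selected_npus under WITH_ASCEND_CL. Below is a minimal Python sketch of that selection logic, mirroring the hunks in python/paddle/fluid/optimizer.py; the helper name resolve_pipeline_place is hypothetical and not part of Paddle's API.

    import os
    from paddle.fluid import core

    def resolve_pipeline_place(dev_index):
        # Mirror of the patched logic: pick the device type from the
        # compile-time backend instead of hard-coding core.CUDAPlace.
        if core.is_compiled_with_cuda():
            # The patch takes the index modulo 1, as written in optimizer.py.
            place = core.CUDAPlace(dev_index % 1)
            place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        elif core.is_compiled_with_npu():
            place = core.NPUPlace(dev_index % 1)
            place_id = int(os.getenv("FLAGS_selected_npus", "0"))
        else:
            raise RuntimeError("Pipeline training expects a CUDA or NPU build.")
        return place, place_id

The same backend split appears in the C++ hunks: pipeline_trainer.cc constructs platform::CUDAPlace or platform::NPUPlace from section_config.place_id() under the corresponding preprocessor guard, and device_worker.h, device_worker_factory.cc, section_worker.cc, and trainer.h widen their guards from PADDLE_WITH_NCCL alone to PADDLE_WITH_NCCL or WITH_ASCEND_CL so the pipeline classes are compiled into NPU builds.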