diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 628b9f0d70f598f44dea313828772f283891c499..a49e492e48028b15d724cbdc7c1b5efbc809ddcf 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -638,7 +638,8 @@ class PSGPUWorker : public HogwildWorker {
 };
 #endif
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(WITH_ASCEND_CL)
 class SectionWorker : public DeviceWorker {
  public:
   SectionWorker() {}
diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc
index a539a5d5f96b52eea852bc39b0081ea92ccfffc1..5780a95343385e984dd4f1d15123b715c1822a9e 100644
--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@@ -79,7 +79,8 @@ REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker);
 REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
 #endif
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(WITH_ASCEND_CL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
 }  // namespace framework
diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc
index 5968df548dfb0fb4f2b5e591b63bee78c4e080d3..3649e00e7c9d85314afd0dce1ece3182136d9845 100644
--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(WITH_ASCEND_CL)
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"
@@ -34,7 +35,11 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
   ParseDumpConfig(trainer_desc);
   const auto& section_config = section_params.section_config();
   int place_id = section_config.place_id();
+#if (defined PADDLE_WITH_NCCL)
   place_ = platform::CUDAPlace(place_id);
+#elif (defined WITH_ASCEND_CL)
+  place_ = platform::NPUPlace(place_id);
+#endif
   worker_ = DeviceWorkerFactory::CreateDeviceWorker(
       trainer_desc.device_worker_name());
   auto this_worker =
diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index e740771e5ca9fce832d86b48935493b9d334c6f3..7860b69313e7b2270722abdabe5e922e2fabeac8 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -9,7 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(WITH_ASCEND_CL)
 #include
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index 7efb89ad7d9d9c3144ac14ae80d01bffcbf4cb4f..01aa07e618464db05aa5c4bf322ec78aac110e1b 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -332,7 +332,8 @@ class PSGPUTrainer : public TrainerBase {
 };
 #endif
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(WITH_ASCEND_CL)
 class PipelineTrainer : public TrainerBase {
  public:
   PipelineTrainer() {}
diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc
index 8d34e0ba99c2c33449b66d03852012984fd5a602..0de0f5e4505795f69f1d80e2bbc1600250fc7391 100644
--- a/paddle/fluid/operators/cast_op_npu.cc
+++ b/paddle/fluid/operators/cast_op_npu.cc
@@ -92,6 +92,7 @@ REGISTER_OP_NPU_KERNEL(
     cast, ops::CastNPUKernel,
     ops::CastNPUKernel,
     ops::CastNPUKernel,
+    ops::CastNPUKernel,
     ops::CastNPUKernel,
     ops::CastNPUKernel,
     ops::CastNPUKernel,
diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc
index 453a990efbded34d721ba9b511f8979959290577..bb3a6512d2c8ba3b5f0d643a5ae6d906a00717c3 100644
--- a/paddle/fluid/operators/expand_op_npu.cc
+++ b/paddle/fluid/operators/expand_op_npu.cc
@@ -79,6 +79,7 @@ class ExpandNPUKernel : public framework::OpKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OP_NPU_KERNEL(
     expand, ops::ExpandNPUKernel,
+    ops::ExpandNPUKernel,
     ops::ExpandNPUKernel);
diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index f614d906baa75960f8f1c6652db2434717db396f..320b498156b3f8da237f98c8ac35e0d784d087c2 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -86,9 +86,11 @@ namespace ops = paddle::operators;
 REGISTER_OP_NPU_KERNEL(
     lookup_table_v2,
     ops::LookupTableV2NPUKernel,
+    ops::LookupTableV2NPUKernel,
     ops::LookupTableV2NPUKernel);
 
 REGISTER_OP_NPU_KERNEL(
     lookup_table_v2_grad,
     ops::LookupTableV2GradNPUKernel,
+    ops::LookupTableV2GradNPUKernel,
     ops::LookupTableV2GradNPUKernel);
diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc
index e5e0dafdae0b15ecc43fe0603688f097659aefd9..9974536da9acb401a859c2c9f1d10d79eed680bb 100644
--- a/paddle/fluid/operators/slice_op_npu.cc
+++ b/paddle/fluid/operators/slice_op_npu.cc
@@ -124,11 +124,13 @@ namespace ops = paddle::operators;
 REGISTER_OP_NPU_KERNEL(
     slice, ops::SliceNPUKernel,
+    ops::SliceNPUKernel,
     ops::SliceNPUKernel);
 
 REGISTER_OP_NPU_KERNEL(
     slice_grad,
     ops::SliceGradNPUKernel,
+    ops::SliceGradNPUKernel,
     ops::SliceGradNPUKernel);
diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py
index 1b51d4f66f3c2e1abbc516873788da166b7ee7a5..9e2723dad729aac3ff7692f54c35bde4b4b1d6ba 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/common.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/common.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
+import os
 
 import paddle.fluid as fluid
 from paddle.fluid import core, unique_name
@@ -77,6 +78,7 @@ class CollectiveHelper(object):
         nranks = len(endpoints)
         other_endpoints = endpoints[:]
         other_endpoints.remove(current_endpoint)
+
         if rank == 0 and wait_port:
             wait_server_ready(other_endpoints)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
index ae2daa9b9d8592a5be8ea57919c267a0c2a669d1..1aa51a6671c17f39f7628d85e1c137f1ce9c517e 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -13,6 +13,7 @@
 from __future__ import print_function
 from __future__ import division
+import os
 
 import paddle.fluid as fluid
 from paddle.fluid import core, unique_name
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
index 2c4ad33c361e01abcca66c33008826a028f8c354..852421523b15b12875a672897a2d1b9c8f74a0a6 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
@@ -365,8 +365,8 @@ class ShardingOptimizer(MetaOptimizerBase):
                   'w') as f:
             f.writelines(str(main_block.program))
 
-        self._wait()
-
+        if core.is_compiled_with_cuda():
+            self._wait()
         return optimize_ops, params_grads
 
     def _init_comm(self):
diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py
index 7bcd10a726949b7054e35aab77f22779c91bf731..7fed27ee45978a5b34d1235999875312c292fcd1 100644
--- a/python/paddle/fluid/device_worker.py
+++ b/python/paddle/fluid/device_worker.py
@@ -433,7 +433,10 @@ class Section(DeviceWorker):
         # cfg.program_desc.CopyFrom(program.program._get_desc())
         place = pipeline_opt["place"]
         place_id = pipeline_opt["place_id"]
-        assert isinstance(place, core.CUDAPlace)
+        if core.is_compiled_with_cuda():
+            assert isinstance(place, core.CUDAPlace)
+        elif core.is_compiled_with_npu():
+            assert isinstance(place, core.NPUPlace)
         cfg.place = cfg.CUDAPlace
         cfg.place_id = place_id
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 76bc68f24d2fefff97100afd0ebb681f876184de..62a9c42ee0a61c0b01d4562daca4b30e83f24792 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -1451,8 +1451,12 @@ class Executor(object):
             for var in program.global_block().vars.values():
                 if var.is_data:
                     data_vars.append(var)
-            dataset = paddle.fluid.DatasetFactory().create_dataset(
-                'FileInstantDataset')
+            if core.is_compiled_with_npu():
+                dataset = paddle.fluid.DatasetFactory().create_dataset(
+                    'InMemoryDataset')
+            else:
+                dataset = paddle.fluid.DatasetFactory().create_dataset(
+                    'FileInstantDataset')
             dataset.set_batch_size(1)
             dataset.set_thread(1)
             dataset.set_filelist(['None'])
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index e11224719823144fd95f78699d7ac79156bc7e76..21b4c429a66e9d87742a7873a2d737a9f15823bf 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -4818,7 +4818,10 @@ class PipelineOptimizer(object):
         place_list = []
         for dev in device_list:
             dev_index = int(dev.split(":")[1])
-            place_list.append(core.CUDAPlace(0))
+            if core.is_compiled_with_cuda():
+                place_list.append(core.CUDAPlace(dev_index % 1))
+            elif core.is_compiled_with_npu():
+                place_list.append(core.NPUPlace(dev_index % 1))
 
         # Step6: Split startup program
         new_startup_program = self._split_startup_program(startup_program,
@@ -4837,7 +4840,10 @@
             self._accumulate_gradients(real_block)
         real_block._sync_with_cpp()
 
-        place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        if core.is_compiled_with_cuda():
+            place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        elif core.is_compiled_with_npu():
+            place_id = int(os.getenv("FLAGS_selected_npus", "0"))
         main_program._pipeline_opt = {
             "trainer": "PipelineTrainer",
             "device_worker": "Section",
diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py
index c8cb474343afd1a5616a3aca3a54f57a33d6811e..ef6975c3d241e5de0a4dab17e88ebf6896472f32 100644
--- a/python/paddle/fluid/transpiler/collective.py
+++ b/python/paddle/fluid/transpiler/collective.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import sys
 import math
 from functools import reduce
+import os
 
 import collections
 import six
@@ -101,6 +102,8 @@ class Collective(object):
         nranks = len(endpoints)
         other_endpoints = endpoints[:]
         other_endpoints.remove(current_endpoint)
+        block = program.global_block()
+
         if rank == 0 and wait_port:
             wait_server_ready(other_endpoints)
diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py
index cc4e7a8b319dd8960797f00f06e55c57494adf69..fa8bd600bb28d8e99f5fbeed54db754274e592bf 100644
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -133,9 +133,9 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint,
         return
     other_endpoints = endpoints[:]
     other_endpoints.remove(current_endpoint)
+    block = program.global_block()
     if rank == 0 and wait_port:
         wait_server_ready(other_endpoints)
-    block = program.global_block()
     if core.is_compiled_with_cuda():
         nccl_id_var = block.create_var(
             name=fluid.unique_name.generate('nccl_id'),
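
Note (reviewer sketch, not part of the patch): the device-selection pattern this change repeats in device_worker.py, executor.py and optimizer.py can be summarized in a small standalone snippet. It assumes a Paddle build that exposes core.is_compiled_with_npu() and core.NPUPlace, exactly as the hunks above use them, and reads the same FLAGS_selected_gpus / FLAGS_selected_npus environment variables that the launcher sets per rank; the helper name select_pipeline_place is invented for illustration only.

# Reviewer sketch -- mirrors the place-selection logic added by this patch.
import os

from paddle.fluid import core


def select_pipeline_place():
    """Pick the device a pipeline section runs on for the current process."""
    if core.is_compiled_with_cuda():
        # Per-rank GPU id set by the launcher, as in PipelineOptimizer above.
        place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        return core.CUDAPlace(place_id), place_id
    # Guarded lookup in case the local build predates the NPU API.
    if getattr(core, "is_compiled_with_npu", lambda: False)():
        # Ascend counterpart introduced by this change.
        place_id = int(os.getenv("FLAGS_selected_npus", "0"))
        return core.NPUPlace(place_id), place_id
    raise RuntimeError("Pipeline sections require a CUDA or Ascend NPU build.")


if __name__ == "__main__":
    place, place_id = select_pipeline_place()
    print("pipeline section place:", place, "place_id:", place_id)

With this in place, each pipeline section binds to the device named by its rank's FLAGS_selected_gpus or FLAGS_selected_npus value, which is what lets the same PipelineTrainer/SectionWorker path serve both CUDA and Ascend builds.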