solve hccl communicate conflict (#32447)

solve hccl communicate conflict (#32447)

solve hccl communicate conflict (#32447)
0e74eea2 · Baibaifan · GitHub · 2b108a04 · 0e74eea2 · 0e74eea2
17 changed file
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -638,7 +638,8 @@ class PSGPUWorker : public HogwildWorker {
 };
 #endif

-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(WITH_ASCEND_CL)
 class SectionWorker : public DeviceWorker {
 public:
  SectionWorker() {}

--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@@ -79,7 +79,8 @@ REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker);
 REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
 #endif

-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(WITH_ASCEND_CL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
 }  // namespace framework

--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(WITH_ASCEND_CL)
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"
@@ -34,7 +35,11 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
  ParseDumpConfig(trainer_desc);
  const auto& section_config = section_params.section_config();
  int place_id = section_config.place_id();
+#if (defined PADDLE_WITH_NCCL)
  place_ = platform::CUDAPlace(place_id);
+#elif (defined WITH_ASCEND_CL)
+  place_ = platform::NPUPlace(place_id);
+#endif
  worker_ = DeviceWorkerFactory::CreateDeviceWorker(
      trainer_desc.device_worker_name());
  auto this_worker =

--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -9,7 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(WITH_ASCEND_CL)
 #include <float.h>
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"

--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -332,7 +332,8 @@ class PSGPUTrainer : public TrainerBase {
 };
 #endif

-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(WITH_ASCEND_CL)
 class PipelineTrainer : public TrainerBase {
 public:
  PipelineTrainer() {}

--- a/paddle/fluid/operators/cast_op_npu.cc
+++ b/paddle/fluid/operators/cast_op_npu.cc
@@ -92,6 +92,7 @@ REGISTER_OP_NPU_KERNEL(
    cast, ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int16_t>,
    ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int32_t>,
    ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
+    ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::CastNPUKernel<paddle::platform::NPUDeviceContext, bool>,
    ops::CastNPUKernel<paddle::platform::NPUDeviceContext, double>,
    ops::CastNPUKernel<paddle::platform::NPUDeviceContext, float>,

--- a/paddle/fluid/operators/expand_op_npu.cc
+++ b/paddle/fluid/operators/expand_op_npu.cc
@@ -79,6 +79,7 @@ class ExpandNPUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_NPU_KERNEL(
    expand, ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext,
                         paddle::platform::float16>);


--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -86,9 +86,11 @@ namespace ops = paddle::operators;
 REGISTER_OP_NPU_KERNEL(
    lookup_table_v2,
    ops::LookupTableV2NPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::LookupTableV2NPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::LookupTableV2NPUKernel<paddle::platform::NPUDeviceContext,
                                paddle::platform::float16>);

 REGISTER_OP_NPU_KERNEL(
    lookup_table_v2_grad, ops::LookupTableV2GradNPUKernel<float>,
+    ops::LookupTableV2GradNPUKernel<int>,
    ops::LookupTableV2GradNPUKernel<paddle::platform::float16>);
--- a/paddle/fluid/operators/slice_op_npu.cc
+++ b/paddle/fluid/operators/slice_op_npu.cc
@@ -124,11 +124,13 @@ namespace ops = paddle::operators;

 REGISTER_OP_NPU_KERNEL(
    slice, ops::SliceNPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::SliceNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::SliceNPUKernel<paddle::platform::NPUDeviceContext,
                        paddle::platform::float16>);

 REGISTER_OP_NPU_KERNEL(
    slice_grad,
    ops::SliceGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::SliceGradNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::SliceGradNPUKernel<paddle::platform::NPUDeviceContext,
                            paddle::platform::float16>);
--- a/python/paddle/distributed/fleet/meta_optimizers/common.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/common.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 from __future__ import print_function
+import os

 import paddle.fluid as fluid
 from paddle.fluid import core, unique_name
@@ -77,6 +78,7 @@ class CollectiveHelper(object):
        nranks = len(endpoints)
        other_endpoints = endpoints[:]
        other_endpoints.remove(current_endpoint)
+
        if rank == 0 and wait_port:
            wait_server_ready(other_endpoints)


--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -13,6 +13,7 @@

 from __future__ import print_function
 from __future__ import division
+import os

 import paddle.fluid as fluid
 from paddle.fluid import core, unique_name

--- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
@@ -365,8 +365,8 @@ class ShardingOptimizer(MetaOptimizerBase):
                  'w') as f:
            f.writelines(str(main_block.program))

-        self._wait()
-
+        if core.is_compiled_with_cuda():
+            self._wait()
        return optimize_ops, params_grads

    def _init_comm(self):

--- a/python/paddle/fluid/device_worker.py
+++ b/python/paddle/fluid/device_worker.py
@@ -433,7 +433,10 @@ class Section(DeviceWorker):
        # cfg.program_desc.CopyFrom(program.program._get_desc())
        place = pipeline_opt["place"]
        place_id = pipeline_opt["place_id"]
-        assert isinstance(place, core.CUDAPlace)
+        if core.is_compiled_with_cuda():
+            assert isinstance(place, core.CUDAPlace)
+        elif core.is_compiled_with_npu():
+            assert isinstance(place, core.NPUPlace)
        cfg.place = cfg.CUDAPlace
        cfg.place_id = place_id


--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -1451,8 +1451,12 @@ class Executor(object):
            for var in program.global_block().vars.values():
                if var.is_data:
                    data_vars.append(var)
-            dataset = paddle.fluid.DatasetFactory().create_dataset(
-                'FileInstantDataset')
+            if core.is_compiled_with_npu():
+                dataset = paddle.fluid.DatasetFactory().create_dataset(
+                    'InMemoryDataset')
+            else:
+                dataset = paddle.fluid.DatasetFactory().create_dataset(
+                    'FileInstantDataset')
            dataset.set_batch_size(1)
            dataset.set_thread(1)
            dataset.set_filelist(['None'])

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -4818,7 +4818,10 @@ class PipelineOptimizer(object):
        place_list = []
        for dev in device_list:
            dev_index = int(dev.split(":")[1])
-            place_list.append(core.CUDAPlace(0))
+            if core.is_compiled_with_cuda():
+                place_list.append(core.CUDAPlace(dev_index % 1))
+            elif core.is_compiled_with_npu():
+                place_list.append(core.NPUPlace(dev_index % 1))

        # Step6: Split startup program
        new_startup_program = self._split_startup_program(startup_program,
@@ -4837,7 +4840,10 @@ class PipelineOptimizer(object):
            self._accumulate_gradients(real_block)
            real_block._sync_with_cpp()

-        place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        if core.is_compiled_with_cuda():
+            place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        elif core.is_compiled_with_npu():
+            place_id = int(os.getenv("FLAGS_selected_npus", "0"))
        main_program._pipeline_opt = {
            "trainer": "PipelineTrainer",
            "device_worker": "Section",

--- a/python/paddle/fluid/transpiler/collective.py
+++ b/python/paddle/fluid/transpiler/collective.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import sys
 import math
 from functools import reduce
+import os

 import collections
 import six
@@ -101,6 +102,8 @@ class Collective(object):
        nranks = len(endpoints)
        other_endpoints = endpoints[:]
        other_endpoints.remove(current_endpoint)
+        block = program.global_block()
+
        if rank == 0 and wait_port:
            wait_server_ready(other_endpoints)


--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -133,9 +133,9 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint,
        return
    other_endpoints = endpoints[:]
    other_endpoints.remove(current_endpoint)
+    block = program.global_block()
    if rank == 0 and wait_port:
        wait_server_ready(other_endpoints)
-    block = program.global_block()
    if core.is_compiled_with_cuda():
        nccl_id_var = block.create_var(
            name=fluid.unique_name.generate('nccl_id'),