diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index e1ce705533ab4ba1c75d8f656683608365e97907..3d8a5ab21f00fcc4137d177b741023a827e325d7 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -33,6 +33,7 @@ if(NOT WIN32)
     endif()
     if(WITH_CNCL)
         cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits)
+        cc_library(reducer SRCS reducer.cc DEPS layer)
     endif()
     if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL)
         cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits)
@@ -41,7 +42,7 @@
 endif(NOT WIN32)
 if(WITH_GLOO)
     cc_library(imperative_gloo_context SRCS gloo_context.cc DEPS collective_helper device_context tensor var_type_traits)
-    if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) ))
+    if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL OR WITH_CNCL) ))
         cc_library(reducer SRCS reducer.cc DEPS layer)
     endif()
 endif()
diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index 3a6365b2af21ae9012fe37293699caed9bb23855..fec9afbf3b403ca2fd45633326c7f7dec46e1243 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -31,7 +31,7 @@ namespace imperative {
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||     \
     defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL)
 // div the nranks
 void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
   framework::Tensor *tensor =
@@ -67,6 +67,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
 #ifdef PADDLE_WITH_XPU_BKCL
     // TODO(liuyuhui) support xpu about div nranks in the future
 #endif
+  } else if (platform::is_mlu_place(tensor->place())) {
+    // TODO(zhangna)
+    VLOG(4) << "divnrank for mlu not supported yet";
   }
 }
@@ -222,6 +225,56 @@
 }
 #endif
 
+#ifdef PADDLE_WITH_CNCL
+// context is used to select the stream for concat
+template <>
+void ConcatTensorsWithType(
+    const platform::MLUDeviceContext &context,
+    const std::vector<framework::Tensor> &dense_tensors_,
+    framework::Variable *p_dense_contents,
+    framework::proto::VarType::Type type) {
+  switch (type) {
+    case framework::proto::VarType::FP16:
+      ConcatTensorsForAllReduce<platform::MLUDeviceContext, platform::float16>(
+          context, dense_tensors_, p_dense_contents);
+      break;
+    case framework::proto::VarType::FP32:
+      ConcatTensorsForAllReduce<platform::MLUDeviceContext, float>(
+          context, dense_tensors_, p_dense_contents);
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Data type (%s) is not supported when it concats tensors for "
+          "allreduce.",
+          framework::DataTypeToString(type)));
+  }
+}
+
+// context is used to select the stream for split
+template <>
+void SplitTensorsWithType(
+    const platform::MLUDeviceContext &context,
+    framework::Variable *p_dense_contents,
+    std::vector<framework::Tensor> *p_dense_tensors,
+    framework::proto::VarType::Type type) {
+  switch (type) {
+    case framework::proto::VarType::FP16:
+      SplitTensorsForAllReduce<platform::MLUDeviceContext, platform::float16>(
+          context, p_dense_contents, p_dense_tensors);
+      break;
+    case framework::proto::VarType::FP32:
+      SplitTensorsForAllReduce<platform::MLUDeviceContext, float>(
+          context, p_dense_contents, p_dense_tensors);
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Data type (%s) is not supported when it splits tensors for "
+          "allreduce.",
+          framework::DataTypeToString(type)));
+  }
+}
+#endif
+
 void Group::ConcatTensors(const platform::DeviceContext &context) {
   auto place = context.GetPlace();
   if (platform::is_gpu_place(place)) {
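The two specializations above mirror the existing NCCL/BKCL/GLOO ones in reducer.cc: one specialization per device context, with a runtime switch on the dtype tag selecting the compile-time element type for the shared *ForAllReduce helpers. A minimal, self-contained sketch of that dispatch pattern (all names here are illustrative stand-ins, not Paddle's):

#include <cstdio>
#include <stdexcept>

enum class VarType { FP16, FP32 };

struct MLUContext {};  // stand-in for platform::MLUDeviceContext

// Generic worker, parameterized by device context and element type.
template <typename DeviceContext, typename T>
void ConcatForAllReduce(const DeviceContext& /*ctx*/) {
  std::printf("concat with sizeof(T) = %zu\n", sizeof(T));
}

// Per-device entry point: maps the runtime dtype tag to a compile-time T.
template <typename DeviceContext>
void ConcatWithType(const DeviceContext& ctx, VarType type) {
  switch (type) {
    case VarType::FP16:
      ConcatForAllReduce<DeviceContext, unsigned short>(ctx);  // fp16 stand-in
      break;
    case VarType::FP32:
      ConcatForAllReduce<DeviceContext, float>(ctx);
      break;
    default:
      throw std::runtime_error("unsupported dtype");
  }
}

int main() {
  MLUContext ctx;
  ConcatWithType(ctx, VarType::FP32);  // prints: concat with sizeof(T) = 4
  return 0;
}

The same shape recurs just below in Group::ConcatTensors/SplitTensors, which layer a runtime place check on top of the compile-time build guards.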
@@ -253,6 +306,16 @@
     PADDLE_THROW(platform::errors::PermissionDenied(
         "Paddle can't concat npu grads since it's not compiled with HCCL,"
         "Please recompile or reinstall Paddle with HCCL support."));
+#endif
+  } else if (platform::is_mlu_place(place)) {
+#ifdef PADDLE_WITH_CNCL
+    ConcatTensorsWithType(
+        static_cast<const platform::MLUDeviceContext &>(context),
+        dense_tensors_, &dense_contents_, dtype_);
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Paddle can't concat mlu grads since it's not compiled with CNCL,"
+        "Please recompile or reinstall Paddle with CNCL support."));
 #endif
   } else if (platform::is_cpu_place(place)) {
     ConcatTensorsWithType(
@@ -295,6 +358,16 @@
     PADDLE_THROW(platform::errors::PermissionDenied(
         "Paddle can't split npu grad since it's not compiled with HCCL,"
         "Please recompile or reinstall Paddle with HCCL support."));
+#endif
+  } else if (platform::is_mlu_place(place)) {
+#ifdef PADDLE_WITH_CNCL
+    SplitTensorsWithType(
+        static_cast<const platform::MLUDeviceContext &>(context),
+        &dense_contents_, &dense_tensors_, dtype_);
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Paddle can't split mlu grad since it's not compiled with CNCL,"
+        "Please recompile or reinstall Paddle with CNCL support."));
 #endif
   } else if (platform::is_cpu_place(place)) {
     SplitTensorsWithType(
@@ -746,6 +819,11 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
       // TODO(liuyuhui) support XPU set constant
       VLOG(3) << "XPU doesn't support set_constant";
     }
+#elif defined(PADDLE_WITH_CNCL)
+    if (platform::is_mlu_place(group_tensor.place())) {
+      // TODO(liuyuhui) support MLU set constant
+      VLOG(3) << "MLU doesn't support set_constant";
+    }
 #else
     auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_);
     if (HasGrad(var_index)) {
@@ -846,12 +924,13 @@ void Reducer::MarkGroupReady(size_t group_index) {
         cv_.notify_all();
       }
     });
-#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \
-    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL)
+#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) ||    \
+    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \
+    defined(PADDLE_WITH_CNCL)
   FusedAllReduceSchedule(run_order, group, next_group_);
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet(
-      "Not compiled with BKCL or NCCL or GLOO."));
+      "Not compiled with BKCL or NCCL or CNCL or GLOO."));
 #endif
   }
 }
diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h
index cca773b840c279f05cd6bcd0ed82fda7fdd55a25..9fac4b41cbde01f365dcc603844b06c473a58843 100644
--- a/paddle/fluid/imperative/reducer.h
+++ b/paddle/fluid/imperative/reducer.h
@@ -45,7 +45,7 @@ namespace imperative {
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||     \
     defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL)
 
 template <typename T>
 struct DivNRanksFunctor {
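DivNRanksFunctor, guarded above, encodes the usual data-parallel convention: gradients are allreduce-summed across ranks and then divided by nranks to produce a mean (the MLU branch of Group::DivNRanks is still a TODO in this patch). A CPU-only sketch of the arithmetic, not Paddle's implementation:

#include <cassert>
#include <vector>

// Divides an allreduce-summed gradient by the rank count, turning the
// sum into a mean.
template <typename T>
struct DivNRanks {
  T nranks;
  void operator()(std::vector<T>* grad) const {
    for (auto& v : *grad) v /= nranks;
  }
};

int main() {
  std::vector<float> g = {2.f, 4.f};  // gradient summed over 2 ranks
  DivNRanks<float>{2.f}(&g);
  assert(g[0] == 1.f && g[1] == 2.f);
  return 0;
}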
diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt
index e4f1cfdb3baeed9b5945b7843b6593528df48c29..09de0106ed6190c5f627ba9fb7cc038593b5088a 100644
--- a/paddle/fluid/imperative/tests/CMakeLists.txt
+++ b/paddle/fluid/imperative/tests/CMakeLists.txt
@@ -21,6 +21,6 @@ cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place)
 cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy)
 cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy)
 cc_test(test_eager SRCS test_eager.cc DEPS tracer layer prepared_operator mul_op)
-if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL)
+if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_CNCL)
   cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy)
 endif()
diff --git a/paddle/fluid/imperative/tests/test_group.cc b/paddle/fluid/imperative/tests/test_group.cc
index 6c304278d21fde7af093b25cdd8f62a1d4528d31..5e674af1a08a87c11bfab1080be42e623661b38e 100644
--- a/paddle/fluid/imperative/tests/test_group.cc
+++ b/paddle/fluid/imperative/tests/test_group.cc
@@ -72,8 +72,10 @@ void GroupConcatSplit(Place place, size_t size) {
       value.push_back(static_cast<T>(1.0 * j));
     }
 
-    if (std::is_same<Place, platform::CUDAPlace>::value) {
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    if (std::is_same<Place, platform::CUDAPlace>::value ||
+        std::is_same<Place, platform::MLUPlace>::value) {
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_CNCL)
       paddle::memory::Copy(place, data, cpu_place, value.data(),
                            sizeof(T) * value.size(), 0);
 #endif
@@ -180,5 +182,19 @@ TEST(TestGroup, TestXPUConcatSplit) {
 }
 #endif
 
+#if defined(PADDLE_WITH_CNCL)
+TEST(TestGroup, TestMLUConcatSplit) {
+  platform::MLUPlace mlu_place(0);
+  platform::CPUPlace cpu_place;
+
+  int size = 3;
+  GroupConcatSplit<float>(cpu_place, size);
+  GroupConcatSplit<float>(mlu_place, size);
+
+  size = 15;
+  GroupConcatSplit<float>(cpu_place, size);
+  GroupConcatSplit<float>(mlu_place, size);
+}
+#endif
 }  // namespace imperative
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 14b12ca3acb19a13c8a5df51e35713a0132a83ef..bce927c32ddf7e9c78f7c2ba1be50e6929426d4f 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -5,6 +5,8 @@ endif()
 
 # please add new math_library in alphabetical order
 if (WITH_ASCEND_CL)
 math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner)
+elseif (WITH_MLU)
+math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
 else()
 math_library(concat_and_split DEPS concat_and_split_functor)
 endif()
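The new TestMLUConcatSplit case exercises the same invariant as the CUDA and XPU variants: concatenating a group of dense tensors and then splitting the result back against the original shapes must be lossless. A host-side sketch of that round-trip check with plain vectors (illustrative only):

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  const std::vector<std::vector<float>> tensors = {{0, 1, 2}, {3, 4}, {5}};

  // "concat": pack every dense tensor into one flat buffer
  std::vector<float> fused;
  for (const auto& t : tensors) fused.insert(fused.end(), t.begin(), t.end());

  // "split": recover each tensor using the original sizes as reference shapes
  std::vector<std::vector<float>> restored;
  std::size_t offset = 0;
  for (const auto& t : tensors) {
    restored.emplace_back(fused.data() + offset,
                          fused.data() + offset + t.size());
    offset += t.size();
  }

  assert(restored == tensors);  // the round trip must be exact
  return 0;
}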
diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc
index 46126ac59c892787d2f63956983404843e518ae7..c9308d27c0a3490d9c0094f45a1a9c2d894bbf57 100644
--- a/paddle/fluid/operators/math/concat_and_split.cc
+++ b/paddle/fluid/operators/math/concat_and_split.cc
@@ -18,6 +18,9 @@ limitations under the License. */
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #endif
+#ifdef PADDLE_WITH_MLU
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+#endif
 
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
@@ -226,6 +229,90 @@
 };
 #endif
 
+#ifdef PADDLE_WITH_MLU
+template <typename T>
+class ConcatFunctor<platform::MLUDeviceContext, T> {
+ public:
+  void operator()(const platform::MLUDeviceContext& context,
+                  const std::vector<framework::Tensor>& input, int axis,
+                  framework::Tensor* output) {
+    int dev_id = context.GetPlace().GetDeviceId();
+    platform::MLUDeviceGuard guard(dev_id);
+
+    auto ins_size = input.size();
+
+    const int axis_t = axis;
+    const int ins_size_t = ins_size;
+    auto place = context.GetPlace();
+    output->mutable_data<T>(place);
+
+    // wrap each input in a CNNL tensor descriptor and collect the raw
+    // descriptor/data pointers the C API expects
+    std::vector<const void*> inputs;
+    std::vector<MLUCnnlTensorDesc> input_descs;
+    std::vector<cnnlTensorDescriptor_t> desc_vector;
+    for (size_t i = 0; i < ins_size; i++) {
+      input_descs.emplace_back(MLUCnnlTensorDesc(
+          input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype())));
+      desc_vector.push_back(input_descs.back().get());
+      inputs.push_back(input[i].data());
+    }
+    // describe the fused output tensor
+    MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
+                                  ToCnnlDataType(output->dtype()));
+
+    // cnnlConcat packs all inputs into the fused output along axis
+    MLUCnnl::Concat(context, ins_size_t, axis_t, desc_vector.data(),
+                    inputs.data(), output_desc.get(), GetBasePtr(output));
+  }
+};
+
+template <typename T>
+class SplitFunctor<platform::MLUDeviceContext, T> {
+ public:
+  void operator()(const platform::MLUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs) {
+    if (input.numel() == 0) {
+      return;
+    }
+
+    int dev_id = context.GetPlace().GetDeviceId();
+    platform::MLUDeviceGuard guard(dev_id);
+
+    auto in_dims = input.dims();
+    auto out_size = outputs->size();
+
+    std::vector<framework::DDim> outs_dims(out_size, in_dims);
+    for (size_t i = 0; i < out_size; ++i) {
+      outs_dims[i][axis] = ref_inputs[i]->dims()[axis];
+    }
+
+    // resize the outputs and collect their descriptors and base pointers
+    std::vector<void*> vct_tensor;
+    std::vector<MLUCnnlTensorDesc> output_descs;
+    std::vector<cnnlTensorDescriptor_t> desc_vector;
+    for (size_t i = 0; i < out_size; i++) {
+      (*outputs)[i]->Resize(outs_dims[i]);
+      (*outputs)[i]->mutable_data<T>(context.GetPlace());
+      output_descs.emplace_back(
+          MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY,
+                            ToCnnlDataType((*outputs)[i]->dtype())));
+      desc_vector.push_back(output_descs.back().get());
+      vct_tensor.push_back(GetBasePtr((*outputs)[i]));
+    }
+    // describe the fused input tensor
+    MLUCnnlTensorDesc input_desc(input, CNNL_LAYOUT_ARRAY,
+                                 ToCnnlDataType(input.dtype()));
+
+    // cnnlSplit scatters the fused input into the outputs along axis
+    MLUCnnl::Split(context, out_size, axis, input_desc.get(), input.data(),
+                   desc_vector.data(), vct_tensor.data());
+  }
+};
+#endif
+
 #define DEFINE_FUNCTOR(type)                                      \
   template class ConcatFunctor<platform::CPUDeviceContext, type>; \
   template class SplitFunctor<platform::CPUDeviceContext, type>;
@@ -248,6 +335,19 @@ DEFINE_XPU_FUNCTOR(float)
 FOR_ALL_TYPES(DEFINE_NPU_FUNCTOR)
 #endif
 
+#ifdef PADDLE_WITH_MLU
+#define DEFINE_MLU_FUNCTOR(type)                                  \
+  template class ConcatFunctor<platform::MLUDeviceContext, type>; \
+  template class SplitFunctor<platform::MLUDeviceContext, type>;
+DEFINE_MLU_FUNCTOR(float)
+DEFINE_MLU_FUNCTOR(platform::float16)
+DEFINE_MLU_FUNCTOR(int64_t)
+DEFINE_MLU_FUNCTOR(bool)
+DEFINE_MLU_FUNCTOR(int)
+DEFINE_MLU_FUNCTOR(int8_t)
+DEFINE_MLU_FUNCTOR(int16_t)
+DEFINE_MLU_FUNCTOR(uint8_t)
+#endif
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
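Both functors above marshal RAII descriptor wrappers into the raw pointer arrays the CNNL C API consumes, keeping the wrappers alive in a side vector for the duration of the call. A sketch of that pattern with stand-in types (not the real CNNL/Paddle ones; the reserve matters here only because this sketch stores descriptors by value):

#include <vector>

struct RawDesc {};    // stand-in for cnnlTensorDescriptor_t
struct DescWrapper {  // stand-in for MLUCnnlTensorDesc (owns by value here)
  RawDesc d;
  RawDesc* get() { return &d; }
};

void c_api_call(int n, RawDesc* const* descs) {  // stand-in for cnnlConcat
  (void)n;
  (void)descs;
}

int main() {
  const int n = 4;
  std::vector<DescWrapper> owners;  // keeps descriptors alive across the call
  std::vector<RawDesc*> raw;        // contiguous pointer array for the C API
  owners.reserve(n);                // by-value storage: reallocation would
                                    // invalidate pointers handed out by get()
  for (int i = 0; i < n; ++i) {
    owners.emplace_back();
    raw.push_back(owners.back().get());
  }
  c_api_call(n, raw.data());
  return 0;
}

In Paddle's wrapper the descriptor handle itself lives behind the wrapper object, so the aliveness concern above is specific to this sketch's by-value storage.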
diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc
index 9de03582cbbf53e843e5f4531a6da6c1c2a87dd5..1fdaa153e3c27ed1a83696bf03d68dbfd2b93ae9 100644
--- a/paddle/fluid/operators/mlu/mlu_baseop.cc
+++ b/paddle/fluid/operators/mlu/mlu_baseop.cc
@@ -499,6 +499,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() {
                                   output_desc, output));
 }
 
+/* static */ void MLUCnnl::Concat(const MLUDeviceContext& dev_ctx,
+                                  const int pack_num, const int axis,
+                                  const cnnlTensorDescriptor_t inputs_desc[],
+                                  const void* const inputs[],
+                                  const cnnlTensorDescriptor_t output_desc,
+                                  void* output) {
+  cnnlHandle_t handle = dev_ctx.cnnl_handle();
+
+  size_t workspace_size = 0;
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size));
+
+  Tensor workspace(paddle::experimental::DataType::INT8);
+  workspace.Resize(framework::DDim({static_cast<int64_t>(workspace_size)}));
+  void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlConcat(handle, pack_num, axis, inputs_desc,
+                                        inputs, workspace_ptr, workspace_size,
+                                        output_desc, output));
+}
+
 /* static */ void MLUCnnl::Div(
     const ExecutionContext& ctx, cnnlComputationPreference_t prefer,
     const cnnlTensorDescriptor_t in0_desc, const void* in0,
@@ -977,6 +998,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() {
                                  output_descs, output_ptrs));
 }
 
+/* static */ void MLUCnnl::Split(const MLUDeviceContext& dev_ctx,
+                                 int split_num, int axis,
+                                 const cnnlTensorDescriptor_t input_desc,
+                                 const void* input_ptr,
+                                 const cnnlTensorDescriptor_t output_descs[],
+                                 void* output_ptrs[]) {
+  cnnlHandle_t handle = dev_ctx.cnnl_handle();
+
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size));
+
+  Tensor workspace(paddle::experimental::DataType::INT8);
+  workspace.Resize(framework::DDim({static_cast<int64_t>(workspace_size)}));
+  void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSplit(handle, split_num, axis, input_desc,
+                                       input_ptr, workspace_ptr,
+                                       workspace_size, output_descs,
+                                       output_ptrs));
+}
+
 /* static */ void MLUCnnl::GatherFunctor(
     const ExecutionContext& ctx, const int axis, const int batch_dims,
     const cnnlTensorDescriptor_t params_desc, const void* params,
diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h
index 2a54a8392c7c5bfbf450ee9351a9fda866a07663..b55b10686e92e2b1b5b3a7390289f8329ac04a04 100644
--- a/paddle/fluid/operators/mlu/mlu_baseop.h
+++ b/paddle/fluid/operators/mlu/mlu_baseop.h
@@ -403,6 +403,11 @@ class MLUCnnl {
                      const void* const inputs[],
                      const cnnlTensorDescriptor_t output_desc, void* output);
 
+  static void Concat(const MLUDeviceContext& dev_ctx, const int pack_num,
+                     const int axis,
+                     const cnnlTensorDescriptor_t inputs_desc[],
+                     const void* const inputs[],
+                     const cnnlTensorDescriptor_t output_desc, void* output);
+
   static void Cast(const ExecutionContext& ctx, cnnlCastDataType_t cast_type,
                    const cnnlTensorDescriptor_t input_desc, const void* input,
                    const cnnlTensorDescriptor_t output_desc, void* output);
@@ -566,6 +571,12 @@ class MLUCnnl {
                     const cnnlTensorDescriptor_t output_descs[],
                     void* output_ptrs[]);
 
+  static void Split(const MLUDeviceContext& dev_ctx, int split_num, int axis,
+                    const cnnlTensorDescriptor_t input_desc,
+                    const void* input_ptr,
+                    const cnnlTensorDescriptor_t output_descs[],
+                    void* output_ptrs[]);
+
   static void Scale(const ExecutionContext& ctx, const int axis,
                     const cnnlTensorDescriptor_t input_desc, const void* input,
                     const cnnlTensorDescriptor_t alpha_desc, const void* alpha,
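Both new wrappers follow CNNL's two-phase workspace idiom visible above: query the required scratch size (cnnlGetConcatWorkspaceSize / cnnlGetSplitWorkspaceSize), allocate a buffer, then hand buffer and size to the compute call. Note the real code allocates the workspace as an INT8 framework::Tensor on the device, so the scratch lives in MLU memory, not host memory. A host-side sketch of the control flow with stand-in functions:

#include <cstddef>
#include <vector>

// stand-in for cnnlGetConcatWorkspaceSize: reports required scratch bytes
std::size_t QueryWorkspaceSize(int pack_num) {
  return static_cast<std::size_t>(pack_num) * 64;
}

// stand-in for cnnlConcat: consumes the caller-provided scratch buffer
void Compute(void* workspace, std::size_t workspace_size) {
  (void)workspace;
  (void)workspace_size;
}

int main() {
  const int pack_num = 8;
  const std::size_t size = QueryWorkspaceSize(pack_num);  // phase 1: query
  std::vector<unsigned char> workspace(size);             // phase 2: allocate
  Compute(workspace.data(), size);                        // phase 3: compute
  return 0;
}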
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index c92d468f3462c92cd0631383996012afb6edb46b..af29aac6b9052877283271abc12f4dc1da6b8a3e 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -109,6 +109,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
       auto& npu_ctx = reinterpret_cast<const platform::NPUDeviceContext&>(ctx);
       memory::Copy(npu_place, dst + i * dst_after, npu_place,
                    src + i * src_after, sizeof(T) * size, npu_ctx.stream());
+#elif defined(PADDLE_WITH_MLU)
+      auto& mlu_place = place;
+      auto& mlu_ctx = reinterpret_cast<const platform::MLUDeviceContext&>(ctx);
+      memory::Copy(mlu_place, dst + i * dst_after, mlu_place,
+                   src + i * src_after, sizeof(T) * size, mlu_ctx.stream());
 #else
       PADDLE_THROW(platform::errors::PreconditionNotMet(
           "Paddle is not compiled with GPU."));
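StridedNumelCopyWithAxis views src and dst as [before, src_after] and [before, dst_after] row-major buffers and copies size elements per row; the new branch only changes how each row copy is issued (through the MLU stream via memory::Copy). A host-only sketch of the index arithmetic (illustrative):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Copies `size` elements per row from a [before, src_after] view into a
// [before, dst_after] view, mimicking StridedNumelCopyWithAxis on CPU.
void StridedCopy(const float* src, int64_t src_after, float* dst,
                 int64_t dst_after, int64_t before, int64_t size) {
  assert(size <= src_after && size <= dst_after);
  for (int64_t i = 0; i < before; ++i) {
    std::memcpy(dst + i * dst_after, src + i * src_after,
                sizeof(float) * size);
  }
}

int main() {
  // Copy the first 2 columns of a 2x3 matrix into a 2x2 matrix.
  std::vector<float> src = {0, 1, 2, 3, 4, 5};
  std::vector<float> dst(4);
  StridedCopy(src.data(), /*src_after=*/3, dst.data(), /*dst_after=*/2,
              /*before=*/2, /*size=*/2);
  assert(dst[0] == 0 && dst[1] == 1 && dst[2] == 3 && dst[3] == 4);
  return 0;
}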