Unverified commit e923642e authored by YuanRisheng, committed by GitHub

[PHI Decoupling]Remove distribute header (#52202)

* remove distribute

* fix py3 bugs

* fix gpu-ps bugs

* fix compile bugs

* fix unittest bugs
Parent f22b9666
......@@ -65,3 +65,15 @@ if(WITH_CUSTOM_DEVICE)
comm_static_check
dense_tensor)
endif()
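# Moved here as part of the PHI decoupling; the same block is removed from its
# old location later in this diff.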
set(COMM_UTILS_DEPS process_group)
if(WITH_NCCL OR WITH_RCCL)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_nccl)
endif()
if(WITH_CUSTOM_DEVICE)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_custom)
endif()
cc_library(
processgroup_comm_utils
SRCS processgroup_comm_utils.cc
DEPS ${COMM_UTILS_DEPS})
......@@ -20,7 +20,7 @@ cc_library(
graph_helper
SRCS graph_helper.cc
DEPS graph program_utils scale_loss_grad_op_handle
grad_merge_all_reduce_op_handle)
grad_merge_all_reduce_op_handle collective_helper)
cc_library(
pass
SRCS pass.cc
......
......@@ -162,6 +162,10 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper ps_gpu_wrapper)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} processgroup_comm_utils)
if(WITH_NCCL OR WITH_RCCL)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} process_group_nccl)
endif()
if (WITH_GPU OR WITH_ROCM)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
endif()
......
......@@ -14,9 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/inplace_abn_op.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/sync_batch_norm_utils.h"
#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h"
#include "paddle/phi/kernels/sync_batch_norm_kernel.h"
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is used to compile margin_cross_entropy_op.cu.
// It will be deleted after margin_cross_entropy_op is moved to phi.
......@@ -22,19 +22,25 @@ namespace cub = hipcub;
#include <vector>
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/impl/softmax_kernel_impl.h"
#include "paddle/phi/kernels/margin_cross_entropy_grad_kernel.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif
// trace op include
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -467,6 +473,116 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
#endif
}
template <typename T, typename IndexT>
__global__ void CalculateGrad(T* logits_grad,
const T* loss_grad,
const T* logits,
const IndexT* label,
const float margin1,
const float margin2,
const float scale,
const int rank,
const int64_t N,
const int64_t D,
const int* class_interval_ptr) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
int start_index = class_interval_ptr[rank];
CUDA_KERNEL_LOOP(i, N * D) {
auto row = i / D;
auto col = i % D;
if ((col + start_index) == label[row]) {
logits_grad[i] = (logits_grad[i] - static_cast<T>(1.0)) * loss_grad[row];
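// The extra factor below is only needed when the angular margins are active
// (margin1 != 1 or margin2 != 0). With theta = acos(x), it is the chain-rule
// term d(cos(margin1 * theta + margin2)) / dx
//   = margin1 * sin(margin1 * theta + margin2) / sqrt(1 - x * x).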
if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) {
MPType dout = static_cast<MPType>(logits_grad[i]);
MPType one = static_cast<MPType>(1.0f);
MPType x = static_cast<MPType>(logits[i]);
MPType m1 = static_cast<MPType>(margin1);
MPType m2 = static_cast<MPType>(margin2);
MPType d = m1 * sin(m1 * acos(x) + m2) / sqrt(one - x * x);
logits_grad[i] = static_cast<T>(dout * d);
}
} else {
logits_grad[i] *= loss_grad[row];
}
if (fabs(scale - 1.0) > 1e-8) {
logits_grad[i] *= static_cast<T>(scale);
}
}
}
template <typename T, typename Context>
void MarginCrossEntropyGradKernel(const Context& dev_ctx,
const DenseTensor& logits,
const DenseTensor& label,
const DenseTensor& softmax,
const DenseTensor& loss_grad,
bool return_softmax,
int ring_id,
int rank,
int nranks,
float margin1,
float margin2,
float margin3,
float scale,
DenseTensor* logits_grad) {
const auto softmax_dims = softmax.dims();
const int axis = softmax_dims.size() - 1;
const int N = phi::funcs::SizeToAxis(axis, softmax_dims);
const int D = phi::funcs::SizeFromAxis(axis, softmax_dims);
if (return_softmax) {
phi::Copy<Context>(
dev_ctx, softmax, dev_ctx.GetPlace(), false, logits_grad);
} else {
logits_grad->ShareDataWith(softmax);
}
int blocks = NumBlocks(N * D);
int threads = kNumCUDAThreads;
const auto& label_type = label.dtype();
DenseTensor class_interval;
GetClassInterval<T, Context>(dev_ctx.stream(),
dev_ctx.GetPlace(),
dev_ctx,
ring_id,
rank,
nranks,
D,
&class_interval);
if (label_type == phi::DataType::INT32) {
typedef int32_t LabelT;
CalculateGrad<T, LabelT>
<<<blocks, threads, 0, dev_ctx.stream()>>>(logits_grad->data<T>(),
loss_grad.data<T>(),
logits.data<T>(),
label.data<LabelT>(),
margin1,
margin2,
scale,
rank,
N,
D,
class_interval.data<int>());
} else if (label_type == phi::DataType::INT64) {
typedef int64_t LabelT;
CalculateGrad<T, LabelT>
<<<blocks, threads, 0, dev_ctx.stream()>>>(logits_grad->data<T>(),
loss_grad.data<T>(),
logits.data<T>(),
label.data<LabelT>(),
margin1,
margin2,
scale,
rank,
N,
D,
class_interval.data<int>());
}
}
} // namespace phi
PD_REGISTER_KERNEL(margin_cross_entropy,
......@@ -476,3 +592,11 @@ PD_REGISTER_KERNEL(margin_cross_entropy,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(margin_cross_entropy_grad,
GPU,
ALL_LAYOUT,
phi::MarginCrossEntropyGradKernel,
float,
double,
phi::dtype::float16) {}
......@@ -12,33 +12,36 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/sync_batch_norm_kernel.h"
#include "paddle/fluid/operators/sync_batch_norm_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
#include "paddle/phi/kernels/sync_batch_norm_kernel.h"
// sparse header
#include "paddle/phi/kernels/sparse/empty_kernel.h"
namespace phi {
template <typename T, typename Context>
void SyncBatchNormKernel(const Context &ctx,
const DenseTensor &x,
const DenseTensor &mean,
const DenseTensor &variance,
const DenseTensor &scale,
const DenseTensor &bias,
void SyncBatchNormKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& mean,
const DenseTensor& variance,
const DenseTensor& scale,
const DenseTensor& bias,
bool is_test,
float momentum,
float epsilon_f,
const std::string &data_layout_str,
const std::string& data_layout_str,
bool use_global_stats,
bool trainable_statistics,
DenseTensor *y,
DenseTensor *mean_out,
DenseTensor *variance_out,
DenseTensor *saved_mean,
DenseTensor *saved_variance,
DenseTensor *reserve_space) {
DenseTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space) {
PADDLE_ENFORCE_EQ(use_global_stats,
false,
phi::errors::InvalidArgument(
......@@ -50,7 +53,7 @@ void SyncBatchNormKernel(const Context &ctx,
const bool trainable_stats = trainable_statistics;
const DataLayout layout = phi::StringToDataLayout(data_layout_str);
bool test_mode = is_test && (!trainable_statistics);
const auto &x_dims = x.dims();
const auto& x_dims = x.dims();
PADDLE_ENFORCE_GE(x_dims.size(),
2,
phi::errors::InvalidArgument(
......@@ -63,14 +66,14 @@ void SyncBatchNormKernel(const Context &ctx,
funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
int x_numel = x.numel();
const T *x_d = x.template data<T>();
const auto *s_d = scale.template data<BatchNormParamType<T>>();
const auto *b_d = bias.template data<BatchNormParamType<T>>();
const T* x_d = x.template data<T>();
const auto* s_d = scale.template data<BatchNormParamType<T>>();
const auto* b_d = bias.template data<BatchNormParamType<T>>();
T *y_d = ctx.template Alloc<T>(y);
T* y_d = ctx.template Alloc<T>(y);
const BatchNormParamType<T> *mean_data = nullptr;
const BatchNormParamType<T> *var_data = nullptr;
const BatchNormParamType<T>* mean_data = nullptr;
const BatchNormParamType<T>* var_data = nullptr;
auto stream = ctx.stream();
const int block = 512;
......@@ -90,7 +93,7 @@ void SyncBatchNormKernel(const Context &ctx,
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
auto *stats = reinterpret_cast<BatchNormParamType<T> *>(alloc_ptr->ptr());
auto* stats = reinterpret_cast<BatchNormParamType<T>*>(alloc_ptr->ptr());
const int threads = 256;
int grid = std::min(C, (max_threads + threads - 1) / threads);
if (layout == phi::DataLayout::kNCHW) {
......@@ -122,12 +125,12 @@ void SyncBatchNormKernel(const Context &ctx,
}
#endif
auto *est_mean_data = ctx.template Alloc<BatchNormParamType<T>>(mean_out);
auto *est_var_data =
auto* est_mean_data = ctx.template Alloc<BatchNormParamType<T>>(mean_out);
auto* est_var_data =
ctx.template Alloc<BatchNormParamType<T>>(variance_out);
auto *sv_mean_data = ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
auto *sv_inv_var_data =
auto* sv_mean_data = ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
auto* sv_inv_var_data =
ctx.template Alloc<BatchNormParamType<T>>(saved_variance);
// Note, Input('Mean')/Input('Variance') share variable with
......@@ -176,6 +179,127 @@ void SyncBatchNormKernel(const Context &ctx,
}
}
template <typename T, typename Context>
void SyncBatchNormGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
const paddle::optional<DenseTensor>& reserve_space,
const DenseTensor& y_grad,
float momentum,
float epsilon_f,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
SyncBatchNormGradFunctor<T, Context>(ctx,
&x,
nullptr,
scale,
bias,
saved_mean,
saved_variance,
y_grad,
epsilon_f,
data_layout_str,
x_grad,
scale_grad,
bias_grad);
}
} // namespace phi
namespace phi {
namespace sparse {
template <typename T, typename Context>
void SyncBatchNormCooKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& mean,
const DenseTensor& variance,
const DenseTensor& scale,
const DenseTensor& bias,
bool is_test,
float momentum,
float epsilon,
const std::string& data_layout,
bool use_global_stats,
bool trainable_statistics,
SparseCooTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space) {
EmptyLikeCooKernel<T, Context>(dev_ctx, x, y);
phi::SyncBatchNormKernel<T, Context>(dev_ctx,
x.values(),
mean,
variance,
scale,
bias,
is_test,
momentum,
epsilon,
data_layout,
use_global_stats,
trainable_statistics,
y->mutable_values(),
mean_out,
variance_out,
saved_mean,
saved_variance,
reserve_space);
y->SetIndicesDict(x.GetIndicesDict());
}
template <typename T, typename Context>
void SyncBatchNormCooGradKernel(
const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
const paddle::optional<DenseTensor>& reserve_space,
const SparseCooTensor& y_grad,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
SparseCooTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
EmptyLikeCooKernel<T, Context>(dev_ctx, x, x_grad);
*scale_grad = phi::EmptyLike<T, Context>(dev_ctx, scale);
*bias_grad = phi::EmptyLike<T, Context>(dev_ctx, bias);
phi::SyncBatchNormGradKernel<T, Context>(dev_ctx,
x.values(),
scale,
bias,
saved_mean,
saved_variance,
reserve_space,
y_grad.values(),
momentum,
epsilon,
data_layout,
is_test,
use_global_stats,
trainable_statistics,
x_grad->mutable_values(),
scale_grad,
bias_grad);
}
} // namespace sparse
} // namespace phi
#ifdef PADDLE_WITH_HIP
......@@ -216,3 +340,54 @@ PD_REGISTER_KERNEL(sync_batch_norm,
}
}
#endif
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(sync_batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::SyncBatchNormGradKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(sync_batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::SyncBatchNormGradKernel,
float,
double,
phi::dtype::float16) {}
#endif
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(sync_batch_norm_coo,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(sync_batch_norm_coo,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooKernel,
float,
double,
phi::dtype::float16) {}
#endif
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(sync_batch_norm_coo_grad,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooGradKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(sync_batch_norm_coo_grad,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooGradKernel,
float,
double,
phi::dtype::float16) {}
#endif
......@@ -227,10 +227,18 @@ if(WITH_GPU)
SRCS device_event_test.cc
DEPS device_event_gpu)
endif()
nv_library(
cuda_graph_with_memory_pool
SRCS cuda_graph_with_memory_pool.cc
DEPS ${DEVICE_EVENT_LIBS} device_context allocator phi_backends)
if(WITH_CUSTOM_DEVICE)
nv_library(
cuda_graph_with_memory_pool
SRCS cuda_graph_with_memory_pool.cc
DEPS ${DEVICE_EVENT_LIBS} device_event_custom_device device_context
allocator phi_backends)
else()
nv_library(
cuda_graph_with_memory_pool
SRCS cuda_graph_with_memory_pool.cc
DEPS ${DEVICE_EVENT_LIBS} device_context allocator phi_backends)
endif()
nv_test(
device_context_test
SRCS device_context_test.cu
......
......@@ -74,15 +74,3 @@ if(WITH_CUSTOM_DEVICE)
SRCS custom/capi_test.cc
DEPS phi_capi)
endif()
set(COMM_UTILS_DEPS process_group)
if(WITH_NCCL OR WITH_RCCL)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_nccl)
endif()
if(WITH_CUSTOM_DEVICE)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_custom)
endif()
cc_library(
processgroup_comm_utils
SRCS processgroup_comm_utils.cc
DEPS ${COMM_UTILS_DEPS})
......@@ -16,7 +16,7 @@ if(WITH_GLOO)
cc_library(
gloo_utils
SRCS gloo_utils.cc
DEPS gloo dense_tensor enforce)
DEPS gloo dense_tensor enforce tcp_store)
cc_library(
gloo_comm_context
......
......@@ -88,13 +88,11 @@ if(WITH_FLASHATTN)
endif()
if(WITH_NCCL OR WITH_RCCL)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} process_group_nccl
nccl_comm_context)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} nccl_comm_context)
endif()
if(WITH_GLOO)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} gloo_comm_context)
endif()
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_comm_utils)
if(WITH_CUDNN_FRONTEND)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cudnn-frontend)
endif()
......@@ -180,32 +178,24 @@ endif()
file(GLOB kernel_xpu "xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc")
add_library(phi_cpu ${kernel_cc})
kernel_declare("${kernel_cc}")
if(WITH_MKLDNN)
target_link_libraries(phi_cpu ${COMMON_KERNEL_DEPS}
get_kerneltype_forvar_utils)
else()
target_link_libraries(phi_cpu ${COMMON_KERNEL_DEPS})
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} get_kerneltype_forvar_utils)
endif()
set(ADD_PHI_KERNELS phi_cpu)
if(WITH_GPU OR WITH_ROCM)
if(WITH_GPU)
add_library(phi_gpu ${kernel_cu})
add_library(phi_gpu ${kernel_cu} ${kernel_cc})
if(WITH_CUTLASS)
add_dependencies(phi_gpu cutlass_codegen)
endif()
elseif(WITH_ROCM)
hip_add_library(phi_gpu STATIC ${kernel_cu})
hip_add_library(phi_gpu STATIC ${kernel_cu} ${kernel_cc})
endif()
kernel_declare("${kernel_cu}")
kernel_declare("${kernel_cc}")
target_link_libraries(phi_gpu ${COMMON_KERNEL_DEPS})
set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_gpu)
endif()
if(WITH_XPU)
elseif(WITH_XPU)
if(WITH_XPU_KP)
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/
DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/)
......@@ -215,15 +205,27 @@ if(WITH_XPU)
file(RENAME ${kernel} "${CMAKE_CURRENT_BINARY_DIR}/kps/${name}.kps")
endforeach()
file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.kps")
xpu_add_library(phi_xpu STATIC ${kernel_xpu} ${kernel_xpu_kps} DEPENDS
${COMMON_KERNEL_DEPS})
xpu_add_library(
phi_xpu
STATIC
${kernel_xpu}
${kernel_xpu_kps}
${kernel_cc}
DEPENDS
${COMMON_KERNEL_DEPS})
else()
add_library(phi_xpu ${kernel_xpu})
add_library(phi_xpu ${kernel_xpu} ${kernel_cc})
endif()
kernel_declare("${kernel_xpu}")
kernel_declare("${kernel_xpu_kps}")
kernel_declare("${kernel_cc}")
target_link_libraries(phi_xpu ${COMMON_KERNEL_DEPS})
set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_xpu)
else()
add_library(phi_cpu ${kernel_cc})
target_link_libraries(phi_cpu ${COMMON_KERNEL_DEPS})
kernel_declare("${kernel_cc}")
set(ADD_PHI_KERNELS phi_cpu)
endif()
set_property(GLOBAL PROPERTY PHI_KERNELS ${ADD_PHI_KERNELS})
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Old op includes; the fluid ones should be removed.
#ifdef PADDLE_WITH_HIP
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#else
#include <cub/cub.cuh>
#endif
#include <vector>
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/impl/softmax_kernel_impl.h"
#include "paddle/phi/kernels/margin_cross_entropy_grad_kernel.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif
#include "paddle/phi/backends/gpu/gpu_context.h"
namespace phi {
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;
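// NumBlocks caps the 1-D launch grid at kNumMaxinumNumBlocks; CUDA_KERNEL_LOOP
// in the kernels below is a grid-stride loop, so a capped grid still covers
// all N elements.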
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaxinumNumBlocks);
}
template <typename T, typename Context>
void GetClassInterval(const gpuStream_t& stream,
const phi::Place& place,
const Context& dev_ctx,
const int rid,
const int rank,
const int nranks,
const int D,
DenseTensor* class_interval) {
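// class_interval ends up as an inclusive prefix sum over nranks + 1 entries:
// class_interval[r] is the first global class index owned by rank r. With more
// than one rank, the per-rank class counts are first summed across the group
// (via ProcessGroup when one is registered for this ring id, otherwise via a
// raw ncclAllReduce on the global calculation stream).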
std::vector<int> shard_dim_vec(nranks + 1, 0);
shard_dim_vec[rank + 1] = D;
if (nranks <= 1) {
phi::TensorFromVector(shard_dim_vec, dev_ctx, class_interval);
return;
}
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
DenseTensor num_classes_per_device;
phi::TensorFromVector(shard_dim_vec, dev_ctx, &num_classes_per_device);
int* num_classes_per_device_ptr = num_classes_per_device.data<int>();
auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance();
if (map->has(rid)) {
// Use ProcessGroup
paddle::distributed::ProcessGroup* pg = map->get(rid);
std::vector<phi::DenseTensor> in_tensor;
std::vector<phi::DenseTensor> out_tensor;
in_tensor.push_back(num_classes_per_device);
out_tensor.push_back(num_classes_per_device);
paddle::distributed::AllreduceOptions opts;
opts.reduce_op = paddle::distributed::ReduceOp::SUM;
auto task = pg->AllReduce(in_tensor, out_tensor, opts);
task->Wait();
} else {
const auto& comm =
paddle::platform::NCCLCommContext::Instance().Get(rid, place);
// use global calculate stream
const auto calcu_stream =
static_cast<GPUContext*>(phi::DeviceContextPool::Instance().Get(place))
->stream();
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
num_classes_per_device_ptr,
num_classes_per_device_ptr,
num_classes_per_device.numel(),
phi::ToNCCLDataType(num_classes_per_device.dtype()),
ncclSum,
comm->comm(),
calcu_stream));
}
class_interval->Resize({nranks + 1});
auto class_interval_ptr = dev_ctx.template Alloc<int>(class_interval);
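// Standard two-phase cub call: the first InclusiveSum with a null workspace
// only queries the required temporary storage size, the second performs the
// actual scan.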
size_t cub_temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum<int*, int*>(
nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream);
auto cub_temp_storage =
phi::memory_utils::Alloc(place, cub_temp_storage_bytes);
cub::DeviceScan::InclusiveSum<int*, int*>(cub_temp_storage->ptr(),
cub_temp_storage_bytes,
num_classes_per_device_ptr,
class_interval_ptr,
nranks + 1,
stream);
return;
#endif
}
template <typename T, typename IndexT>
__global__ void CalculateGrad(T* logits_grad,
const T* loss_grad,
const T* logits,
const IndexT* label,
const float margin1,
const float margin2,
const float scale,
const int rank,
const int64_t N,
const int64_t D,
const int* class_interval_ptr) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
int start_index = class_interval_ptr[rank];
CUDA_KERNEL_LOOP(i, N * D) {
auto row = i / D;
auto col = i % D;
if ((col + start_index) == label[row]) {
logits_grad[i] = (logits_grad[i] - static_cast<T>(1.0)) * loss_grad[row];
if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) {
MPType dout = static_cast<MPType>(logits_grad[i]);
MPType one = static_cast<MPType>(1.0f);
MPType x = static_cast<MPType>(logits[i]);
MPType m1 = static_cast<MPType>(margin1);
MPType m2 = static_cast<MPType>(margin2);
MPType d = m1 * sin(m1 * acos(x) + m2) / sqrt(one - x * x);
logits_grad[i] = static_cast<T>(dout * d);
}
} else {
logits_grad[i] *= loss_grad[row];
}
if (fabs(scale - 1.0) > 1e-8) {
logits_grad[i] *= static_cast<T>(scale);
}
}
}
template <typename T, typename Context>
void MarginCrossEntropyGradKernel(const Context& dev_ctx,
const DenseTensor& logits,
const DenseTensor& label,
const DenseTensor& softmax,
const DenseTensor& loss_grad,
bool return_softmax,
int ring_id,
int rank,
int nranks,
float margin1,
float margin2,
float margin3,
float scale,
DenseTensor* logits_grad) {
const auto softmax_dims = softmax.dims();
const int axis = softmax_dims.size() - 1;
const int N = phi::funcs::SizeToAxis(axis, softmax_dims);
const int D = phi::funcs::SizeFromAxis(axis, softmax_dims);
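// The gradient is computed in place on top of the softmax values: take a copy
// when return_softmax is set, otherwise alias the softmax storage via
// ShareDataWith.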
if (return_softmax) {
phi::Copy<Context>(
dev_ctx, softmax, dev_ctx.GetPlace(), false, logits_grad);
} else {
logits_grad->ShareDataWith(softmax);
}
int blocks = NumBlocks(N * D);
int threads = kNumCUDAThreads;
const auto& label_type = label.dtype();
DenseTensor class_interval;
GetClassInterval<T, Context>(dev_ctx.stream(),
dev_ctx.GetPlace(),
dev_ctx,
ring_id,
rank,
nranks,
D,
&class_interval);
if (label_type == phi::DataType::INT32) {
typedef int32_t LabelT;
CalculateGrad<T, LabelT>
<<<blocks, threads, 0, dev_ctx.stream()>>>(logits_grad->data<T>(),
loss_grad.data<T>(),
logits.data<T>(),
label.data<LabelT>(),
margin1,
margin2,
scale,
rank,
N,
D,
class_interval.data<int>());
} else if (label_type == phi::DataType::INT64) {
typedef int64_t LabelT;
CalculateGrad<T, LabelT>
<<<blocks, threads, 0, dev_ctx.stream()>>>(logits_grad->data<T>(),
loss_grad.data<T>(),
logits.data<T>(),
label.data<LabelT>(),
margin1,
margin2,
scale,
rank,
N,
D,
class_interval.data<int>());
}
}
} // namespace phi
PD_REGISTER_KERNEL(margin_cross_entropy_grad,
GPU,
ALL_LAYOUT,
phi::MarginCrossEntropyGradKernel,
float,
double,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
namespace phi {
template <typename T, typename Context>
void SyncBatchNormGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
const paddle::optional<DenseTensor>& reserve_space,
const DenseTensor& y_grad,
float momentum,
float epsilon_f,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
SyncBatchNormGradFunctor<T, Context>(ctx,
&x,
nullptr,
scale,
bias,
saved_mean,
saved_variance,
y_grad,
epsilon_f,
data_layout_str,
x_grad,
scale_grad,
bias_grad);
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(sync_batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::SyncBatchNormGradKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(sync_batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::SyncBatchNormGradKernel,
float,
double,
phi::dtype::float16) {}
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/sync_batch_norm_grad_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h"
namespace phi {
namespace sparse {
template <typename T, typename Context>
void SyncBatchNormCooGradKernel(
const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
const paddle::optional<DenseTensor>& reserve_space,
const SparseCooTensor& y_grad,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
SparseCooTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
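// The sparse COO grad kernel allocates outputs shaped like its inputs and then
// runs the dense sync_batch_norm grad kernel on the stored non-zero values.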
EmptyLikeCooKernel<T, Context>(dev_ctx, x, x_grad);
*scale_grad = phi::EmptyLike<T, Context>(dev_ctx, scale);
*bias_grad = phi::EmptyLike<T, Context>(dev_ctx, bias);
phi::SyncBatchNormGradKernel<T, Context>(dev_ctx,
x.values(),
scale,
bias,
saved_mean,
saved_variance,
reserve_space,
y_grad.values(),
momentum,
epsilon,
data_layout,
is_test,
use_global_stats,
trainable_statistics,
x_grad->mutable_values(),
scale_grad,
bias_grad);
}
} // namespace sparse
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(sync_batch_norm_coo_grad,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooGradKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(sync_batch_norm_coo_grad,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooGradKernel,
float,
double,
phi::dtype::float16) {}
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/sync_batch_norm_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
#include "paddle/phi/kernels/sync_batch_norm_kernel.h"
namespace phi {
namespace sparse {
template <typename T, typename Context>
void SyncBatchNormCooKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& mean,
const DenseTensor& variance,
const DenseTensor& scale,
const DenseTensor& bias,
bool is_test,
float momentum,
float epsilon,
const std::string& data_layout,
bool use_global_stats,
bool trainable_statistics,
SparseCooTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space) {
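// The sparse COO kernel normalizes only the stored non-zero values with the
// dense sync_batch_norm kernel and reuses the sparsity pattern of x for y.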
EmptyLikeCooKernel<T, Context>(dev_ctx, x, y);
phi::SyncBatchNormKernel<T, Context>(dev_ctx,
x.values(),
mean,
variance,
scale,
bias,
is_test,
momentum,
epsilon,
data_layout,
use_global_stats,
trainable_statistics,
y->mutable_values(),
mean_out,
variance_out,
saved_mean,
saved_variance,
reserve_space);
y->SetIndicesDict(x.GetIndicesDict());
}
} // namespace sparse
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(sync_batch_norm_coo,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(sync_batch_norm_coo,
GPU,
ALL_LAYOUT,
phi::sparse::SyncBatchNormCooKernel,
float,
double,
phi::dtype::float16) {}
#endif