Unverified commit 767050d9, authored by Yiqun Liu, committed by GitHub

Implement the grad and enhance the cache of norm_convolution fusion ops. (#36168)

Parent: b3d2dc7b
@@ -15,8 +15,10 @@ limitations under the License. */
 #pragma once
 
 #include <algorithm>
+#include <mutex>
 #include <unordered_map>
 #include <vector>
+#include "glog/logging.h"
 
 namespace paddle {
 namespace framework {
......
@@ -14,10 +14,8 @@ limitations under the License. */
 #pragma once
 
-#include <string>
 #include <vector>
-#include "paddle/fluid/platform/cudnn_desc.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/framework/operator_kernel_configs.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -41,12 +39,9 @@ class CudnnFusionOp {
   }
 
   ~CudnnFusionOp() {
-    // New 'fused op' descriptor destruction
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_));
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_));
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_));
+    dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_);
+    dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_);
+    dynload::cudnnDestroyFusedOpsPlan(op_);
   }
 
   // Execute fused op
@@ -121,41 +116,49 @@ class CudnnFusionOp {
   // Get the workspace, which is required before Execute().
   size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) {
-    size_t workspace_bytes = 0U;
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan(
-        cudnn_handle, op_, op_const_params_, &workspace_bytes));
-    plan_created_ = true;
-    return workspace_bytes;
+    if (!plan_created_) {
+      workspace_bytes_ = 0U;
+      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan(
+          cudnn_handle, op_, op_const_params_, &workspace_bytes_));
+      plan_created_ = true;
+    }
+    return workspace_bytes_;
   }
 
  private:
   bool plan_created_;
+  size_t workspace_bytes_;
 
   cudnnFusedOpsPlan_t op_;
   cudnnFusedOpsConstParamPack_t op_const_params_;
   cudnnFusedOpsVariantParamPack_t op_variant_params_;
 };
 
-static inline std::vector<int> GetStrides(const std::vector<int> &shape) {
-  if (shape.size() < 1) {
-    return {};
-  }
-  int dim = static_cast<int>(shape.size());
-  std::vector<int> pro_shape(shape);
-  std::vector<int> strides(dim);
-  int temp = pro_shape[1];
-  pro_shape.erase(pro_shape.begin() + 1);
-  pro_shape.push_back(temp);
-  strides.back() = 1;
-  for (int i = dim - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * pro_shape[i + 1];
-  }
-  strides.pop_back();
-  strides.insert(strides.begin() + 1, 1);
-  return strides;
-}
-
-static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; }
+class CudnnFusionOpCache {
+ public:
+  static CudnnFusionOpCache &Instance() {
+    static CudnnFusionOpCache instance;
+    return instance;
+  }
+
+  framework::AlgorithmsCache<CudnnFusionOp *> *GetForward() {
+    return &forward_cache_;
+  }
+  framework::AlgorithmsCache<CudnnFusionOp *> *GetBackward() {
+    return &backward_cache_;
+  }
+
+ private:
+  CudnnFusionOpCache() {}
+  ~CudnnFusionOpCache() {
+    // Need to delete the memory of cache.
+  }
+  CudnnFusionOpCache(const CudnnFusionOpCache &) {}
+
+ private:
+  framework::AlgorithmsCache<CudnnFusionOp *> forward_cache_;
+  framework::AlgorithmsCache<CudnnFusionOp *> backward_cache_;
+};
+
 #endif  // CUDNN_VERSION >= 8000
 }  // namespace operators
......
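The hunks above replace per-instance plan construction with a process-wide singleton cache: `CudnnFusionOpCache` owns one `framework::AlgorithmsCache<CudnnFusionOp *>` for the forward fused op and one for the backward (wgrad) fused op, and the creation callback passed to `GetAlgorithm` only runs on a cache miss. The following standalone sketch illustrates the same pattern in plain C++; it is not code from this commit, and the `FakePlan`/`PlanCache` names, the string key, and the `std::map` storage are simplifications standing in for `CudnnFusionOp` and `AlgorithmsCache`.

```cpp
#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

// Stand-in for CudnnFusionOp: an expensive-to-create "plan" object.
struct FakePlan {
  explicit FakePlan(std::string k) : key(std::move(k)) {}
  std::string key;
};

// Simplified analogue of CudnnFusionOpCache + framework::AlgorithmsCache:
// a process-wide singleton that maps a shape/attribute key to a plan and
// only invokes the creation callback on a cache miss.
class PlanCache {
 public:
  static PlanCache &Instance() {
    static PlanCache instance;
    return instance;
  }

  FakePlan *GetPlan(const std::vector<int64_t> &dims,
                    const std::function<FakePlan *()> &creator) {
    std::string key;
    for (int64_t d : dims) key += std::to_string(d) + "_";

    std::lock_guard<std::mutex> guard(mutex_);
    auto iter = plans_.find(key);
    if (iter != plans_.end()) {
      return iter->second.get();  // cache hit: reuse the existing plan
    }
    FakePlan *plan = creator();  // cache miss: build and remember the plan
    plans_.emplace(key, std::unique_ptr<FakePlan>(plan));
    return plan;
  }

 private:
  PlanCache() = default;
  std::mutex mutex_;
  std::map<std::string, std::unique_ptr<FakePlan>> plans_;
};

int main() {
  auto creator = [] { return new FakePlan("conv_3x3_nhwc"); };
  FakePlan *first = PlanCache::Instance().GetPlan({8, 56, 56, 64}, creator);
  FakePlan *second = PlanCache::Instance().GetPlan({8, 56, 56, 64}, creator);
  std::printf("same plan reused: %s\n", first == second ? "yes" : "no");
  return 0;
}
```

Caching by shape, stride, padding, dilation, and data type is what lets the memoized `GetWorkspaceSizeInBytes` above call `cudnnMakeFusedOpsPlan` only once per distinct configuration instead of once per operator instance.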
@@ -15,125 +15,320 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h"
+#include "paddle/fluid/platform/cudnn_desc.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
 
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 namespace dynload = platform::dynload;
+template <typename T>
+using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 
 #if CUDNN_VERSION >= 8000
+
+static size_t RoundUp(int64_t a, int64_t b) { return (a + b - 1) / b * b; }
+
 template <typename T>
-class CudnnNormConvolutionOp {
- public:
-  CudnnNormConvolutionOp()
-      : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS) {}
-  ~CudnnNormConvolutionOp() {}
-
-  void Init(const platform::CUDADeviceContext &ctx,
-            const std::vector<int> &input_shape,
-            const std::vector<int> &filter_shape,
-            const std::vector<int> &output_shape, const int &pad,
-            const int &stride, const int &dilate, const int &group) {
-    cudnn_fwd_compute_type_ = platform::CudnnDataType<float>::type;
-    dtype_ = platform::CudnnDataType<T>::type;
-    format_ = CUDNN_TENSOR_NHWC;
-
-    InitDescriptors(ctx, input_shape, filter_shape, output_shape, pad, stride,
-                    dilate, group);
-    GetWorkspaceSize(ctx);
-  }
-
-  void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr,
-               T *filter_ptr, T *output_ptr, float *sum_ptr,
-               float *sum_of_squares_ptr) {
-    auto handle = ctx.cudnn_handle();
-    auto workspace_handle = ctx.cudnn_workspace_handle();
-    // Set variant_param
-    // input ptr
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr);
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr);
-    fwd_op_.SetOpVariantParamAttrPtr(
-        CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_);
-    // output ptr
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr);
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr);
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr);
-    workspace_handle.RunFunc(
-        [&](void *workspace_ptr) {
-          // workspace ptr
-          fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
-          // fused op execute
-          fwd_op_.Execute(handle);
-        },
-        fwd_workspace_byte_);
-  }
-
-  // TBD
-  void Backward(const platform::CUDADeviceContext &ctx) {}
-
- private:
-  void InitDescriptors(const platform::CUDADeviceContext &ctx,
-                       const std::vector<int> &input_shape,
-                       const std::vector<int> &filter_shape,
-                       const std::vector<int> &output_shape, const int &pad,
-                       const int &stride, const int &dilate, const int &group) {
-    // Set constant_param
-    fwd_op_.SetOpConstParamAttr(
-        {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER,
-         CUDNN_PARAM_YDATA_PLACEHOLDER},
-        CUDNN_PTR_16B_ALIGNED);
-    fwd_op_.SetOpConstParamAttr(
-        {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER},
-        CUDNN_PTR_16B_ALIGNED);
-
-    std::vector<int> pad_vec = {pad, pad};
-    std::vector<int> stride_vec = {stride, stride};
-    std::vector<int> dilate_vec = {dilate, dilate};
-    int output_channel = filter_shape[0];
-    std::vector<int> stats_shape = {1, 1, 1, output_channel};
-
-    // set conv desc
-    conv_desc_.set(dtype_, pad_vec, stride_vec, dilate_vec, false, group);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, conv_desc_.desc());
-
-    // set input desc
-    in_desc_.set(input_shape, format_, dtype_);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, in_desc_.desc());
-
-    // set filter desc
-    filter_desc_.set(filter_shape, format_, dtype_, group);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_WDESC, filter_desc_.desc());
-
-    // set output desc
-    out_desc_.set(output_shape, format_, dtype_);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, out_desc_.desc());
-
-    // set output_stats desc
-    out_stats_desc_.set(stats_shape, format_, cudnn_fwd_compute_type_);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC,
-                                out_stats_desc_.desc());
-
-    fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, CUDNN_BATCHNORM_SPATIAL);
-  }
-
-  void GetWorkspaceSize(const platform::CUDADeviceContext &ctx) {
-    auto handle = ctx.cudnn_handle();
-    fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle);
-  }
-
-  size_t fwd_workspace_byte_ = 0;
-
-  cudnnDataType_t dtype_;
-  cudnnDataType_t cudnn_fwd_compute_type_;
-  platform::TensorDescriptor in_desc_;
-  platform::FilterDescriptor filter_desc_;
-  platform::TensorDescriptor out_desc_;
-  platform::TensorDescriptor out_stats_desc_;
-  platform::ConvolutionDescriptor conv_desc_;
-  cudnnTensorFormat_t format_;
-
-  CudnnFusionOp fwd_op_;
-};
+struct NormConvolutionArgs {
+  NormConvolutionArgs() {
+    dtype = platform::CudnnDataType<T>::type;
+    format = CUDNN_TENSOR_NHWC;
+    compute_type = platform::CudnnDataType<float>::type;
+  }
+
+  void Set(const std::vector<int> &input_shape,
+           const std::vector<int> &filter_shape,
+           const std::vector<int> &output_shape, int padding, int stride,
+           int dilation, int group) {
+    PADDLE_ENFORCE_EQ(
+        input_shape.size(), 4U,
+        platform::errors::InvalidArgument(
+            "The size of input_shape is expected to 4. But recieved "
+            "input_shape's size is %d, input_shape is [%s].",
+            input_shape.size(), framework::make_ddim(input_shape)));
+    PADDLE_ENFORCE_EQ(
+        filter_shape.size(), 4U,
+        platform::errors::InvalidArgument(
+            "The size of filter_shape is expected to 4. But recieved "
+            "filter_shape's size is %d, filter_shape is [%s].",
+            filter_shape.size(), framework::make_ddim(filter_shape)));
+    PADDLE_ENFORCE_EQ(filter_shape[1] == filter_shape[2] &&
+                          (filter_shape[1] == 1 || filter_shape[1] == 3),
+                      true,
+                      platform::errors::InvalidArgument(
+                          "The filter_shape is expected to store as nhwc, and "
+                          "h = w = 1 or 3. But recieved filter_shape is [%s].",
+                          framework::make_ddim(filter_shape)));
+    PADDLE_ENFORCE_EQ(
+        output_shape.size(), 4U,
+        platform::errors::InvalidArgument(
+            "The size of output_shape is expected to 4. But recieved "
+            "filter_shape's size is %d, filter_shape is [%s].",
+            output_shape.size(), framework::make_ddim(output_shape)));
+
+    for (size_t i = 0; i < input_shape.size(); ++i) {
+      in_dims.push_back(input_shape[i]);
+    }
+    for (size_t i = 0; i < filter_shape.size(); ++i) {
+      filter_dims.push_back(filter_shape[i]);
+    }
+    paddings = {padding, padding};
+    strides = {stride, stride};
+    dilations = {dilation, dilation};
+
+    in_desc.set(input_shape, format, dtype);
+    filter_desc.set(filter_shape, format, dtype, group);
+    out_desc.set(output_shape, format, dtype);
+
+    int output_channel = filter_shape[0];
+    std::vector<int> stats_shape = {1, 1, 1, output_channel};
+    out_stats_desc.set(stats_shape, format, compute_type);
+
+    conv_desc.set(dtype, paddings, strides, dilations, false, group);
+  }
+
+  cudnnDataType_t dtype;
+  cudnnTensorFormat_t format;
+  cudnnDataType_t compute_type;
+
+  std::vector<int64_t> in_dims;
+  std::vector<int64_t> filter_dims;
+  std::vector<int> strides;
+  std::vector<int> paddings;
+  std::vector<int> dilations;
+
+  platform::TensorDescriptor in_desc;
+  platform::FilterDescriptor filter_desc;
+  platform::TensorDescriptor out_desc;
+  platform::TensorDescriptor out_stats_desc;
+  platform::ConvolutionDescriptor conv_desc;
+};
+
+template <typename T>
+class CudnnNormConvolution {
+ public:
+  CudnnNormConvolution(const platform::CUDADeviceContext &ctx,
+                       const std::vector<int> &input_shape,
+                       const std::vector<int> &filter_shape,
+                       const std::vector<int> &output_shape, const int &padding,
+                       const int &stride, const int &dilation,
+                       const int &group) {
+    args_.Set(input_shape, filter_shape, output_shape, padding, stride,
+              dilation, group);
+  }
+  ~CudnnNormConvolution() {}
+
+  void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr,
+               T *filter_ptr, T *output_ptr, float *sum_ptr,
+               float *sum_of_squares_ptr) {
+    auto cudnn_handle = ctx.cudnn_handle();
+
+    CudnnFusionOp *fwd_op = GetForwardOp(ctx);
+    size_t workspace_size = RoundUp(
+        static_cast<int64_t>(fwd_op->GetWorkspaceSizeInBytes(cudnn_handle)),
+        512);
+
+    // Set variant_param
+    // input ptr
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr);
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr);
+    fwd_op->SetOpVariantParamAttrPtr(
+        CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size);
+    // output ptr
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr);
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr);
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr);
+
+    ctx.cudnn_workspace_handle().RunFunc(
+        [&](void *workspace_ptr) {
+          // workspace ptr
+          fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
+          // fused op execute
+          fwd_op->Execute(cudnn_handle);
+        },
+        workspace_size);
+  }
+
+ private:
+  CudnnFusionOp *GetForwardOp(const platform::CUDADeviceContext &ctx) {
+    framework::AlgorithmsCache<CudnnFusionOp *> &cache =
+        *(CudnnFusionOpCache::Instance().GetForward());
+
+    CudnnFusionOp *fwd_op = cache.GetAlgorithm(
+        args_.in_dims, args_.filter_dims, args_.strides, args_.paddings,
+        args_.dilations, 0, static_cast<int64_t>(args_.dtype), [&]() {
+          CudnnFusionOp *fwd_op =
+              new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS);
+
+          // Set constant_param
+          fwd_op->SetOpConstParamAttr(
+              {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER,
+               CUDNN_PARAM_YDATA_PLACEHOLDER},
+              CUDNN_PTR_16B_ALIGNED);
+          fwd_op->SetOpConstParamAttr(
+              {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER},
+              CUDNN_PTR_16B_ALIGNED);
+
+          // conv desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC,
+                                      args_.conv_desc.desc());
+          // input desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc());
+          // filter desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_WDESC,
+                                      args_.filter_desc.desc());
+          // output desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YDESC, args_.out_desc.desc());
+          // output_stats desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC,
+                                      args_.out_stats_desc.desc());
+          // batch_norm mode
+          fwd_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE,
+                                      CUDNN_BATCHNORM_SPATIAL_PERSISTENT);
+
+          // Make cudnn fused ops plan
+          fwd_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle());
+          return fwd_op;
+        });
+    return fwd_op;
+  }
+
+ private:
+  NormConvolutionArgs<T> args_;
+};
+
+template <typename T>
+class CudnnNormConvolutionGrad {
+ public:
+  CudnnNormConvolutionGrad(const platform::CUDADeviceContext &ctx,
+                           const std::vector<int> &input_shape,
+                           const std::vector<int> &filter_shape,
+                           const std::vector<int> &output_shape,
+                           const int &padding, const int &stride,
+                           const int &dilation, const int &group) {
+    args_.Set(input_shape, filter_shape, output_shape, padding, stride,
+              dilation, group);
+    dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+  }
+  ~CudnnNormConvolutionGrad() {}
+
+  void Backward(const platform::CUDADeviceContext &ctx, T *input_ptr,
+                T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr,
+                T *filter_grad_ptr, bool use_addto = false) {
+    if (filter_grad_ptr) {
+      BackwardFilter(ctx, input_ptr, output_grad_ptr, filter_ptr,
+                     filter_grad_ptr);
+    }
+    if (input_grad_ptr) {
+      BackwardData(ctx, input_ptr, output_grad_ptr, filter_ptr, input_grad_ptr,
+                   use_addto);
+    }
+  }
+
+ private:
+  void BackwardFilter(const platform::CUDADeviceContext &ctx, T *input_ptr,
+                      T *output_grad_ptr, T *filter_ptr, T *filter_grad_ptr) {
+    auto cudnn_handle = ctx.cudnn_handle();
+
+    CudnnFusionOp *wgrad_op = GetBackwardFilterOp(ctx);
+    size_t workspace_size = RoundUp(
+        static_cast<int64_t>(wgrad_op->GetWorkspaceSizeInBytes(cudnn_handle)),
+        512);
+
+    wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr);
+    wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, output_grad_ptr);
+    wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DWDATA, filter_grad_ptr);
+    wgrad_op->SetOpVariantParamAttrPtr(
+        CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size);
+
+    ctx.cudnn_workspace_handle().RunFunc(
+        [&](void *workspace_ptr) {
+          // workspace ptr
+          wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE,
+                                             workspace_ptr);
+          // fused op execute
+          wgrad_op->Execute(cudnn_handle);
+        },
+        workspace_size);
+  }
+
+  void BackwardData(const platform::CUDADeviceContext &ctx, T *input_ptr,
+                    T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr,
+                    bool use_addto = false) {
+    auto cudnn_handle = ctx.cudnn_handle();
+    size_t workspace_size = GetWorkspaceSizeBwdData(ctx);
+
+    // Convolution dgrad followed optionally by batchnorm dgrad
+    ScalingParamType<T> alpha = 1.0f;
+    ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
+    ctx.cudnn_workspace_handle().RunFunc(
+        [&](void *cudnn_workspace_ptr) {
+          PADDLE_ENFORCE_CUDA_SUCCESS(
+              platform::dynload::cudnnConvolutionBackwardData(
+                  cudnn_handle, &alpha, args_.filter_desc.desc(), filter_ptr,
+                  args_.out_desc.desc(), output_grad_ptr,
+                  args_.conv_desc.desc(), dgrad_algo_, cudnn_workspace_ptr,
+                  workspace_size, &beta, args_.in_desc.desc(), input_grad_ptr));
+        },
+        workspace_size);
+  }
+
+  CudnnFusionOp *GetBackwardFilterOp(const platform::CUDADeviceContext &ctx) {
+    framework::AlgorithmsCache<CudnnFusionOp *> &cache =
+        *(CudnnFusionOpCache::Instance().GetBackward());
+
+    CudnnFusionOp *wgrad_op = cache.GetAlgorithm(
+        args_.in_dims, args_.filter_dims, args_.strides, args_.paddings,
+        args_.dilations, 0, static_cast<int64_t>(args_.dtype), [&]() {
+          CudnnFusionOp *wgrad_op =
+              new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD);
+
+          wgrad_op->SetOpConstParamAttr(
+              {CUDNN_PARAM_DYDATA_PLACEHOLDER, CUDNN_PARAM_XDATA_PLACEHOLDER,
+               CUDNN_PARAM_DWDATA_PLACEHOLDER},
+              CUDNN_PTR_16B_ALIGNED);
+
+          // conv desc
+          wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC,
+                                        args_.conv_desc.desc());
+          // input desc
+          wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC,
+                                        args_.in_desc.desc());
+          // filter desc
+          wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DWDESC,
+                                        args_.filter_desc.desc());
+          // output desc
+          wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DYDESC,
+                                        args_.out_desc.desc());
+          wgrad_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE,
+                                        CUDNN_BATCHNORM_SPATIAL_PERSISTENT);
+
+          // Make cudnn fused ops plan
+          wgrad_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle());
+          return wgrad_op;
+        });
+    return wgrad_op;
+  }
+
+  size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) {
+    size_t workspace_size = 0U;
+    auto handle = ctx.cudnn_handle();
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+            handle, args_.filter_desc.desc(), args_.out_desc.desc(),
+            args_.conv_desc.desc(), args_.in_desc.desc(), dgrad_algo_,
+            &workspace_size));
+    return RoundUp(workspace_size, 512);
+  }
+
+ private:
+  NormConvolutionArgs<T> args_;
+  cudnnConvolutionBwdDataAlgo_t dgrad_algo_;
+};
+
 #endif
 }  // namespace operators
 }  // namespace paddle
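As a usage illustration (not code from this commit), the new classes are driven roughly as follows. The shapes, the pointer names (`x_ptr`, `w_ptr`, `y_ptr`, `dy_ptr`, `dx_ptr`, `dw_ptr`, `sum_ptr`, `sum_of_squares_ptr`), and the `float16` element type are assumptions for the sketch; `ctx` is the `platform::CUDADeviceContext` provided by the enclosing fused op, and all buffers are NHWC device memory allocated by the caller.

```cpp
// Hypothetical caller inside a Paddle fused op; all tensor shapes and pointer
// names below are illustrative, not taken from this commit.
std::vector<int> input_shape = {8, 56, 56, 64};    // N, H, W, C
std::vector<int> filter_shape = {256, 3, 3, 64};   // K, R, S, C (k-h-w-c)
std::vector<int> output_shape = {8, 56, 56, 256};  // N, H, W, K
int padding = 1, stride = 1, dilation = 1, group = 1;

// Forward: fused convolution that also produces the per-channel sum and
// sum-of-squares statistics consumed by the following batch norm.
paddle::operators::CudnnNormConvolution<paddle::platform::float16> conv(
    ctx, input_shape, filter_shape, output_shape, padding, stride, dilation,
    group);
conv.Forward(ctx, x_ptr, w_ptr, y_ptr, sum_ptr, sum_of_squares_ptr);

// Backward: dW goes through the fused CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD
// plan, dX through cudnnConvolutionBackwardData; either gradient can be
// skipped by passing nullptr for the corresponding output pointer.
paddle::operators::CudnnNormConvolutionGrad<paddle::platform::float16>
    conv_grad(ctx, input_shape, filter_shape, output_shape, padding, stride,
              dilation, group);
conv_grad.Backward(ctx, x_ptr, dy_ptr, w_ptr, dx_ptr, dw_ptr,
                   /*use_addto=*/false);
```

Both classes key their cuDNN fused-op plans through `CudnnFusionOpCache`, so repeated construction with the same shapes, strides, paddings, dilations, and data type reuses an existing plan rather than rebuilding it.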