Unverified commit a679fcbb, authored by Zhang Zheng, committed by GitHub

Add more tests and fix bugs for cudnn_norm_conv_test and cudnn_bn_and_relu_test (#36314)

Parent: 830debc2
@@ -33,6 +33,8 @@ namespace op = paddle::operators;
using Tensor = paddle::framework::Tensor;
USE_OP(batch_norm);
USE_CUDA_ONLY_OP(fused_bn_add_activation);
USE_CUDA_ONLY_OP(fused_bn_add_activation_grad);
template <typename T>
void InitRandomTensor(const std::vector<int64_t> &dims,
@@ -40,7 +42,7 @@ void InitRandomTensor(const std::vector<int64_t> &dims,
T *cpu_out_ptr = cpu_out->mutable_data<T>(framework::make_ddim(dims),
platform::CPUPlace());
std::default_random_engine random(0);
std::uniform_real_distribution<float> dis(-1.0, 1.0);
for (int i = 0; i < cpu_out->numel(); ++i) {
cpu_out_ptr[i] = static_cast<T>(dis(random));
}
@@ -89,7 +91,7 @@ void CheckOutput(std::string name, const framework::Tensor &cpu_res,
}
}
std::string error_type = is_relative_atol ? "relative" : "absolute";
LOG(INFO) << "[" << name << "] The dims is [" << cpu_res.dims()
<< "], maximum " << error_type << " error is " << max_diff << ": "
<< cpu_res_ptr[index] << " vs " << cpu_base_ptr[index];
}
@@ -121,13 +123,33 @@ void ComputeSumAndSquareSum(const framework::Tensor &cpu_x,
}
}
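// Element-wise CPU reference: accumulates cpu_x into cpu_y (used to build the add / add+relu baseline).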
template <typename T>
void ComputeInplaceAdd(const framework::Tensor &cpu_x,
framework::Tensor *cpu_y) {
EXPECT_EQ(cpu_x.dims(), cpu_y->dims());
const T *cpu_x_ptr = cpu_x.data<T>();
T *cpu_y_ptr = cpu_y->data<T>();
for (int64_t i = 0; i < cpu_x.numel(); ++i) {
cpu_y_ptr[i] += cpu_x_ptr[i];
}
}
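// Element-wise CPU reference: applies ReLU to cpu_x in place.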
template <typename T>
void ComputeInplaceRelu(framework::Tensor *cpu_x) {
T *cpu_x_ptr = cpu_x->data<T>();
for (int64_t i = 0; i < cpu_x->numel(); ++i) {
cpu_x_ptr[i] =
cpu_x_ptr[i] > static_cast<T>(0) ? cpu_x_ptr[i] : static_cast<T>(0);
}
}
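// Get paddle batch_norm op results as baseline.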
void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx,
const Tensor &cpu_x, const Tensor &cpu_scale,
const Tensor &cpu_bias, Tensor *cpu_mean,
Tensor *cpu_var, Tensor *cpu_saved_mean,
Tensor *cpu_saved_var, Tensor *cpu_y,
Tensor *saved_reserve_space) {
framework::Scope scope;
auto *x = scope.Var("X")->GetMutable<framework::LoDTensor>();
auto *scale = scope.Var("Scale")->GetMutable<framework::LoDTensor>();
@@ -178,68 +200,258 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx,
TensorCopySync(*var, platform::CPUPlace(), cpu_var);
TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean);
TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var);
// reserve_space stays on the GPU and will be used in the grad op.
saved_reserve_space->ShareDataWith(*reserve_space);
}
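// Get paddle fused_bn_add_activation op results as baseline.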
void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx,
const Tensor &cpu_x, const Tensor &cpu_z,
const Tensor &cpu_scale,
const Tensor &cpu_bias, Tensor *cpu_mean,
Tensor *cpu_var, Tensor *cpu_saved_mean,
Tensor *cpu_saved_var, Tensor *cpu_y,
Tensor *saved_reserve_space) {
framework::Scope scope;
auto *x = scope.Var("X")->GetMutable<framework::LoDTensor>();
auto *z = scope.Var("Z")->GetMutable<framework::LoDTensor>();
auto *scale = scope.Var("Scale")->GetMutable<framework::LoDTensor>();
auto *bias = scope.Var("Bias")->GetMutable<framework::LoDTensor>();
auto *mean = scope.Var("Mean")->GetMutable<framework::LoDTensor>();
auto *var = scope.Var("Variance")->GetMutable<framework::LoDTensor>();
auto *y = scope.Var("Y")->GetMutable<framework::LoDTensor>();
auto *saved_mean = scope.Var("SavedMean")->GetMutable<framework::LoDTensor>();
auto *saved_var =
scope.Var("SavedVariance")->GetMutable<framework::LoDTensor>();
auto *reserve_space =
scope.Var("ReserveSpace")->GetMutable<framework::LoDTensor>();
auto place = ctx.GetPlace();
TensorCopySync(cpu_x, place, x);
TensorCopySync(cpu_z, place, z);
TensorCopySync(cpu_scale, place, scale);
TensorCopySync(cpu_bias, place, bias);
TensorCopySync(*cpu_mean, place, mean);
TensorCopySync(*cpu_var, place, var);
int64_t channels = x->dims()[3];
scale->Resize({channels});
bias->Resize({channels});
mean->Resize({channels});
var->Resize({channels});
framework::AttributeMap attrs;
auto op = framework::OpRegistry::CreateOp(
"fused_bn_add_activation",
{{"X", {"X"}}, {"Z", {"Z"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}},
{{"Y", {"Y"}},
{"MeanOut", {"Mean"}},
{"VarianceOut", {"Variance"}},
{"SavedMean", {"SavedMean"}},
{"SavedVariance", {"SavedVariance"}},
{"ReserveSpace", {"ReserveSpace"}}},
attrs);
op->Run(scope, ctx.GetPlace());
TensorCopySync(*y, platform::CPUPlace(), cpu_y);
TensorCopySync(*mean, platform::CPUPlace(), cpu_mean);
TensorCopySync(*var, platform::CPUPlace(), cpu_var);
TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean);
TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var);
// reserve_space stays on the GPU and will be used in the grad op.
saved_reserve_space->ShareDataWith(*reserve_space);
}
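// Get paddle fused_bn_add_activation_grad op results as baseline.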
void ComputeFusedBNAddReluBackward(
const platform::CUDADeviceContext &ctx, const Tensor &cpu_dy,
const Tensor &cpu_x, const Tensor &cpu_scale, const Tensor &cpu_bias,
const Tensor &cpu_saved_mean, const Tensor &cpu_saved_var,
const Tensor &cpu_y, const Tensor &saved_reserve_space, Tensor *cpu_dx,
Tensor *cpu_dz, Tensor *cpu_dscale, Tensor *cpu_dbias) {
framework::Scope scope;
auto *x = scope.Var("X")->GetMutable<framework::LoDTensor>();
auto *y = scope.Var("Y")->GetMutable<framework::LoDTensor>();
auto *dy = scope.Var("Y@GRAD")->GetMutable<framework::LoDTensor>();
auto *scale = scope.Var("Scale")->GetMutable<framework::LoDTensor>();
auto *bias = scope.Var("Bias")->GetMutable<framework::LoDTensor>();
auto *saved_mean = scope.Var("SavedMean")->GetMutable<framework::LoDTensor>();
auto *saved_var =
scope.Var("SavedVariance")->GetMutable<framework::LoDTensor>();
auto *reserve_space =
scope.Var("ReserveSpace")->GetMutable<framework::LoDTensor>();
auto *dx = scope.Var("X@GRAD")->GetMutable<framework::LoDTensor>();
auto *dz = scope.Var("Z@GRAD")->GetMutable<framework::LoDTensor>();
auto *dscale = scope.Var("Scale@GRAD")->GetMutable<framework::LoDTensor>();
auto *dbias = scope.Var("Bias@GRAD")->GetMutable<framework::LoDTensor>();
auto place = ctx.GetPlace();
TensorCopySync(cpu_x, place, x);
TensorCopySync(cpu_y, place, y);
TensorCopySync(cpu_dy, place, dy);
TensorCopySync(cpu_scale, place, scale);
TensorCopySync(cpu_bias, place, bias);
TensorCopySync(cpu_saved_mean, place, saved_mean);
TensorCopySync(cpu_saved_var, place, saved_var);
reserve_space->ShareDataWith(saved_reserve_space);
int64_t channels = x->dims()[3];
scale->Resize({channels});
bias->Resize({channels});
saved_mean->Resize({channels});
saved_var->Resize({channels});
framework::AttributeMap attrs;
float momentum = 0.9;
float epsilon = 1e-5;
std::string act_type = "relu";
attrs.insert({"momentum", momentum});
attrs.insert({"epsilon", epsilon});
attrs.insert({"act_type", act_type});
auto op = framework::OpRegistry::CreateOp(
"fused_bn_add_activation_grad", {{"X", {"X"}},
{"Y", {"Y"}},
{"Y@GRAD", {"Y@GRAD"}},
{"Scale", {"Scale"}},
{"Bias", {"Bias"}},
{"SavedMean", {"SavedMean"}},
{"SavedVariance", {"SavedVariance"}},
{"ReserveSpace", {"ReserveSpace"}}},
{{"X@GRAD", {"X@GRAD"}},
{"Z@GRAD", {"Z@GRAD"}},
{"Scale@GRAD", {"Scale@GRAD"}},
{"Bias@GRAD", {"Bias@GRAD"}}},
attrs);
op->Run(scope, ctx.GetPlace());
TensorCopySync(*dx, platform::CPUPlace(), cpu_dx);
TensorCopySync(*dz, platform::CPUPlace(), cpu_dz);
TensorCopySync(*dscale, platform::CPUPlace(), cpu_dscale);
TensorCopySync(*dbias, platform::CPUPlace(), cpu_dbias);
}
template <typename T>
class CudnnBNAddReluTester {
public:
CudnnBNAddReluTester(int batch_size, int height, int width, int channels,
std::string act_type, bool fuse_add, bool has_shortcut) {
batch_size_ = batch_size;
height_ = height;
width_ = width;
channels_ = channels;
ele_count_ = batch_size_ * height_ * width_;
act_type_ = act_type;
fuse_add_ = fuse_add;
has_shortcut_ = has_shortcut;
SetUp();
}
~CudnnBNAddReluTester() {}
void CheckForward(float diff, bool is_relative_atol = false) {
LOG(INFO) << "[CheckForward, diff=" << diff
<< ", is_relative_atol=" << is_relative_atol
<< "] act_type=" << act_type_ << ", fuse_add=" << fuse_add_
<< ", has_shortcut=" << has_shortcut_;
platform::CUDADeviceContext *ctx =
static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(
platform::CUDAPlace(0)));
auto select = [&](Tensor *in) { return has_shortcut_ ? in : nullptr; };
framework::Tensor cpu_mean_base_x;
framework::Tensor cpu_var_base_x;
framework::Tensor cpu_mean_base_z;
framework::Tensor cpu_var_base_z;
if (!has_shortcut_ && fuse_add_ && (act_type_ == "relu")) {
BaselineForwardFusedBNAddRelu(
*ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_,
&cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_);
} else {
BaselineForward(
*ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_,
&cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_,
select(&cpu_mean_base_z), select(&cpu_var_base_z),
select(&cpu_saved_mean_base_z_), select(&cpu_saved_var_base_z_),
select(&saved_reserve_space_z_));
}
framework::Tensor cpu_mean_x;
framework::Tensor cpu_var_x;
framework::Tensor cpu_y;
framework::Tensor cpu_mean_z;
framework::Tensor cpu_var_z;
FusedForward(*ctx, &cpu_mean_x, &cpu_var_x, &cpu_saved_mean_x_,
&cpu_saved_var_x_, &cpu_y, &cpu_bitmask_, select(&cpu_mean_z),
select(&cpu_var_z), select(&cpu_saved_mean_z_),
select(&cpu_saved_var_z_));
CheckOutput<float>("Mean", cpu_mean_x, cpu_mean_base_x, diff,
is_relative_atol);
CheckOutput<float>("Variance", cpu_var_x, cpu_var_base_x, diff,
is_relative_atol);
CheckOutput<float>("SavedMean", cpu_saved_mean_x_, cpu_saved_mean_base_x_,
diff, is_relative_atol);
CheckOutput<float>("SavedVariance", cpu_saved_var_x_, cpu_saved_var_base_x_,
diff, is_relative_atol);
if (has_shortcut_) {
CheckOutput<float>("MeanZ", cpu_mean_z, cpu_mean_base_z, diff,
is_relative_atol);
CheckOutput<float>("VarianceZ", cpu_var_z, cpu_var_base_z, diff,
is_relative_atol);
CheckOutput<float>("SavedMeanZ", cpu_saved_mean_z_,
cpu_saved_mean_base_z_, diff, is_relative_atol);
CheckOutput<float>("SavedVarianceZ", cpu_saved_var_z_,
cpu_saved_var_base_z_, diff, is_relative_atol);
}
CheckOutput<T>("Y", cpu_y, cpu_y_base_, diff, is_relative_atol);
}
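// Compare the backward results of CudnnScaleBiasAddRelu against the
// fused_bn_add_activation_grad baseline.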
void CheckBackward(float diff, bool is_relative_atol = false) {
platform::CUDADeviceContext *ctx =
static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(
platform::CUDAPlace(0)));
framework::Tensor cpu_dx_base;
framework::Tensor cpu_dz_base;
framework::Tensor cpu_dscale_base;
framework::Tensor cpu_dbias_base;
BaselineBackwardFusedBNAddRelu(*ctx, &cpu_dx_base, &cpu_dz_base,
&cpu_dscale_base, &cpu_dbias_base);
framework::Tensor cpu_dx;
framework::Tensor cpu_dz;
framework::Tensor cpu_dscale;
framework::Tensor cpu_dbias;
FusedBackward(*ctx, &cpu_dx, &cpu_dz, &cpu_dscale, &cpu_dbias);
CheckOutput<T>("DX", cpu_dx, cpu_dx_base, diff, is_relative_atol);
CheckOutput<T>("DZ", cpu_dz, cpu_dz_base, diff, is_relative_atol);
CheckOutput<float>("DScale", cpu_dscale, cpu_dscale_base, diff,
is_relative_atol);
CheckOutput<float>("DBias", cpu_dbias, cpu_dbias_base, diff,
is_relative_atol);
}
private:
void SetUp() {
// Initialize input data
InitRandomTensor<T>({batch_size_, height_, width_, channels_}, &cpu_x_);
InitRandomTensor<float>({channels_}, &cpu_bn_scale_x_);
InitRandomTensor<float>({channels_}, &cpu_bn_bias_x_);
if (has_shortcut_) {
InitRandomTensor<T>({batch_size_, height_, width_, channels_}, &cpu_z_);
InitRandomTensor<float>({channels_}, &cpu_bn_scale_z_);
InitRandomTensor<float>({channels_}, &cpu_bn_bias_z_);
} else {
if (fuse_add_) {
InitRandomTensor<T>({batch_size_, height_, width_, channels_}, &cpu_z_);
}
}
InitRandomTensor<T>({batch_size_, height_, width_, channels_}, &cpu_dy_);
}
void InitMeanVar(Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean,
@@ -252,71 +464,178 @@ class CudnnBNAddReluTester {
cpu_saved_var);
}
void BaselineForward(const platform::CUDADeviceContext &ctx,
Tensor *cpu_mean_x, Tensor *cpu_var_x,
Tensor *cpu_saved_mean_x, Tensor *cpu_saved_var_x,
Tensor *cpu_y, Tensor *saved_reserve_space_x,
Tensor *cpu_mean_z = nullptr,
Tensor *cpu_var_z = nullptr,
Tensor *cpu_saved_mean_z = nullptr,
Tensor *cpu_saved_var_z = nullptr,
Tensor *saved_reserve_space_z = nullptr) {
InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x);
ComputeBatchNormForward(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_,
cpu_mean_x, cpu_var_x, cpu_saved_mean_x,
cpu_saved_var_x, cpu_y, saved_reserve_space_x);
if (has_shortcut_) {
framework::Tensor cpu_z_out;
InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z);
ComputeBatchNormForward(
ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_, cpu_mean_z, cpu_var_z,
cpu_saved_mean_z, cpu_saved_var_z, &cpu_z_out, saved_reserve_space_z);
ComputeInplaceAdd<T>(cpu_z_out, cpu_y);
} else {
if (fuse_add_) {
ComputeInplaceAdd<T>(cpu_z_, cpu_y);
}
}
if (act_type_ == "relu") {
ComputeInplaceRelu<T>(cpu_y);
}
}
void BaselineForwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx,
Tensor *cpu_mean, Tensor *cpu_var,
Tensor *cpu_saved_mean,
Tensor *cpu_saved_var, Tensor *cpu_y,
Tensor *saved_reserve_space) {
InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var);
ComputeFusedBNAddReluForward(
ctx, cpu_x_, cpu_z_, cpu_bn_scale_x_, cpu_bn_bias_x_, cpu_mean, cpu_var,
cpu_saved_mean, cpu_saved_var, cpu_y, saved_reserve_space);
}
void BaselineBackwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx,
Tensor *cpu_dx, Tensor *cpu_dz,
Tensor *cpu_dscale, Tensor *cpu_dbias) {
ComputeFusedBNAddReluBackward(
ctx, cpu_dy_, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_,
cpu_saved_mean_base_x_, cpu_saved_var_base_x_, cpu_y_base_,
saved_reserve_space_x_, cpu_dx, cpu_dz, cpu_dscale, cpu_dbias);
}
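// Run CudnnBNStatsFinalize: turn the per-channel sum and sum-of-squares into
// mean/variance and the equivalent scale/bias consumed by CudnnScaleBiasAddRelu.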
void ComputeFusedBNStatsFinalize(const platform::CUDADeviceContext &ctx,
const Tensor &cpu_x,
const Tensor &cpu_bn_scale,
const Tensor &cpu_bn_bias, Tensor *sum,
Tensor *sum_of_square, Tensor *bn_scale,
Tensor *bn_bias, Tensor *mean, Tensor *var,
Tensor *saved_mean, Tensor *saved_var,
Tensor *equiv_scale, Tensor *equiv_bias) {
framework::Tensor cpu_sum;
framework::Tensor cpu_sum_of_square;
ComputeSumAndSquareSum<T>(cpu_x, &cpu_sum, &cpu_sum_of_square);
auto place = ctx.GetPlace();
TensorCopySync(cpu_sum, place, sum);
TensorCopySync(cpu_sum_of_square, place, sum_of_square);
TensorCopySync(cpu_bn_scale, place, bn_scale);
TensorCopySync(cpu_bn_bias, place, bn_bias);
bn_scale->Resize({1, 1, 1, channels_});
bn_bias->Resize({1, 1, 1, channels_});
// input
float *sum_ptr = sum->data<float>();
float *sum_of_square_ptr = sum_of_square->data<float>();
float *bn_scale_ptr = bn_scale->data<float>();
float *bn_bias_ptr = bn_bias->data<float>();
mean->Resize({1, 1, 1, channels_});
var->Resize({1, 1, 1, channels_});
// output
float *mean_ptr = mean->data<float>();
float *var_ptr = var->data<float>();
float *saved_mean_ptr =
saved_mean->mutable_data<float>({1, 1, 1, channels_}, place);
float *saved_var_ptr =
saved_var->mutable_data<float>({1, 1, 1, channels_}, place);
T *equiv_scale_ptr =
equiv_scale->mutable_data<T>({1, 1, 1, channels_}, place);
T *equiv_bias_ptr =
equiv_bias->mutable_data<T>({1, 1, 1, channels_}, place);
auto param_shape = framework::vectorize<int>(bn_scale->dims());
op::CudnnBNStatsFinalize<T> bn_op(ctx, param_shape);
bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr,
saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr,
equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_,
true);
} }
// Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu
void FusedForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean_x,
Tensor *cpu_var_x, Tensor *cpu_saved_mean_x,
Tensor *cpu_saved_var_x, Tensor *cpu_y, Tensor *cpu_bitmask,
Tensor *cpu_mean_z = nullptr, Tensor *cpu_var_z = nullptr,
Tensor *cpu_saved_mean_z = nullptr,
Tensor *cpu_saved_var_z = nullptr) {
framework::Tensor x;
framework::Tensor sum_x;
framework::Tensor sum_of_square_x;
framework::Tensor bn_scale_x;
framework::Tensor bn_bias_x;
framework::Tensor z;
framework::Tensor sum_z;
framework::Tensor sum_of_square_z;
framework::Tensor bn_scale_z;
framework::Tensor bn_bias_z;
auto place = ctx.GetPlace();
TensorCopySync(cpu_x_, place, &x);
if (fuse_add_ || has_shortcut_) {
TensorCopySync(cpu_z_, place, &z);
}
framework::Tensor mean_x;
framework::Tensor var_x;
framework::Tensor saved_mean_x;
framework::Tensor saved_var_x;
framework::Tensor equiv_scale_x;
framework::Tensor equiv_bias_x;
framework::Tensor mean_z;
framework::Tensor var_z;
framework::Tensor saved_mean_z;
framework::Tensor saved_var_z;
framework::Tensor equiv_scale_z;
framework::Tensor equiv_bias_z;
framework::Tensor y;
framework::Tensor bitmask;
InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x);
TensorCopySync(*cpu_mean_x, place, &mean_x);
TensorCopySync(*cpu_var_x, place, &var_x);
if (has_shortcut_) {
InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z);
TensorCopySync(*cpu_mean_z, place, &mean_z);
TensorCopySync(*cpu_var_z, place, &var_z);
}
// 1. BN Stats Finalize
ComputeFusedBNStatsFinalize(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_,
&sum_x, &sum_of_square_x, &bn_scale_x,
&bn_bias_x, &mean_x, &var_x, &saved_mean_x,
&saved_var_x, &equiv_scale_x, &equiv_bias_x);
if (has_shortcut_) {
ComputeFusedBNStatsFinalize(ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_,
&sum_z, &sum_of_square_z, &bn_scale_z,
&bn_bias_z, &mean_z, &var_z, &saved_mean_z,
&saved_var_z, &equiv_scale_z, &equiv_bias_z);
}
T *x_ptr = x.data<T>();
T *z_ptr = (fuse_add_ || has_shortcut_) ? z.data<T>() : nullptr;
T *equiv_scale_x_ptr = equiv_scale_x.data<T>();
T *equiv_bias_x_ptr = equiv_bias_x.data<T>();
T *equiv_scale_z_ptr = has_shortcut_ ? equiv_scale_z.data<T>() : nullptr;
T *equiv_bias_z_ptr = has_shortcut_ ? equiv_bias_z.data<T>() : nullptr;
T *y_ptr =
y.mutable_data<T>({batch_size_, height_, width_, channels_}, place);
// bitmask
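// Presumably one bit per element of y: the channel count is rounded up to a
// multiple of 64 and packed into int32 words, hence ((c + 63) & ~63) / 32.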
int c = channels_;
int64_t nhw = ele_count_;
int32_t c_int32_elems = ((c + 63) & ~63) / 32;
@@ -325,31 +644,90 @@ class CudnnBNAddReluTester {
{nhw_int32_elems, c_int32_elems, 1}, place);
auto data_shape = framework::vectorize<int>(x.dims());
auto param_shape = framework::vectorize<int>(bn_scale_x.dims());
auto bitmask_shape = framework::vectorize<int>(bitmask.dims());
// 2. Scale Bias + Relu
op::CudnnScaleBiasAddRelu<T> sbar_op(ctx, act_type_, fuse_add_,
has_shortcut_, data_shape, param_shape,
bitmask_shape);
sbar_op.Forward(ctx, x_ptr, equiv_scale_x_ptr, equiv_bias_x_ptr, y_ptr,
bitmask_ptr, z_ptr, equiv_scale_z_ptr, equiv_bias_z_ptr);
TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x);
TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x);
TensorCopySync(saved_mean_x, platform::CPUPlace(), cpu_saved_mean_x);
TensorCopySync(saved_var_x, platform::CPUPlace(), cpu_saved_var_x);
if (has_shortcut_) {
TensorCopySync(mean_z, platform::CPUPlace(), cpu_mean_z);
TensorCopySync(var_z, platform::CPUPlace(), cpu_var_z);
TensorCopySync(saved_mean_z, platform::CPUPlace(), cpu_saved_mean_z);
TensorCopySync(saved_var_z, platform::CPUPlace(), cpu_saved_var_z);
}
TensorCopySync(y, platform::CPUPlace(), cpu_y);
TensorCopySync(bitmask, platform::CPUPlace(), cpu_bitmask);
}
// Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu
void FusedBackward(const platform::CUDADeviceContext &ctx, Tensor *cpu_dx,
Tensor *cpu_dz, Tensor *cpu_dscale, Tensor *cpu_dbias) {
framework::Tensor dy;
framework::Tensor x;
framework::Tensor bn_scale;
framework::Tensor bn_bias;
framework::Tensor saved_mean;
framework::Tensor saved_var;
framework::Tensor bitmask;
framework::Tensor dx;
framework::Tensor dz;
framework::Tensor dscale;
framework::Tensor dbias;
auto place = ctx.GetPlace();
TensorCopySync(cpu_dy_, place, &dy);
TensorCopySync(cpu_x_, place, &x);
TensorCopySync(cpu_bn_scale_x_, place, &bn_scale);
TensorCopySync(cpu_bn_bias_x_, place, &bn_bias);
TensorCopySync(cpu_saved_mean_x_, place, &saved_mean);
TensorCopySync(cpu_saved_var_x_, place, &saved_var);
TensorCopySync(cpu_bitmask_, place, &bitmask);
bn_scale.Resize({1, 1, 1, channels_});
bn_bias.Resize({1, 1, 1, channels_});
saved_mean.Resize({1, 1, 1, channels_});
saved_var.Resize({1, 1, 1, channels_});
T *dy_ptr = dy.data<T>();
T *x_ptr = x.data<T>();
float *bn_scale_ptr = bn_scale.data<float>();
float *bn_bias_ptr = bn_bias.data<float>();
float *saved_mean_ptr = saved_mean.data<float>();
float *saved_var_ptr = saved_var.data<float>();
int32_t *bitmask_ptr = bitmask.data<int32_t>();
T *dx_ptr =
dx.mutable_data<T>({batch_size_, height_, width_, channels_}, place);
T *dz_ptr =
dz.mutable_data<T>({batch_size_, height_, width_, channels_}, place);
float *dscale_ptr = dscale.mutable_data<float>({1, 1, 1, channels_}, place);
float *dbias_ptr = dbias.mutable_data<float>({1, 1, 1, channels_}, place);
auto data_shape = framework::vectorize<int>(x.dims());
auto param_shape = framework::vectorize<int>(bn_scale.dims());
auto bitmask_shape = framework::vectorize<int>(bitmask.dims());
std::string act_type = "relu";
op::CudnnScaleBiasAddRelu<T> sbar_op(ctx, act_type, true, false, data_shape,
param_shape, bitmask_shape);
sbar_op.Backward(ctx, dy_ptr, x_ptr, bn_scale_ptr, bn_bias_ptr,
saved_mean_ptr, saved_var_ptr, bitmask_ptr, dx_ptr, dz_ptr,
dscale_ptr, dbias_ptr, eps_);
TensorCopySync(dx, platform::CPUPlace(), cpu_dx);
TensorCopySync(dz, platform::CPUPlace(), cpu_dz);
TensorCopySync(dscale, platform::CPUPlace(), cpu_dscale);
TensorCopySync(dbias, platform::CPUPlace(), cpu_dbias);
}
private:
int batch_size_;
int height_;
@@ -357,24 +735,80 @@ class CudnnBNAddReluTester {
int channels_;
int ele_count_;
std::string act_type_;
bool fuse_add_;
bool has_shortcut_;
// Forward input
framework::Tensor cpu_x_;
framework::Tensor cpu_bn_scale_x_;
framework::Tensor cpu_bn_bias_x_;
framework::Tensor cpu_z_;
framework::Tensor cpu_bn_scale_z_;
framework::Tensor cpu_bn_bias_z_;
// Backward input
framework::Tensor cpu_dy_;
framework::Tensor cpu_bitmask_;
framework::Tensor cpu_saved_mean_x_;
framework::Tensor cpu_saved_var_x_;
framework::Tensor cpu_saved_mean_z_;
framework::Tensor cpu_saved_var_z_;
framework::Tensor cpu_saved_mean_base_x_;
framework::Tensor cpu_saved_var_base_x_;
framework::Tensor saved_reserve_space_x_;
framework::Tensor cpu_saved_mean_base_z_;
framework::Tensor cpu_saved_var_base_z_;
framework::Tensor saved_reserve_space_z_;
framework::Tensor cpu_y_base_;
double eps_ = 1e-5;
float momentum_ = 0.9;
};
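// The fp16 tests below cover three configurations: BN + add without activation,
// BN + add + relu (forward, plus backward when fuse_add is true), and the
// has_shortcut path with two BN inputs.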
TEST(CudnnBNAddReluFp16, BNAdd) {
int batch_size = 4;
int height = 8;
int width = 8;
int channels = 64;
std::string act_type = "";
bool has_shortcut = false;
FLAGS_cudnn_batchnorm_spatial_persistent = true;
for (auto fuse_add : {false, true}) {
CudnnBNAddReluTester<paddle::platform::float16> test(
batch_size, height, width, channels, act_type, fuse_add, has_shortcut);
test.CheckForward(2e-3);
}
}
TEST(CudnnBNAddReluFp16, BNAddRelu) {
int batch_size = 4;
int height = 8;
int width = 8;
int channels = 64;
std::string act_type = "relu";
bool has_shortcut = false;
FLAGS_cudnn_batchnorm_spatial_persistent = true;
for (auto fuse_add : {false, true}) {
CudnnBNAddReluTester<paddle::platform::float16> test(
batch_size, height, width, channels, act_type, fuse_add, has_shortcut);
test.CheckForward(2e-3);
if (fuse_add) {
test.CheckBackward(2e-4);
}
}
}
TEST(CudnnBNAddReluFp16, HasShortcut) {
int batch_size = 4;
int height = 8;
int width = 8;
int channels = 64;
std::string act_type = "";
bool fuse_add = false;
bool has_shortcut = true;
FLAGS_cudnn_batchnorm_spatial_persistent = true;
CudnnBNAddReluTester<paddle::platform::float16> test(
batch_size, height, width, channels, act_type, fuse_add, has_shortcut);
test.CheckForward(5e-3);
}
@@ -92,10 +92,9 @@ void CheckOutput(const framework::Tensor &cpu_res,
}
// Use Paddle conv2d op results as baseline
void ComputeConv2DForward(const platform::CUDADeviceContext &ctx,
const Tensor &cpu_input, const Tensor &cpu_filter,
Tensor *cpu_output, int stride, int padding) {
framework::Scope scope;
auto *input = scope.Var("Input")->GetMutable<framework::LoDTensor>();
auto *filter = scope.Var("Filter")->GetMutable<framework::LoDTensor>();
@@ -108,10 +107,12 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx,
framework::AttributeMap attrs;
bool use_cudnn = true;
std::string data_format = "NHWC";
std::vector<int> strides = {stride, stride};
std::vector<int> paddings = {padding, padding};
attrs.insert({"strides", strides});
attrs.insert({"paddings", paddings});
attrs.insert({"use_cudnn", use_cudnn}); attrs.insert({"use_cudnn", use_cudnn});
attrs.insert({"data_format", data_format}); attrs.insert({"data_format", data_format});
attrs.insert({"padding_algorithm", padding_algorithm});
auto op = framework::OpRegistry::CreateOp( auto op = framework::OpRegistry::CreateOp(
"conv2d", {{"Input", {"Input"}}, {"Filter", {"Filter"}}}, "conv2d", {{"Input", {"Input"}}, {"Filter", {"Filter"}}},
...@@ -122,7 +123,6 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, ...@@ -122,7 +123,6 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx,
} }
// Use Paddle conv2d_grad op results as baseline
void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx,
const Tensor &cpu_input, const Tensor &cpu_filter,
const Tensor &cpu_output_grad,
@@ -147,7 +147,7 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx,
framework::AttributeMap attrs;
bool use_cudnn = true;
std::string data_format = "NHWC";
std::string padding_algorithm = "EXPLICIT";
std::vector<int> strides = {stride, stride};
std::vector<int> paddings = {padding, padding};
std::vector<int> dilations = {dilation, dilation};
@@ -216,6 +216,8 @@ class CudnnNormConvolutionTester {
kernel_size_ = kernel_size;
stride_ = stride;
padding_ = (kernel_size_ - 1) / 2;
out_height_ = (height_ + 2 * padding_ - kernel_size_) / stride_ + 1;
out_width_ = (width_ + 2 * padding_ - kernel_size_) / stride_ + 1;
SetUp();
}
@@ -227,6 +229,15 @@ class CudnnNormConvolutionTester {
platform::DeviceContextPool::Instance().Get(
platform::CUDAPlace(0)));
if (!Support(*ctx)) {
LOG(INFO)
<< "Current test is only supported on platforms with compute "
<< "capability greater than or equal to 70, and the kernel size "
<< "must be equal to 1 or 3. Besides, when the kernel size is 1, "
<< "the stride must be 1 if the compute capability is equal to 70.";
return;
}
framework::Tensor cpu_output_base;
framework::Tensor cpu_sum_base;
framework::Tensor cpu_sum_of_square_base;
@@ -277,15 +288,17 @@ class CudnnNormConvolutionTester {
&cpu_filter_nchw_);
// transpose the filter from NCHW to NHWC
TransposeNchwToNhwc<T>(cpu_filter_nchw_, &cpu_filter_nhwc_);
InitRandomTensor<T>(
{batch_size_, out_height_, out_width_, output_channels_},
&cpu_output_grad_);
}
void BaselineForward(const platform::CUDADeviceContext &ctx,
framework::Tensor *cpu_output_base,
framework::Tensor *cpu_sum_base,
framework::Tensor *cpu_sum_of_square_base) {
ComputeConv2DForward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_base,
stride_, padding_);
ComputeSumAndSquareSum<T>(*cpu_output_base, cpu_sum_base,
cpu_sum_of_square_base);
}
@@ -293,10 +306,9 @@ class CudnnNormConvolutionTester {
void BaselineBackward(const platform::CUDADeviceContext &ctx,
framework::Tensor *cpu_input_grad_base,
framework::Tensor *cpu_filter_grad_base) {
ComputeConv2DBackward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_grad_,
cpu_input_grad_base, cpu_filter_grad_base, stride_,
padding_, dilation_);
}
// get forward results of cudnn_norm_conv
@@ -316,7 +328,7 @@ class CudnnNormConvolutionTester {
T *input_ptr = input.data<T>();
T *filter_ptr = filter_nhwc.data<T>();
T *output_ptr = output.mutable_data<T>(
{batch_size_, out_height_, out_width_, output_channels_}, place);
float *sum_ptr =
sum.mutable_data<float>({1, 1, 1, output_channels_}, place);
float *sum_of_square_ptr =
@@ -369,10 +381,25 @@ class CudnnNormConvolutionTester {
TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad);
}
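// Per the check below, the fused kernel needs compute capability >= 70 and a
// 1x1 or 3x3 filter; on sm70, a 1x1 filter additionally requires stride 1.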
bool Support(const platform::CUDADeviceContext &ctx) {
if (ctx.GetComputeCapability() == 70) {
if ((kernel_size_ == 3) || ((kernel_size_ == 1) && (stride_ == 1))) {
return true;
}
} else if (ctx.GetComputeCapability() > 70) {
if ((kernel_size_ == 3) || (kernel_size_ == 1)) {
return true;
}
}
return false;
}
private:
int batch_size_;
int height_;
int width_;
int out_height_;
int out_width_;
int input_channels_;
int output_channels_;
int kernel_size_;
@@ -437,3 +464,19 @@ TEST(CudnnNormConvFp16, K1S1O4) {
test.CheckForward(1e-3, true);
test.CheckBackward(1e-3, true);
}
// test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4
TEST(CudnnNormConvFp16, K1S2O4) {
int batch_size = 4;
int height = 8;
int width = 8;
int input_channels = 32;
int output_channels = 128;
int kernel_size = 1;
int stride = 2;
CudnnNormConvolutionTester<paddle::platform::float16> test(
batch_size, height, width, input_channels, output_channels, kernel_size,
stride);
test.CheckForward(1e-3, true);
test.CheckBackward(1e-3);
}