Unverified commit e93e8a3f authored by huangjiyi, committed by GitHub

update (#52878)

Parent aac8da90
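The hunks below all apply the same mechanical migration: kernel registrations move from the legacy REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL macros to PD_REGISTER_STRUCT_KERNEL, and each kernel class reorders its template parameters from <DeviceContext, T> to <T, DeviceContext>, presumably because the new macro instantiates the kernel template with the data type first. A minimal sketch of the before/after pattern, using a hypothetical my_op kernel (the op and class names are illustrative, not from this diff):

// Before: one explicit instantiation per data type, with the device
// baked into both the macro name and the first template argument.
template <typename DeviceContext, typename T>
class MyOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {}
};
REGISTER_OP_CPU_KERNEL(my_op,
                       ops::MyOpKernel<phi::CPUContext, float>,
                       ops::MyOpKernel<phi::CPUContext, double>);

// After: the data type comes first in the template parameter list, and
// the macro takes the device, a layout tag, the kernel template, and
// the list of data types to instantiate.
template <typename T, typename DeviceContext>
class MyOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {}
};
PD_REGISTER_STRUCT_KERNEL(
    my_op, CPU, ALL_LAYOUT, ops::MyOpKernel, float, double) {}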
@@ -53,7 +53,7 @@ class GetFloatStatusMaker : public framework::OpProtoAndCheckerMaker {
}
};
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class GetFloatStatusKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -75,4 +75,5 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(get_float_status, ops::GetFloatStatusKernel<CPU, float>);
PD_REGISTER_STRUCT_KERNEL(
get_float_status, CPU, ALL_LAYOUT, ops::GetFloatStatusKernel, float) {}
@@ -111,9 +111,12 @@ REGISTER_OPERATOR(global_gather,
ops::GlobalGatherOpGradMaker<paddle::framework::OpDesc>,
ops::GlobalGatherOpGradMaker<paddle::imperative::OpBase>)
REGISTER_OP_CPU_KERNEL(global_gather,
ops::GlobalGatherOpCPUKernel<float>,
ops::GlobalGatherOpCPUKernel<double>,
ops::GlobalGatherOpCPUKernel<int>,
ops::GlobalGatherOpCPUKernel<int64_t>,
ops::GlobalGatherOpCPUKernel<plat::float16>);
PD_REGISTER_STRUCT_KERNEL(global_gather,
CPU,
ALL_LAYOUT,
ops::GlobalGatherOpCPUKernel,
float,
double,
int,
int64_t,
plat::float16) {}
@@ -261,7 +261,7 @@ struct GlobalGatherProcessGroupFunctor<phi::GPUContext, T> {
}
};
template <typename T>
template <typename T, typename DeviceContext>
class GlobalGatherOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -283,9 +283,12 @@ class GlobalGatherOpCUDAKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(global_gather,
ops::GlobalGatherOpCUDAKernel<float>,
ops::GlobalGatherOpCUDAKernel<double>,
ops::GlobalGatherOpCUDAKernel<int>,
ops::GlobalGatherOpCUDAKernel<int64_t>,
ops::GlobalGatherOpCUDAKernel<plat::float16>);
PD_REGISTER_STRUCT_KERNEL(global_gather,
GPU,
ALL_LAYOUT,
ops::GlobalGatherOpCUDAKernel,
float,
double,
int,
int64_t,
plat::float16) {}
@@ -25,7 +25,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
template <typename T, typename DeviceContext>
class GlobalGatherOpCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......
@@ -115,9 +115,12 @@ REGISTER_OPERATOR(global_scatter,
ops::GlobalScatterOpGradMaker<paddle::framework::OpDesc>,
ops::GlobalScatterOpGradMaker<paddle::imperative::OpBase>)
REGISTER_OP_CPU_KERNEL(global_scatter,
ops::GlobalScatterOpCPUKernel<float>,
ops::GlobalScatterOpCPUKernel<double>,
ops::GlobalScatterOpCPUKernel<int>,
ops::GlobalScatterOpCPUKernel<int64_t>,
ops::GlobalScatterOpCPUKernel<plat::float16>);
PD_REGISTER_STRUCT_KERNEL(global_scatter,
CPU,
ALL_LAYOUT,
ops::GlobalScatterOpCPUKernel,
float,
double,
int,
int64_t,
plat::float16) {}
@@ -259,7 +259,7 @@ struct GlobalScatterProcessGroupFunctor<phi::GPUContext, T> {
}
};
template <typename T>
template <typename T, typename DeviceContext>
class GlobalScatterOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -281,9 +281,12 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(global_scatter,
ops::GlobalScatterOpCUDAKernel<float>,
ops::GlobalScatterOpCUDAKernel<double>,
ops::GlobalScatterOpCUDAKernel<int>,
ops::GlobalScatterOpCUDAKernel<int64_t>,
ops::GlobalScatterOpCUDAKernel<plat::float16>);
PD_REGISTER_STRUCT_KERNEL(global_scatter,
GPU,
ALL_LAYOUT,
ops::GlobalScatterOpCUDAKernel,
float,
double,
int,
int64_t,
plat::float16) {}
@@ -25,7 +25,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
template <typename T, typename DeviceContext>
class GlobalScatterOpCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......
@@ -328,7 +328,7 @@ std::vector<phi::DenseTensor> SampleMaskForOneImage(
return res;
}
template <typename T>
template <typename T, typename DeviceContext>
class GenerateMaskLabelsKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -533,5 +533,9 @@ REGISTER_OPERATOR(
ops::GenerateMaskLabelsOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(generate_mask_labels,
ops::GenerateMaskLabelsKernel<float>);
PD_REGISTER_STRUCT_KERNEL(generate_mask_labels,
CPU,
ALL_LAYOUT,
ops::GenerateMaskLabelsKernel,
float) {}
@@ -510,7 +510,7 @@ std::vector<phi::DenseTensor> SampleRoisForOneImage(
return res;
}
template <typename T>
template <typename T, typename DeviceContext>
class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
@@ -811,9 +811,12 @@ REGISTER_OPERATOR(
ops::GenerateProposalLabelsOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(generate_proposal_labels,
ops::GenerateProposalLabelsKernel<float>,
ops::GenerateProposalLabelsKernel<double>);
PD_REGISTER_STRUCT_KERNEL(generate_proposal_labels,
CPU,
ALL_LAYOUT,
ops::GenerateProposalLabelsKernel,
float,
double) {}
REGISTER_OP_VERSION(generate_proposal_labels)
.AddCheckpoint(
......
@@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
template <typename T, typename DeviceContext>
class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
@@ -99,7 +99,10 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
paddle::operators::BatchSizeLikeNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
gaussian_random_batch_size_like,
paddle::operators::CPUGaussianRandomBatchSizeLikeKernel<float>,
paddle::operators::CPUGaussianRandomBatchSizeLikeKernel<double>);
namespace ops = paddle::operators;
PD_REGISTER_STRUCT_KERNEL(gaussian_random_batch_size_like,
CPU,
ALL_LAYOUT,
ops::CPUGaussianRandomBatchSizeLikeKernel,
float,
double) {}
@@ -47,7 +47,7 @@ struct GaussianGenerator {
}
};
template <typename T>
template <typename T, typename DeviceContext>
class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
@@ -78,9 +78,12 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
gaussian_random_batch_size_like,
paddle::operators::GPUGaussianRandomBatchSizeLikeKernel<
paddle::platform::float16>,
paddle::operators::GPUGaussianRandomBatchSizeLikeKernel<float>,
paddle::operators::GPUGaussianRandomBatchSizeLikeKernel<double>);
namespace ops = paddle::operators;
namespace plat = paddle::platform;
PD_REGISTER_STRUCT_KERNEL(gaussian_random_batch_size_like,
GPU,
ALL_LAYOUT,
ops::GPUGaussianRandomBatchSizeLikeKernel,
float,
double,
plat::float16) {}
@@ -136,6 +136,10 @@ using CPU = phi::CPUContext;
REGISTER_OPERATOR(graph_khop_sampler,
ops::GraphKhopSamplerOP,
ops::GraphKhopSamplerOpMaker);
REGISTER_OP_CPU_KERNEL(graph_khop_sampler,
ops::GraphKhopSamplerOpKernel<CPU, int32_t>,
ops::GraphKhopSamplerOpKernel<CPU, int64_t>);
PD_REGISTER_STRUCT_KERNEL(graph_khop_sampler,
CPU,
ALL_LAYOUT,
ops::GraphKhopSamplerOpKernel,
int32_t,
int64_t) {}
@@ -412,7 +412,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
thrust::raw_pointer_cast(values.data()));
}
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -668,6 +668,9 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel<T> {
using CUDA = phi::GPUContext;
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(graph_khop_sampler,
ops::GraphKhopSamplerOpCUDAKernel<CUDA, int32_t>,
ops::GraphKhopSamplerOpCUDAKernel<CUDA, int64_t>);
PD_REGISTER_STRUCT_KERNEL(graph_khop_sampler,
GPU,
ALL_LAYOUT,
ops::GraphKhopSamplerOpCUDAKernel,
int32_t,
int64_t) {}
@@ -191,7 +191,7 @@ void SampleNeighbors(const T* src,
}
}
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class GraphKhopSamplerOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/group_norm_op.h"
#include <memory>
#include <string>
#include <unordered_map>
......
This diff is collapsed.
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <array>
#include <numeric>
#include <string>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using DataLayout = phi::DataLayout;
template <typename DeviceContext, typename T>
class GroupNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
const float epsilon = ctx.Attr<float>("epsilon");
auto* scale = ctx.Input<phi::DenseTensor>("Scale");
auto* bias = ctx.Input<phi::DenseTensor>("Bias");
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Output<phi::DenseTensor>("Y");
auto* mean = ctx.Output<phi::DenseTensor>("Mean");
auto* var = ctx.Output<phi::DenseTensor>("Variance");
const auto groups = ctx.Attr<int>("groups");
const auto x_dims = x->dims();
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int group_size = C / groups;
y->mutable_data<T>(ctx.GetPlace());
mean->mutable_data<T>(ctx.GetPlace());
var->mutable_data<T>(ctx.GetPlace());
auto* x_data = x->data<T>();
auto* y_data = y->data<T>();
auto* mean_data = mean->data<T>();
auto* var_data = var->data<T>();
const T* scale_data = nullptr;
if (scale) scale_data = scale->data<T>();
const T* bias_data = nullptr;
if (bias) bias_data = bias->data<T>();
int imsize = 1;
if (data_layout == DataLayout::kNCHW) {
for (int i = 2; i < x_dims.size(); ++i) {
imsize *= x_dims[i];
}
} else {
for (int i = 1; i < x_dims.size() - 1; ++i) {
imsize *= x_dims[i];
}
}
auto* iter_x_data = x_data;
auto* iter_y_data = y_data;
for (int bid = 0; bid < x_dims[0]; bid++) {
for (int gid = 0; gid < groups; gid++) {
const int64_t M = 8;
std::array<T, M> x_mean_arr;
std::array<T, M> x_var_arr;
std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0));
std::fill(x_var_arr.begin(), x_var_arr.end(), T(0));
T x_mean = 0, x_var = 0;
int number =
std::min(group_size, static_cast<int>(C - gid * group_size));
auto* tmp_x = iter_x_data;
auto* x_src_data = iter_x_data;
auto* tmp_y = iter_y_data;
auto* y_src_data = iter_y_data;
if (data_layout == DataLayout::kNCHW) {
for (int cid = 0; cid < number; cid++) {
int imid;
for (imid = 0; imid < imsize - (imsize % M);
imid += M, iter_x_data += M) {
// TODO(gaoxiang): Because AVX/AVX2/AVX512 can not directly used
// in template class/function, before we complete high
// performance cpu vector extension, temporarily unrolling
// loop to get high precision and performance
x_mean_arr[0] += iter_x_data[0];
x_var_arr[0] += iter_x_data[0] * iter_x_data[0];
x_mean_arr[1] += iter_x_data[1];
x_var_arr[1] += iter_x_data[1] * iter_x_data[1];
x_mean_arr[2] += iter_x_data[2];
x_var_arr[2] += iter_x_data[2] * iter_x_data[2];
x_mean_arr[3] += iter_x_data[3];
x_var_arr[3] += iter_x_data[3] * iter_x_data[3];
x_mean_arr[4] += iter_x_data[4];
x_var_arr[4] += iter_x_data[4] * iter_x_data[4];
x_mean_arr[5] += iter_x_data[5];
x_var_arr[5] += iter_x_data[5] * iter_x_data[5];
x_mean_arr[6] += iter_x_data[6];
x_var_arr[6] += iter_x_data[6] * iter_x_data[6];
x_mean_arr[7] += iter_x_data[7];
x_var_arr[7] += iter_x_data[7] * iter_x_data[7];
}
x_mean =
std::accumulate(x_mean_arr.cbegin(), x_mean_arr.cend(), x_mean);
x_var =
std::accumulate(x_var_arr.cbegin(), x_var_arr.cend(), x_var);
std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0));
std::fill(x_var_arr.begin(), x_var_arr.end(), T(0));
for (; imid < imsize; imid++, iter_x_data++) {
x_mean += iter_x_data[0];
x_var += iter_x_data[0] * iter_x_data[0];
}
}
} else {
for (int cid = 0; cid < number; cid++) {
iter_x_data = tmp_x + cid;
int imid;
for (imid = 0; imid < imsize - (imsize % M);
imid += M, iter_x_data += M * C) {
// TODO(gaoxiang): Because AVX/AVX2/AVX512 can not directly used
// in template class/function, before we complete high
// performance cpu vector extension, temporarily unrolling
// loop to get high precision and performance
x_mean_arr[0] += iter_x_data[0 * C];
x_var_arr[0] += iter_x_data[0 * C] * iter_x_data[0 * C];
x_mean_arr[1] += iter_x_data[1 * C];
x_var_arr[1] += iter_x_data[1 * C] * iter_x_data[1 * C];
x_mean_arr[2] += iter_x_data[2 * C];
x_var_arr[2] += iter_x_data[2 * C] * iter_x_data[2 * C];
x_mean_arr[3] += iter_x_data[3 * C];
x_var_arr[3] += iter_x_data[3 * C] * iter_x_data[3 * C];
x_mean_arr[4] += iter_x_data[4 * C];
x_var_arr[4] += iter_x_data[4 * C] * iter_x_data[4 * C];
x_mean_arr[5] += iter_x_data[5 * C];
x_var_arr[5] += iter_x_data[5 * C] * iter_x_data[5 * C];
x_mean_arr[6] += iter_x_data[6 * C];
x_var_arr[6] += iter_x_data[6 * C] * iter_x_data[6 * C];
x_mean_arr[7] += iter_x_data[7 * C];
x_var_arr[7] += iter_x_data[7 * C] * iter_x_data[7 * C];
}
x_mean =
std::accumulate(x_mean_arr.cbegin(), x_mean_arr.cend(), x_mean);
x_var =
std::accumulate(x_var_arr.cbegin(), x_var_arr.cend(), x_var);
std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0));
std::fill(x_var_arr.begin(), x_var_arr.end(), T(0));
for (; imid < imsize; imid++, iter_x_data += C) {
x_mean += iter_x_data[0];
x_var += iter_x_data[0] * iter_x_data[0];
}
}
iter_x_data = tmp_x + group_size;
}
x_mean /= number * imsize;
x_var /= number * imsize;
x_var = std::max(x_var - x_mean * x_mean, T(0));
T var_inv = T(1) / std::sqrt(x_var + epsilon);
mean_data[bid * groups + gid] = x_mean;
var_data[bid * groups + gid] = x_var;
if (data_layout == DataLayout::kNCHW) {
for (int cid = 0; cid < number; cid++) {
for (int imid = 0; imid < imsize; imid++, tmp_x++, iter_y_data++) {
T val = (tmp_x[0] - x_mean) * var_inv;
if (scale_data) val *= scale_data[gid * group_size + cid];
if (bias_data) val += bias_data[gid * group_size + cid];
iter_y_data[0] = val;
}
}
} else {
for (int cid = 0; cid < number; cid++) {
tmp_x = x_src_data + cid;
iter_y_data = y_src_data + cid;
for (int imid = 0; imid < imsize;
imid++, tmp_x += C, iter_y_data += C) {
T val = (tmp_x[0] - x_mean) * var_inv;
if (scale_data) val *= scale_data[gid * group_size + cid];
if (bias_data) val += bias_data[gid * group_size + cid];
iter_y_data[0] = val;
}
}
iter_y_data = tmp_y + group_size;
}
}
if (data_layout == DataLayout::kNHWC) {
iter_x_data = x_data + (bid + 1) * C * imsize;
iter_y_data = y_data + (bid + 1) * C * imsize;
}
}
}
};
template <typename DeviceContext, typename T>
class GroupNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
const float epsilon = ctx.Attr<float>("epsilon");
auto* x = ctx.Input<phi::DenseTensor>("Y");
auto* var = ctx.Input<phi::DenseTensor>("Variance");
auto* scale = ctx.Input<phi::DenseTensor>("Scale");
auto* bias = ctx.Input<phi::DenseTensor>("Bias");
auto* d_y = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
const auto groups = ctx.Attr<int>("groups");
// init output
auto* d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* d_scale =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
auto* d_bias = ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));
const auto& x_dims = x->dims();
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int group_size = C / groups;
d_x->mutable_data<T>(ctx.GetPlace());
phi::funcs::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto* x_data = x->data<T>();
auto* d_x_data = d_x->data<T>();
auto* y_data = d_y->data<T>();
auto* var_data = var->data<T>();
T* d_scale_data = nullptr;
if (d_scale) {
d_scale->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_scale, static_cast<T>(0));
d_scale_data = d_scale->data<T>();
}
T* d_bias_data = nullptr;
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, d_bias, static_cast<T>(0));
d_bias_data = d_bias->data<T>();
}
const T* scale_data = nullptr;
if (scale) scale_data = scale->data<T>();
const T* bias_data = nullptr;
if (bias) bias_data = bias->data<T>();
int imsize = 1;
if (data_layout == DataLayout::kNCHW) {
for (int i = 2; i < x_dims.size(); ++i) {
imsize *= x_dims[i];
}
} else {
for (int i = 1; i < x_dims.size() - 1; ++i) {
imsize *= x_dims[i];
}
}
auto* iter_x_data = x_data;
auto* iter_d_x_data = d_x_data;
auto* iter_y_data = y_data;
for (int bid = 0; bid < x_dims[0]; bid++) {
for (int gid = 0; gid < groups; gid++) {
T x_var = var_data[bid * groups + gid];
T var_inv = 1.0 / sqrt(x_var + epsilon);
int number =
std::min(group_size, static_cast<int>(C - gid * group_size));
T number_inv = 1.0 / (number * imsize);
auto* tmp_x = iter_x_data;
auto* tmp_y = iter_y_data;
auto* tmp_d_x = iter_d_x_data;
auto* x_src_data = iter_x_data;
auto* y_src_data = iter_y_data;
auto* iter_x_data_backup = iter_x_data;
auto* iter_y_data_backup = iter_y_data;
auto* iter_d_x_data_backup = iter_d_x_data;
T dp_scale = 0, dp_bias = 0;
if (data_layout == DataLayout::kNCHW) {
for (int cid = 0; cid < number; cid++) {
for (int imid = 0; imid < imsize;
imid++, iter_x_data++, iter_y_data++) {
T val = iter_x_data[0];
if (bias_data) val -= bias_data[gid * group_size + cid];
T dval = iter_y_data[0];
dp_scale += val * dval;
if (scale_data)
dp_bias += dval * scale_data[gid * group_size + cid];
if (scale_data && scale_data[gid * group_size + cid] != 0)
val /= scale_data[gid * group_size + cid];
if (d_bias_data) d_bias_data[gid * group_size + cid] += dval;
if (d_scale_data)
d_scale_data[gid * group_size + cid] += val * dval;
}
}
for (int cid = 0; cid < number; cid++) {
for (int imid = 0; imid < imsize;
imid++, iter_d_x_data++, tmp_x++, tmp_y++) {
T v_y = tmp_x[0];
T dly = tmp_y[0];
T dss = dp_scale;
T dbs = dp_bias;
T v_scale = 1., v_bias = 0.;
if (scale_data) v_scale = scale_data[gid * group_size + cid];
if (bias_data) v_bias = bias_data[gid * group_size + cid];
v_y -= v_bias;
if (v_scale != 0) v_y /= v_scale;
iter_d_x_data[0] =
(dly * v_scale - number_inv * dss * v_y - number_inv * dbs) *
var_inv;
}
}
} else {
for (int cid = 0; cid < number; cid++) {
iter_x_data = x_src_data + cid;
iter_y_data = y_src_data + cid;
for (int imid = 0; imid < imsize;
imid++, iter_x_data += C, iter_y_data += C) {
T val = iter_x_data[0];
if (bias_data) val -= bias_data[gid * group_size + cid];
T dval = iter_y_data[0];
dp_scale += val * dval;
if (scale_data)
dp_bias += dval * scale_data[gid * group_size + cid];
if (scale_data && scale_data[gid * group_size + cid] != 0)
val /= scale_data[gid * group_size + cid];
if (d_bias_data) d_bias_data[gid * group_size + cid] += dval;
if (d_scale_data)
d_scale_data[gid * group_size + cid] += val * dval;
}
}
for (int cid = 0; cid < number; cid++) {
tmp_x = x_src_data + cid;
tmp_y = y_src_data + cid;
iter_d_x_data = tmp_d_x + cid;
for (int imid = 0; imid < imsize;
imid++, iter_d_x_data += C, tmp_x += C, tmp_y += C) {
T v_y = tmp_x[0];
T dly = tmp_y[0];
T dss = dp_scale;
T dbs = dp_bias;
T v_scale = 1.0, v_bias = 0.;
if (scale_data) v_scale = scale_data[gid * group_size + cid];
if (bias_data) v_bias = bias_data[gid * group_size + cid];
v_y -= v_bias;
if (v_scale != 0) v_y /= v_scale;
iter_d_x_data[0] =
(dly * v_scale - number_inv * dss * v_y - number_inv * dbs) *
var_inv;
}
}
iter_x_data = iter_x_data_backup + group_size;
iter_y_data = iter_y_data_backup + group_size;
iter_d_x_data = iter_d_x_data_backup + group_size;
}
}
if (data_layout == DataLayout::kNHWC) {
iter_x_data = x_data + (bid + 1) * C * imsize;
iter_d_x_data = d_x_data + (bid + 1) * C * imsize;
iter_y_data = y_data + (bid + 1) * C * imsize;
}
}
}
};
} // namespace operators
} // namespace paddle
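For reference, the per-(batch, group) computation that the GroupNormKernel above implements, matching its accumulation and normalization loops (with n channels in the group and spatial size imsize; the scale \gamma_c and bias \beta_c are applied only when the Scale/Bias inputs are present):

\mu = \frac{1}{n \cdot \text{imsize}} \sum_i x_i, \qquad
\sigma^2 = \max\!\left( \frac{1}{n \cdot \text{imsize}} \sum_i x_i^2 - \mu^2,\ 0 \right)

y_i = \gamma_c \, \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta_c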
@@ -91,10 +91,13 @@ REGISTER_OPERATOR(l1_norm,
ops::L1NormGradMaker<paddle::framework::OpDesc>,
ops::L1NormGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp);
REGISTER_OP_CPU_KERNEL(l1_norm, ops::L1NormKernel<phi::CPUContext, float>);
REGISTER_OP_CPU_KERNEL(l1_norm_grad,
ops::L1NormGradKernel<phi::CPUContext, float>);
REGISTER_OP_CUDA_KERNEL(l1_norm, ops::L1NormKernel<phi::GPUContext, float>);
REGISTER_OP_CUDA_KERNEL(l1_norm_grad,
ops::L1NormGradKernel<phi::GPUContext, float>);
PD_REGISTER_STRUCT_KERNEL(l1_norm, CPU, ALL_LAYOUT, ops::L1NormKernel, float) {}
PD_REGISTER_STRUCT_KERNEL(
l1_norm_grad, CPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_STRUCT_KERNEL(l1_norm, GPU, ALL_LAYOUT, ops::L1NormKernel, float) {}
PD_REGISTER_STRUCT_KERNEL(
l1_norm_grad, GPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {}
#endif
@@ -21,7 +21,7 @@ namespace paddle {
namespace operators {
// Out = sum(abs(X))
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class L1NormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
@@ -39,7 +39,7 @@ class L1NormKernel : public framework::OpKernel<T> {
};
// dX = dout * sign(X)
template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class L1NormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
......
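Written out, the formulas in the two comments above are:

\text{Out} = \sum_i |x_i|, \qquad dX_i = dOut \cdot \operatorname{sign}(x_i)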