Unverified commit 7ae461eb, authored by tensor-tang, committed by GitHub

[CPU] refine cpu softmax bwd (#17534)

* refine softmax fwd

test=develop

* refine cpu softmax bwd

test=develop

* fix batch size

test=develop

* fix compile issue with gpu

test=develop

* add value clip
Parent 8818c94c
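For reference, the softmax backward that this patch vectorizes computes, per row, dx = (dy - dot(dy, y)) * y, where y is the softmax output and dy the upstream gradient. A minimal scalar sketch of that formula (illustration only; softmax_grad_ref is a hypothetical name, not part of the patch):

#include <cstddef>

// One softmax row: dx[i] = (dy[i] - dot(dy, y)) * y[i].
void softmax_grad_ref(std::size_t n, const float* y, const float* dy,
                      float* dx) {
  float dot = 0.f;
  for (std::size_t i = 0; i < n; ++i) dot += dy[i] * y[i];
  for (std::size_t i = 0; i < n; ++i) dx[i] = (dy[i] - dot) * y[i];
}

The new AVX path implements this with vec_mul_reduce (the dot product), vec_add_bias (subtracting it) and vec_mul (the final elementwise product).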
@@ -176,6 +176,79 @@ inline void vec_sum<float, platform::avx>(const size_t n, const float* x,
#endif
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_mul(const size_t n, const T* x, const T* y, T* z) {
for (size_t i = 0; i < n; ++i) {
z[i] = x[i] * y[i];
}
}
template <>
inline void vec_mul<float, platform::avx>(const size_t n, const float* x,
const float* y, float* z) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_mul<float, platform::isa_any>(n, x, y, z);
return;
}
unsigned int i = 0, end = 0;
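  // Round n down to the largest multiple of the 8-float AVX block;
  // the remainder past 'end' is handled by the scalar tail loop below.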
end = n & ~(block - 1);
for (i = 0; i < end; i += block) {
_mm256_storeu_ps(
z + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
}
for (; i < n; i++) {
z[i] = x[i] * y[i];
}
#else
vec_mul<float, platform::isa_any>(n, x, y, z);
#endif
}
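// Example: with n = 30, end = 30 & ~7 = 24, so the vectorized loop covers
// indices 0..23 and the scalar tail handles indices 24..29.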
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) {
z[0] = x[0] * y[0];
for (size_t i = 1; i < n; ++i) {
z[0] += x[i] * y[i];
}
}
template <>
inline void vec_mul_reduce<float, platform::avx>(const size_t n, const float* x,
const float* y, float* z) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
return;
}
unsigned int i = 0, end = 0;
z[0] = 0.f;
end = n & ~(block - 1);
__m256 tmp = _mm256_setzero_ps();
for (i = 0; i < end; i += block) {
tmp = _mm256_add_ps(
tmp, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
}
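  // Horizontal reduction of the eight packed partial sums in 'tmp':
  // hadd pairs elements within each 128-bit lane, the permute2f128 plus add
  // folds the upper lane onto the lower one, and the final _mm_hadd_ps
  // collapses the remaining pair into element 0, which _mm_store_ss writes
  // to z[0].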
__m256 hsum = _mm256_hadd_ps(tmp, tmp);
hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
_mm_store_ss(z, _mm_hadd_ps(_mm256_castps256_ps128(hsum),
_mm256_castps256_ps128(hsum)));
for (; i < n; i++) {
z[0] += x[i] * y[i];
}
#else
vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
#endif
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
...
@@ -199,6 +199,70 @@ TEST(CpuVecTest, vec_clip) {
vec_clip<double, platform::isa_any>);
}
template <typename T>
void compare_mul(
size_t n, std::function<void(const size_t, const T*, const T*, T*)> tgt,
std::function<void(const size_t, const T*, const T*, T*)> ref) {
std::vector<T> x(n), y(n);
std::vector<T> ztgt(n), zref(n);
RandomVec<T>(n, x.data(), static_cast<T>(-2), static_cast<T>(2));
RandomVec<T>(n, y.data(), static_cast<T>(-2), static_cast<T>(2));
const T* x_data = x.data();
const T* y_data = y.data();
T* ztgt_data = ztgt.data();
T* zref_data = zref.data();
tgt(n, x_data, y_data, ztgt_data);
ref(n, x_data, y_data, zref_data);
for (size_t i = 0; i < n; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
}
TEST(CpuVecTest, vec_mul) {
namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT
for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
compare_mul<float>(sz, vec_mul<float>, vec_mul<float, platform::isa_any>);
compare_mul<float>(sz, vec_mul<float, platform::avx>,
vec_mul<float, platform::isa_any>);
}
compare_mul<double>(30U, vec_mul<double>, vec_mul<double, platform::isa_any>);
}
template <typename T>
void compare_mul_reduce(
size_t n, std::function<void(const size_t, const T*, const T*, T*)> tgt,
std::function<void(const size_t, const T*, const T*, T*)> ref) {
std::vector<T> x(n), y(n);
T ztgt_data, zref_data;
RandomVec<T>(n, x.data(), static_cast<T>(-2), static_cast<T>(2));
RandomVec<T>(n, y.data(), static_cast<T>(-2), static_cast<T>(2));
const T* x_data = x.data();
const T* y_data = y.data();
tgt(n, x_data, y_data, &ztgt_data);
ref(n, x_data, y_data, &zref_data);
EXPECT_NEAR(ztgt_data, zref_data, 1e-3);
}
TEST(CpuVecTest, vec_mul_reduce) {
namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT
for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
compare_mul_reduce<float>(sz, vec_mul_reduce<float>,
vec_mul_reduce<float, platform::isa_any>);
compare_mul_reduce<float>(sz, vec_mul_reduce<float, platform::avx>,
vec_mul_reduce<float, platform::isa_any>);
}
compare_mul_reduce<double>(30U, vec_mul_reduce<double>,
vec_mul_reduce<double, platform::isa_any>);
}
template <typename T>
void TestInplace(const int n, std::function<void(const int, const T*, T*)> tgt,
std::function<void(const int, const T*, T*)> ref) {
...
@@ -27,7 +27,7 @@ class SoftmaxFunctor {
const framework::Tensor* X, framework::Tensor* Y);
};
-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename T, typename Enable = void>
class SoftmaxGradFunctor {
public:
void operator()(const DeviceContext& context, const int axis_dim,
...
@@ -140,16 +140,16 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
};
template <typename DeviceContext, typename T>
-void SoftmaxGradFunctor<DeviceContext, T>::operator()(
-    const DeviceContext& context, const int axis_dim,
-    const framework::Tensor* y, const framework::Tensor* y_grad,
-    framework::Tensor* x_grad) {
+void SoftmaxGradEigen(const DeviceContext& context, const int axis_dim,
+                      const framework::Tensor* y,
+                      const framework::Tensor* y_grad,
+                      framework::Tensor* x_grad) {
  auto softmax = EigenMatrix<T>::From(*y);
  auto softmax_grad = EigenMatrix<T>::From(*y_grad);
  auto logits_grad = EigenMatrix<T>::From(*x_grad);
-  const int kBatchDim = 0;
-  const int kClassDim = 1;
+  constexpr int kBatchDim = 0;
+  constexpr int kClassDim = 1;
const int batch_size = softmax.dimension(kBatchDim);
const int num_classes = softmax.dimension(kClassDim);
@@ -169,6 +169,48 @@ void SoftmaxGradFunctor<DeviceContext, T>::operator()(
logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax;
}
template <typename DeviceContext, typename T, typename Enable>
void SoftmaxGradFunctor<DeviceContext, T, Enable>::operator()(
const DeviceContext& context, const int axis_dim,
const framework::Tensor* y, const framework::Tensor* y_grad,
framework::Tensor* x_grad) {
SoftmaxGradEigen<DeviceContext, T>(context, axis_dim, y, y_grad, x_grad);
}
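// Note: the extra 'Enable' template parameter exists so the CPU-only
// specialization below can be selected via SFINAE. enable_if_CPU is
// defined earlier in this header; a sketch of its assumed shape
// (an assumption for illustration, not part of this patch):
//
//   template <typename DeviceContext>
//   using enable_if_CPU = typename std::enable_if<
//       std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type;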
template <typename DeviceContext, typename T>
class SoftmaxGradFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
public:
void operator()(const DeviceContext& context, const int axis_dim,
const framework::Tensor* y, const framework::Tensor* y_grad,
framework::Tensor* x_grad) {
auto out_dims = y->dims();
constexpr int kBatchDim = 0;
constexpr int kClassDim = 1;
const int num_classes = out_dims[kClassDim];
const int batch_size = out_dims[kBatchDim];
const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && platform::MayIUse(platform::avx)) {
const T* out_data = y->data<T>();
const T* out_grad = y_grad->data<T>();
T* in_grad = x_grad->data<T>();
for (int bs = 0; bs < batch_size; ++bs) {
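      // Per row: scalar = dot(out_grad, out_data), then
      // in_grad = (out_grad - scalar) * out_data, matching the Eigen path.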
T scalar;
vec_mul_reduce<T, platform::avx>(num_classes, out_grad, out_data,
&scalar);
scalar *= static_cast<T>(-1);
vec_add_bias<T, platform::avx>(num_classes, scalar, out_grad, in_grad);
vec_mul<T, platform::avx>(num_classes, out_data, in_grad, in_grad);
out_data += num_classes;
out_grad += num_classes;
in_grad += num_classes;
}
} else {
SoftmaxGradEigen<DeviceContext, T>(context, axis_dim, y, y_grad, x_grad);
}
}
};
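// The AVX fast path above requires num_remain == 1, i.e. softmax taken
// along the innermost axis (axis_dim == num_classes), plus runtime AVX
// support; every other shape falls back to SoftmaxGradEigen.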
} // namespace math
} // namespace operators
} // namespace paddle