Unverified · Commit 806ba6e7 authored by lijianshe02, committed by GitHub

add elementwise op function and add elementwise add/sub kernels test=develop (#2020)

Parent cba5736f
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include "lite/core/context.h"

namespace paddle {
namespace lite {
namespace fluid {

template <lite::TargetType Target>
struct ForRange {
  ForRange(const lite::Context<Target>& dev_ctx, size_t limit);

  template <typename Function>
  void operator()(Function func) const;
};

template <>
struct ForRange<lite::TargetType::kX86> {
  ForRange(const lite::X86Context& dev_ctx, size_t limit) : limit_(limit) {}

  template <typename Function>
  void operator()(Function func) const {
    for (size_t i = 0; i < limit_; ++i) {
      func(i);
    }
  }

  size_t limit_;
};

}  // namespace fluid
}  // namespace lite
}  // namespace paddle
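For orientation, ForRange<kX86> simply applies a functor to every index in [0, limit). A minimal standalone sketch of the same pattern (plain C++, no Lite dependencies; the Scale functor is hypothetical):

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical index functor, written the way kernels write functors for
// ForRange: all state is captured in members, operator() handles one index.
struct Scale {
  const float* x;
  float* y;
  float factor;
  void operator()(size_t i) const { y[i] = factor * x[i]; }
};

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f}, y(x.size());
  Scale scale{x.data(), y.data(), 2.f};
  // ForRange<kX86>(ctx, limit)(scale) reduces to exactly this serial loop.
  for (size_t i = 0; i < x.size(); ++i) scale(i);
  for (float v : y) std::cout << v << " ";  // prints: 2 4 6
}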
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

// In this x86-only Lite port there is no CUDA toolchain, so the CUDA
// function qualifiers expand to nothing.
#define HOSTDEVICE
#define DEVICE
#define HOST
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <algorithm>
#include <type_traits>

#include "lite/core/op_lite.h"
#include "lite/fluid/hostdevice.h"

namespace paddle {
namespace lite {
namespace fluid {

// Transform applies a unary or a binary functor to each element in a
// range defined by a pair of iterators.
//
// - The specialization for CPU calls std::transform.
// - The specialization for CUDA calls thrust::transform.
//
// NOTE: InputIter and OutputIter are defined as different types, because
// InputIter points to the op's inputs while OutputIter points to the op's
// outputs.
//
// NOTE: We don't assume InputIter to be const InputType* and OutputIter to
// be OutputType*, because we might use an iterator class such as
// paddle::fluid::operators::RowwiseTransformIterator.
template <lite::TargetType Target>
struct Transform {
  // The unary version.
  template <typename InputIter, typename OutputIter, typename UnaryOperation>
  void operator()(const lite::Context<Target>& context,
                  InputIter first,
                  InputIter last,
                  OutputIter result,
                  UnaryOperation op);

  // The binary version.
  template <typename InputIter1,
            typename InputIter2,
            typename OutputIter,
            typename BinaryOperation>
  void operator()(const lite::Context<Target>& context,
                  InputIter1 first1,
                  InputIter1 last1,
                  InputIter2 first2,
                  OutputIter result,
                  BinaryOperation op);
};

template <>
struct Transform<lite::TargetType::kX86> {
  template <typename InputIter, typename OutputIter, typename UnaryOperation>
  void operator()(const lite::X86Context& context,
                  InputIter first,
                  InputIter last,
                  OutputIter result,
                  UnaryOperation op) {
    std::transform(first, last, result, op);
  }

  template <typename InputIter1,
            typename InputIter2,
            typename OutputIter,
            typename BinaryOperation>
  void operator()(const lite::X86Context& context,
                  InputIter1 first1,
                  InputIter1 last1,
                  InputIter2 first2,
                  OutputIter result,
                  BinaryOperation op) {
    std::transform(first1, last1, first2, result, op);
  }
};

}  // namespace fluid
}  // namespace lite
}  // namespace paddle
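As a quick illustration, the binary overload of Transform<kX86> is a thin wrapper over std::transform; a standalone sketch with std::plus:

#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> x = {1, 2, 3}, y = {10, 20, 30}, z(3);
  // Transform<kX86>::operator()(ctx, first1, last1, first2, result, op)
  // forwards straight to this call on the CPU path.
  std::transform(x.begin(), x.end(), y.begin(), z.begin(), std::plus<float>());
  for (float v : z) std::cout << v << " ";  // prints: 11 22 33
}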
@@ -33,6 +33,7 @@ add_kernel(concat_compute_x86 X86 basic SRCS concat_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(shape_compute_x86 X86 basic SRCS shape_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(sequence_pool_compute_x86 X86 basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} sequence_pooling)
 add_kernel(softmax_compute_x86 X86 basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} softmax)
+add_kernel(elementwise_compute_x86 X86 basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps})

 if(NOT LITE_WITH_X86)
   return()
@@ -46,3 +47,4 @@ lite_cc_test(test_concat_compute_x86 SRCS concat_compute_test.cc DEPS concat_compute_x86)
 lite_cc_test(test_sequence_pool_compute_x86 SRCS sequence_pool_compute_test.cc DEPS sequence_pool_compute_x86)
 lite_cc_test(test_shape_compute_x86 SRCS shape_compute_test.cc DEPS shape_compute_x86)
 lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86)
+lite_cc_test(test_elementwise_compute_x86 SRCS elementwise_compute_test.cc DEPS elementwise_compute_x86)
@@ -35,21 +35,3 @@ REGISTER_LITE_KERNEL(elementwise_add,
     .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
     .Finalize();
-
-#ifdef LITE_WITH_X86
-REGISTER_LITE_KERNEL(
-    elementwise_sub_grad,
-    kX86,
-    kFloat,
-    kNCHW,
-    paddle::lite::kernels::x86::ElementwiseSubGradCompute<float>,
-    def)
-    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))})
-    .BindInput(paddle::framework::GradVarName("Out"),
-               {LiteType::GetTensorTy(TARGET(kX86))})
-    .BindOutput(paddle::framework::GradVarName("X"),
-                {LiteType::GetTensorTy(TARGET(kX86))})
-    .BindOutput(paddle::framework::GradVarName("Y"),
-                {LiteType::GetTensorTy(TARGET(kX86))})
-    .Finalize();
-#endif
@@ -15,11 +15,8 @@
 #include "lite/core/kernel.h"
 #include "lite/core/op_registry.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "lite/fluid/eigen.h"
+#include "lite/kernels/x86/elementwise_op_function.h"

 namespace paddle {
 namespace lite {
@@ -45,74 +42,17 @@ class ElementwiseSubCompute
   void Run() override {
     auto& param = *param_.get_mutable<param_t>();
     auto& context = ctx_->As<X86Context>();
-    CHECK(context.x86_device_context());
     param.Out->template mutable_data<T>();
-    paddle::operators::ElementwiseComputeEx<SubFunctor<T>,
-                                            platform::CPUDeviceContext,
-                                            T>(*context.x86_execution_context(),
-                                               &param.X->raw_tensor(),
-                                               &param.Y->raw_tensor(),
-                                               param.axis,
-                                               SubFunctor<T>(),
-                                               &param.Out->raw_tensor());
+    paddle::lite::kernels::x86::ElementwiseComputeEx<SubFunctor<T>,
+                                                     lite::TargetType::kX86,
+                                                     T>(
+        context, param.X, param.Y, param.axis, SubFunctor<T>(), param.Out);
   }

   virtual ~ElementwiseSubCompute() = default;
 };

-template <typename T>
-struct SubGradDX {
-  T operator()(T x, T y, T out, T dout) const { return dout; }
-};
-
-template <typename T>
-struct SubGradDY {
-  T operator()(T x, T y, T out, T dout) const { return -dout; }
-};
-
-#ifdef LITE_WITH_X86
-template <typename T>
-class ElementwiseSubGradCompute
-    : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
- public:
-  using param_t = operators::ElementwiseGradParam;
-
-  void Run() override {
-    auto& param = *param_.get_mutable<param_t>();
-    auto& context = ctx_->As<X86Context>();
-    CHECK(context.x86_device_context());
-    param.X_grad->template mutable_data<T>();
-    // skip out, x, y
-    auto dout = param.Out_grad->raw_tensor();
-    auto dx = param.X_grad->raw_tensor();
-    framework::Tensor* dy = nullptr;
-    if (param.Y_grad) {
-      param.Y_grad->template mutable_data<T>();
-      dy = &param.Y_grad->raw_tensor();
-    }
-    auto& skip = dout;
-    paddle::operators::ElemwiseExplicitGradCompute<platform::CPUDeviceContext,
-                                                   T,
-                                                   SubGradDX<T>,
-                                                   SubGradDY<T>>(
-        *context.x86_execution_context(),
-        skip,
-        skip,
-        skip,
-        dout,
-        param.axis,
-        &dx,
-        dy,
-        SubGradDX<T>(),
-        SubGradDY<T>());
-  }
-
-  virtual ~ElementwiseSubGradCompute() = default;
-};
-#endif
-
 template <typename T>
 class ElementwiseAddCompute
     : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
@@ -121,16 +61,11 @@ class ElementwiseAddCompute
   void Run() override {
     auto& param = *param_.get_mutable<param_t>();
     auto& context = ctx_->As<X86Context>();
-    CHECK(context.x86_device_context());
     param.Out->template mutable_data<T>();
-    paddle::operators::ElementwiseComputeEx<AddFunctor<T>,
-                                            platform::CPUDeviceContext,
-                                            T>(*context.x86_execution_context(),
-                                               &param.X->raw_tensor(),
-                                               &param.Y->raw_tensor(),
-                                               param.axis,
-                                               AddFunctor<T>(),
-                                               &param.Out->raw_tensor());
+    paddle::lite::kernels::x86::ElementwiseComputeEx<AddFunctor<T>,
+                                                     lite::TargetType::kX86,
+                                                     T>(
+        context, param.X, param.Y, param.axis, AddFunctor<T>(), param.Out);
   }

   virtual ~ElementwiseAddCompute() = default;
...
@@ -74,9 +74,9 @@ TEST(elementwise_add_x86, run_test) {
   elementwise_add.SetContext(std::move(ctx));
   elementwise_add.Run();

-  LOG(INFO) << "output: ";
+  std::vector<float> ref_results = {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
   for (int i = 0; i < out.dims().production(); i++) {
-    LOG(INFO) << out_data[i];
+    EXPECT_NEAR(out_data[i], ref_results[i], 1e-3);
   }
 }
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <glog/logging.h>

#include <algorithm>
#include <iterator>
#include <vector>

#include "lite/backends/x86/math/math_function.h"
#include "lite/fluid/eigen.h"
#include "lite/fluid/for_range.h"
#include "lite/fluid/transform.h"
#include "lite/utils/paddle_enforce.h"
#include "lite/utils/variant.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
/*
 * Out = X ⊙ Y
 * If Y's shape does not match X's shape, they will be reshaped.
 * For example:
 *   1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
 *      pre=2, n=3*4, post=5
 *      x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5)
 *   2. shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
 *      pre=2*3, n=4*5, post=1
 *      x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1)
 *
 * The new parameter *mid_flag* is added to handle the m*n*k & m*1*k
 * broadcast cases.
 *   3. shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1, 4, 5)
 *      mid_flag should not be NULL.
 *      x.shape(2, 3, 20) * y.shape(2, 1, 20).broadcast(2, 3, 20)
 */
inline void get_mid_dims(const lite::DDim &x_dims,
                         const lite::DDim &y_dims,
                         const int axis,
                         int *pre,
                         int *n,
                         int *post,
                         int *mid_flag = NULL) {
  *pre = 1;
  *n = 1;
  *post = 1;
  if (mid_flag != NULL) {
    *mid_flag = 0;
    int mid = 0;
    for (int i = 0; i < axis; ++i) {
      (*pre) *= x_dims[i];
    }
    for (int i = 0; i < y_dims.size(); ++i) {
      if (x_dims[i + axis] != y_dims[i]) {
        // only a single y_dims[i] == 1 is supported for now.
        PADDLE_ENFORCE_EQ(
            *mid_flag, 0, "Broadcast supports y_dims with a single 1.");
        PADDLE_ENFORCE_EQ(y_dims[i], 1, "Broadcast dimension mismatch.");
        // m*n*k  m*1*k
        for (int j = 0; j < i; ++j) {
          (*pre) *= y_dims[j];
        }
        *n = std::max(x_dims[i + axis], y_dims[i]);
        *mid_flag = 1;
        mid = i;
        break;
      }
      (*n) *= y_dims[i];
    }
    if (*mid_flag) {
      for (int i = mid + 1; i < x_dims.size(); ++i) {
        (*post) *= x_dims[i];
      }
    } else {
      for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
        (*post) *= x_dims[i];
      }
    }
  } else {  // for fused_elementwise_activation_op; keep the old version.
    for (int i = 0; i < axis; ++i) {
      (*pre) *= x_dims[i];
    }
    for (int i = 0; i < y_dims.size(); ++i) {
      PADDLE_ENFORCE_EQ(
          x_dims[i + axis], y_dims[i], "Broadcast dimension mismatch.");
      (*n) *= y_dims[i];
    }
    for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
      (*post) *= x_dims[i];
    }
  }
}
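To make the decomposition concrete, here is a standalone sketch of the simple (no mid_flag) branch, checked against example 1 from the comment above; mid_dims is a hypothetical helper name:

#include <cassert>
#include <vector>

// Standalone sketch of the no-mid_flag branch of get_mid_dims.
void mid_dims(const std::vector<int> &x, const std::vector<int> &y, int axis,
              int *pre, int *n, int *post) {
  *pre = *n = *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= x[i];
  for (size_t i = 0; i < y.size(); ++i) {
    assert(x[i + axis] == y[i]);  // dimensions must match along the middle
    *n *= y[i];
  }
  for (size_t i = axis + y.size(); i < x.size(); ++i) *post *= x[i];
}

int main() {
  int pre, n, post;
  // shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), axis = 1
  mid_dims({2, 3, 4, 5}, {3, 4}, 1, &pre, &n, &post);
  assert(pre == 2 && n == 12 && post == 5);  // matches example 1
}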
inline lite::DDim trim_trailing_singular_dims(const lite::DDim &dims) {
  // Remove trailing dimensions of size 1 for y.
  auto actual_dims_size = dims.size();
  for (; actual_dims_size != 0; --actual_dims_size) {
    if (dims[actual_dims_size - 1] != 1) break;
  }

  std::vector<int64_t> trim_dims;
  trim_dims.resize(actual_dims_size);
  for (int i = 0; i < actual_dims_size; ++i) {
    trim_dims[i] = dims[i];
  }
  if (trim_dims.size() == 0) {
    return lite::DDim();
  }
  return lite::DDim(trim_dims);
}
template <typename T, lite::TargetType Target>
class RowwiseTransformIterator;

template <typename T, lite::TargetType Target>
class MidWiseTransformIterator;

// NOTE(dzhwinter): ptrdiff_t in std::iterator is deprecated in C++17.
template <typename T>
class RowwiseTransformIterator<T, lite::TargetType::kX86>
    : public std::iterator<std::random_access_iterator_tag,
                           T,
                           std::ptrdiff_t,
                           T *,
                           T &> {
 public:
  RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {}

  RowwiseTransformIterator<T, lite::TargetType::kX86> &operator++() {
    ++i_;
    if (UNLIKELY(i_ == n_)) {
      i_ = 0;
    }
    return *this;
  }

  RowwiseTransformIterator<T, lite::TargetType::kX86> &operator+(int n) {
    while (n-- > 0) {
      ++i_;
      if (UNLIKELY(i_ == n_)) {
        i_ = 0;
      }
    }
    return *this;
  }

  bool operator==(
      const RowwiseTransformIterator<T, lite::TargetType::kX86> &rhs) const {
    return (ptr_ + i_) == &(*rhs);
  }

  bool operator!=(
      const RowwiseTransformIterator<T, lite::TargetType::kX86> &rhs) const {
    return (ptr_ + i_) != &(*rhs);
  }

  const T &operator*() { return ptr_[i_]; }

 private:
  const T *ptr_;
  int i_;
  int64_t n_;
};
template <typename T>
class MidWiseTransformIterator<T, lite::TargetType::kX86>
    : public std::iterator<std::random_access_iterator_tag,
                           T,
                           std::ptrdiff_t,
                           T *,
                           T &> {
 public:
  MidWiseTransformIterator(const T *ptr, int n, int post)
      : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}

  MidWiseTransformIterator<T, lite::TargetType::kX86> &operator++() {
    ++j_;
    if (UNLIKELY(j_ == post_)) {
      ++i_;
      j_ = 0;
      if (UNLIKELY(i_ == n_)) {
        i_ = 0;
      }
    }
    return *this;
  }

  MidWiseTransformIterator<T, lite::TargetType::kX86> &operator+(int n) {
    while (n-- > 0) {
      ++j_;
      if (UNLIKELY(j_ == post_)) {
        ++i_;
        j_ = 0;
        if (UNLIKELY(i_ == n_)) {
          i_ = 0;
        }
      }
    }
    return *this;
  }

  bool operator==(
      const MidWiseTransformIterator<T, lite::TargetType::kX86> &rhs) const {
    return (ptr_ + i_) == &(*rhs);
  }

  bool operator!=(
      const MidWiseTransformIterator<T, lite::TargetType::kX86> &rhs) const {
    return (ptr_ + i_) != &(*rhs);
  }

  const T &operator*() { return ptr_[i_]; }

 private:
  const T *ptr_;
  int64_t i_;
  int64_t j_;
  int64_t n_;
  int64_t post_;
};
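The two iterators exist so std::transform can read y as if it were already broadcast: RowwiseTransformIterator wraps its index back to 0 every n reads, and MidWiseTransformIterator holds each y element for post reads before advancing. A plain-array demonstration of the row-wise access pattern:

#include <iostream>

int main() {
  const float y[3] = {10, 20, 30};  // n = 3
  int i = 0, n = 3;
  for (int k = 0; k < 8; ++k) {
    std::cout << y[i] << " ";  // prints: 10 20 30 10 20 30 10 20
    if (++i == n) i = 0;       // the same wrap-around as operator++ above
  }
}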
template <typename Functor,
          typename T,
          lite::TargetType Target,
          typename OutType = T>
class TransformFunctor {
 public:
  TransformFunctor(const lite::Tensor *x,
                   const lite::Tensor *y,
                   lite::Tensor *z,
                   const lite::Context<Target> &ctx,
                   Functor func)
      : x_(x->data<T>()),
        y_(y->data<T>()),
        z_(z->mutable_data<OutType>()),
        nx_(x->numel()),
        ctx_(ctx),
        func_(func) {}

  inline void Run() const {
    lite::fluid::Transform<Target> trans;
    trans(ctx_, x_, x_ + nx_, y_, z_, func_);
  }

  inline void RunRowWise(int n, int pre) const {
    lite::fluid::Transform<Target> trans;
    trans(ctx_,
          x_,
          x_ + nx_,
          RowwiseTransformIterator<T, Target>(y_, n),
          z_,
          func_);
  }

  inline void RunMidWise(int n, int pre, int post) const {
    lite::fluid::Transform<Target> trans;
    trans(ctx_,
          x_,
          x_ + nx_,
          MidWiseTransformIterator<T, Target>(y_, n, post),
          z_,
          func_);
  }

  inline void RunMidRowWise(int n, int pre, int post) const {
    lite::fluid::Transform<Target> trans;
    for (int i = 0; i < pre; i++) {
      trans(ctx_,
            x_ + i * n * post,
            x_ + (i + 1) * n * post,
            RowwiseTransformIterator<T, Target>(y_ + i * post, post),
            z_ + i * n * post,
            func_);
    }
  }

 private:
  const T *x_;
  const T *y_;
  OutType *z_;
  int64_t nx_;
  const lite::Context<Target> &ctx_;
  Functor func_;
};
template <typename Functor,
          lite::TargetType Target,
          typename T,
          typename OutType = T>
void ElementwiseComputeEx(const lite::Context<Target> &ctx,
                          const lite::Tensor *x,
                          const lite::Tensor *y,
                          int axis,
                          Functor func,
                          lite::Tensor *z) {
  TransformFunctor<Functor, T, Target, OutType> functor(x, y, z, ctx, func);
  auto x_dims = x->dims();
  auto y_dims_untrimed = y->dims();
  PADDLE_ENFORCE_GE(x_dims.size(),
                    y_dims_untrimed.size(),
                    "Rank of first input must >= rank of second input.");

  if (x_dims == y_dims_untrimed) {
    functor.Run();
    return;
  }

  axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                 "Axis should be in range [0, x_dims)");
  auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
  axis = (y_dims.size() == 0) ? x_dims.size() : axis;

  int pre, n, post, mid_flag = 0;
  get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, &mid_flag);
  if (mid_flag) {  // the m*n*k & m*1*k case
    functor.RunMidRowWise(n, pre, post);
    return;
  }
  if (post == 1) {
    functor.RunRowWise(n, pre);
    return;
  } else {
    functor.RunMidWise(n, pre, post);
    return;
  }
}
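Putting the pieces together, here is a standalone sketch of what the RunMidWise path computes (a hypothetical broadcast_add helper, assuming the pre/n/post decomposition has already been done):

#include <cassert>
#include <vector>

// Broadcast add of y (length n) into x (shape pre x n x post, flattened):
// the same arithmetic the MidWiseTransformIterator path performs.
std::vector<float> broadcast_add(const std::vector<float> &x,
                                 const std::vector<float> &y,
                                 int pre, int n, int post) {
  std::vector<float> z(x.size());
  for (int i = 0; i < pre; ++i)
    for (int j = 0; j < n; ++j)
      for (int k = 0; k < post; ++k) {
        int off = (i * n + j) * post + k;
        z[off] = x[off] + y[j];  // y's index depends only on the middle axis
      }
  return z;
}

int main() {
  // x: shape (1, 2, 2); y: shape (2,), broadcast along the middle axis.
  auto z = broadcast_add({1, 2, 3, 4}, {10, 20}, 1, 2, 2);
  assert((z == std::vector<float>{11, 12, 23, 24}));
}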
// FusedElemwiseAndAct
// --- forward
template <typename T, typename CompoundFunctor, bool KeepIntermediateOut>
struct FusedElemwiseAndActNoBroadcast {
  HOSTDEVICE void operator()(size_t i) {
    T y_val = y_[i];
    T x_val = x_[i];
    if (KeepIntermediateOut) {
      T intermediate_out = compound_functor_.GetIntermediateOut(x_val, y_val);
      intermediate_out_[i] = intermediate_out;
      out_[i] =
          compound_functor_.GetOutUseIntermediateOut(x_val, intermediate_out);
    } else {
      out_[i] = compound_functor_.GetOut(x_val, y_val);
    }
  }

  const T *x_;
  const T *y_;
  CompoundFunctor compound_functor_;
  T *out_;
  T *intermediate_out_;
};
// FusedElemwiseAndActBroadcast1:
// In this case, X and Y can be reshaped to a matrix.
// For example, shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) and axis = -1 or 2:
// X can be reshaped to (6, 20) and Y can be reshaped to (1, 20).
template <typename T,
          typename CompoundFunctor,
          bool BcastY,
          bool KeepIntermediateOut,
          bool SameShapeOfIntermediateOutAndOut>
static void FusedElemwiseAndActBroadcast1CPU(const T *x,
                                             const T *y,
                                             CompoundFunctor compound_functor,
                                             int h,
                                             int w,
                                             T *out,
                                             T *intermediate_out) {
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      int offset = i * w + j;

      T y_val = BcastY ? y[j] : y[offset];
      T x_val = BcastY ? x[offset] : x[j];
      int64_t intermediate_out_offset;
      if (KeepIntermediateOut) {
        T intermediate_val = compound_functor.GetIntermediateOut(x_val, y_val);

        if (SameShapeOfIntermediateOutAndOut) {
          // for the case of f1(f2(x, y))
          intermediate_out_offset = offset;
        } else if (BcastY) {
          intermediate_out_offset = j;
        } else {
          intermediate_out_offset = offset;
        }

        intermediate_out[intermediate_out_offset] = intermediate_val;
        out[offset] =
            compound_functor.GetOutUseIntermediateOut(x_val, intermediate_val);
      } else {
        out[offset] = compound_functor.GetOut(x_val, y_val);
      }
    }
  }
}
// FusedElemwiseAndActBroadcast2:
// In this case, X and Y can be reshaped to a 3-D tensor.
// For example, shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4) and axis = 1:
// X can be reshaped to (2, 12, 5) and Y can be reshaped to (1, 12, 1);
// pre = 2, n = 12, post = 5.
template <typename T,
          typename CompoundFunctor,
          bool BcastY,
          bool KeepIntermediateOut,
          bool SameShapeOfIntermediateOutAndOut>
static void FusedElemwiseAndActBroadcast2CPU(const T *x,
                                             const T *y,
                                             int pre,
                                             int n,
                                             int post,
                                             CompoundFunctor compound_functor,
                                             T *out,
                                             T *intermediate_out) {
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < n; ++j) {
      for (int k = 0; k < post; ++k) {
        int offset = i * n * post + j * post + k;

        T y_val = BcastY ? y[j] : y[offset];
        T x_val = BcastY ? x[offset] : x[j];
        int64_t intermediate_out_offset;

        if (KeepIntermediateOut) {
          T intermediate_val =
              compound_functor.GetIntermediateOut(x_val, y_val);

          if (SameShapeOfIntermediateOutAndOut) {
            // for the case of f1(f2(x, y))
            intermediate_out_offset = offset;
          } else if (BcastY) {
            intermediate_out_offset = j;
          } else {
            intermediate_out_offset = offset;
          }

          intermediate_out[intermediate_out_offset] = intermediate_val;
          out[offset] = compound_functor.GetOutUseIntermediateOut(
              x_val, intermediate_val);
        } else {
          out[offset] = compound_functor.GetOut(x_val, y_val);
        }
      }
    }
  }
}
template <lite::TargetType Target,
          typename T,
          typename CompoundFunctor,
          bool KeepIntermediateOut>
void FusedElemwiseAndActComputeNoBroadcast(const lite::Context<Target> &ctx,
                                           const lite::DDim &x_dim,
                                           const lite::Tensor &x,
                                           const lite::Tensor &y,
                                           CompoundFunctor compound_functor,
                                           lite::Tensor *out,
                                           lite::Tensor *intermediate_out) {
  size_t N = static_cast<size_t>(x_dim.production());

  lite::fluid::ForRange<Target> for_range(ctx, N);

  for_range(
      FusedElemwiseAndActNoBroadcast<T, CompoundFunctor, KeepIntermediateOut>{
          x.data<T>(),
          y.data<T>(),
          compound_functor,
          out->mutable_data<T>(),
          intermediate_out == nullptr ? nullptr
                                      : intermediate_out->mutable_data<T>()});
}
template <lite::TargetType Target,
          typename T,
          typename CompoundFunctor,
          bool BcastY,
          bool KeepIntermediateOut,
          bool SameShapeOfIntermediateOutAndOut>
void FusedElemwiseAndActComputeWithBroadcast(const lite::Context<Target> &ctx,
                                             const lite::DDim &x_dim,
                                             const lite::DDim &y_dim_untrimed,
                                             const lite::Tensor &x,
                                             const lite::Tensor &y,
                                             CompoundFunctor compound_functor,
                                             int axis,
                                             lite::Tensor *out,
                                             lite::Tensor *intermediate_out) {
  axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis);
  auto y_dim = trim_trailing_singular_dims(y_dim_untrimed);
  axis = (y_dim.size() == 0) ? x_dim.size() : axis;

  int pre, n, post;
  get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);

  if (post == 1) {
    int h = pre;
    int w = n;
    FusedElemwiseAndActBroadcast1CPU<T,
                                     CompoundFunctor,
                                     BcastY,
                                     KeepIntermediateOut,
                                     SameShapeOfIntermediateOutAndOut>(
        x.data<T>(),
        y.data<T>(),
        compound_functor,
        h,
        w,
        out->mutable_data<T>(),
        intermediate_out == nullptr ? nullptr
                                    : intermediate_out->mutable_data<T>());
  } else {
    FusedElemwiseAndActBroadcast2CPU<T,
                                     CompoundFunctor,
                                     BcastY,
                                     KeepIntermediateOut,
                                     SameShapeOfIntermediateOutAndOut>(
        x.data<T>(),
        y.data<T>(),
        pre,
        n,
        post,
        compound_functor,
        out->mutable_data<T>(),
        intermediate_out == nullptr ? nullptr
                                    : intermediate_out->mutable_data<T>());
  }
}
template <lite::TargetType Target,
          typename T,
          typename CompoundFunctor,
          bool KeepIntermediateOut,
          bool SameShapeOfIntermediateOutAndOut>
void FusedElemwiseAndActComputeEx(const lite::Context<Target> &ctx,
                                  const lite::Tensor &x,
                                  const lite::Tensor &y,
                                  int axis,
                                  CompoundFunctor compound_functor,
                                  lite::Tensor *out,
                                  lite::Tensor *intermediate_out) {
  if (KeepIntermediateOut) {
    PADDLE_ENFORCE(intermediate_out,
                   "When save_intermediate_out is enabled, "
                   "intermediate_out must not be nullptr.");
  }

  const lite::DDim &x_dim = x.dims();
  const lite::DDim &y_dim = y.dims();
  if (x.dims() == y.dims()) {
    FusedElemwiseAndActComputeNoBroadcast<Target,
                                          T,
                                          CompoundFunctor,
                                          KeepIntermediateOut>(
        ctx, x_dim, x, y, compound_functor, out, intermediate_out);
  } else {
    // Whether the shape of Y is a continuous subsequence of X;
    // for more information please refer to the op's introduction.
    bool bcast_y = x.dims().size() >= y.dims().size();
    if (x.dims().size() == y.dims().size()) {
      for (int i = 0; i < x.dims().size(); ++i) {
        if (x.dims()[i] < y.dims()[i]) {
          bcast_y = false;
          break;
        }
      }
    }

    // z = f1(x, f2(y))
    // z = f1(f2(x, y))
    if (bcast_y) {  // Y should be broadcast.
      // In this case:
      // for 'f2(y)', the shape of intermediate_out should equal the shape
      // of Y; for 'f2(x, y)', the shape of intermediate_out should equal the
      // shape of Out; the shape of Out should equal the shape of X.
      FusedElemwiseAndActComputeWithBroadcast<Target,
                                              T,
                                              CompoundFunctor,
                                              true /*BcastY*/,
                                              KeepIntermediateOut,
                                              SameShapeOfIntermediateOutAndOut>(
          ctx,
          x_dim /*OutShape*/,
          y_dim,
          x,
          y,
          compound_functor,
          axis,
          out,
          intermediate_out);
    } else {
      // In this case:
      // for 'f2(y)', the shape of intermediate_out should equal the shape
      // of Out; for 'f2(x, y)', the shape of intermediate_out should equal
      // the shape of Out; the shape of Out should equal the shape of Y.
      FusedElemwiseAndActComputeWithBroadcast<Target,
                                              T,
                                              CompoundFunctor,
                                              false /*BcastY*/,
                                              KeepIntermediateOut,
                                              SameShapeOfIntermediateOutAndOut>(
          ctx,
          y_dim /*OutShape*/,
          x_dim,
          x,
          y,
          compound_functor,
          axis,
          out,
          intermediate_out);
    }
  }
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
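The fused helpers above assume CompoundFunctor exposes three methods: GetIntermediateOut (f2), GetOutUseIntermediateOut (f1 applied to a saved f2 result), and GetOut (the fully fused f1(f2(x, y))). A hypothetical functor for out = relu(x + y), sketched from those call sites:

#include <algorithm>
#include <cassert>

// Hypothetical compound functor for out = relu(x + y), i.e. f1(f2(x, y)).
template <typename T>
struct AddThenReluFunctor {
  // f2: the intermediate result that may be written to intermediate_out.
  T GetIntermediateOut(T x, T y) const { return x + y; }
  // f1 applied to a previously computed intermediate value.
  T GetOutUseIntermediateOut(T /*x*/, T mid) const {
    return std::max(mid, static_cast<T>(0));
  }
  // The fused path when the intermediate value is not kept.
  T GetOut(T x, T y) const { return std::max(x + y, static_cast<T>(0)); }
};

int main() {
  AddThenReluFunctor<float> f;
  assert(f.GetOut(2.f, -5.f) == 0.f);  // relu(2 - 5) = 0
  assert(f.GetOutUseIntermediateOut(0.f, f.GetIntermediateOut(1.f, 2.f)) ==
         3.f);
}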
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

// Because Boost 1.41.0's variadic templates have a bug on nvcc, Boost
// will disable variadic template support in NVCC mode. Define
// BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate the same
// function symbols. For details, see
// https://github.com/PaddlePaddle/Paddle/issues/3386

// some platform-independent definitions
#if defined(_WIN32)
#define UNUSED
#define __builtin_expect(EXP, C) (EXP)
#else
#define UNUSED __attribute__((unused))
#endif

#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// there is no equivalent intrinsic in MSVC.
#define UNLIKELY(condition) (condition)
#endif

#if !defined(_WIN32)
#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
#else
// there is no equivalent intrinsic in MSVC.
#define LIKELY(condition) (condition)
#endif
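For reference, a minimal standalone use of the UNLIKELY pattern (GCC/Clang branch only; on MSVC the macro degrades to the bare condition):

#include <cstdio>

// Same definition as the non-Windows branch above.
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)

int checked_double(int v) {
  if (UNLIKELY(v < 0)) {  // hint: the error path is cold
    std::fprintf(stderr, "negative input\n");
    return -1;
  }
  return v * 2;
}

int main() { return checked_double(21) == 42 ? 0 : 1; }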