Unverified commit dcda2023, authored by Yiqun Liu, committed via GitHub

Optimize the elementwise op using eigen (#15494)

* Optimize the elementwise op with CUDA kernels.
test=develop

* Support setting of attr in op config file.
test=develop

* Add support for setting dtype and initializer in config.
test=develop

* Save workspace.

* Add initializer "zeros".
test=develop

* Fix compiling error.

* Support using an existing file to initialize a tensor in op_tester.

* Use eigen to optimize the elementwise_add/mul for the case that x and y have the same dims.
test=develop
Parent: 4624d7c6
@@ -257,7 +257,8 @@ framework::VarDesc *OpTester::Var(const std::string &name) {
 template <typename T>
 void OpTester::SetupTensor(framework::LoDTensor *tensor,
                            const std::vector<int64_t> &shape, T lower, T upper,
-                           const std::string &initializer) {
+                           const std::string &initializer,
+                           const std::string &filename) {
   static unsigned int seed = 100;
   std::mt19937 rng(seed++);
   std::uniform_real_distribution<double> uniform_dist(0, 1);
@@ -280,12 +281,20 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor,
     }
   } else if (initializer == "natural") {
     for (int i = 0; i < cpu_tensor.numel(); ++i) {
-      cpu_ptr[i] = lower + i;
+      cpu_ptr[i] = static_cast<T>(lower + i);
     }
   } else if (initializer == "zeros") {
     for (int i = 0; i < cpu_tensor.numel(); ++i) {
-      cpu_ptr[i] = 0;
+      cpu_ptr[i] = static_cast<T>(0);
     }
+  } else if (initializer == "file") {
+    std::ifstream is(filename);
+    for (size_t i = 0; i < cpu_tensor.numel(); ++i) {
+      T value;
+      is >> value;
+      cpu_ptr[i] = static_cast<T>(value);
+    }
+    is.close();
   } else {
     PADDLE_THROW("Unsupported initializer %s.", initializer.c_str());
   }
@@ -325,15 +334,19 @@ void OpTester::CreateVariables(framework::Scope *scope) {
     auto *tensor = var->GetMutable<framework::LoDTensor>();
     const auto &data_type = var_desc->GetDataType();
     if (data_type == framework::proto::VarType::INT32) {
-      SetupTensor<int>(tensor, shape, 0, 1, item.second.initializer);
+      SetupTensor<int>(tensor, shape, 0, 1, item.second.initializer,
+                       item.second.filename);
     } else if (data_type == framework::proto::VarType::INT64) {
-      SetupTensor<int64_t>(tensor, shape, 0, 1, item.second.initializer);
+      SetupTensor<int64_t>(tensor, shape, 0, 1, item.second.initializer,
+                           item.second.filename);
     } else if (data_type == framework::proto::VarType::FP32) {
       SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
-                         static_cast<float>(1.0), item.second.initializer);
+                         static_cast<float>(1.0), item.second.initializer,
+                         item.second.filename);
     } else if (data_type == framework::proto::VarType::FP64) {
       SetupTensor<double>(tensor, shape, static_cast<double>(0.0),
-                          static_cast<double>(1.0), item.second.initializer);
+                          static_cast<double>(1.0), item.second.initializer,
+                          item.second.filename);
     } else {
       PADDLE_THROW("Unsupported dtype %d.", data_type);
     }
......
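Note on the new "file" initializer above: the branch reads one value per tensor element with operator>>, so any plain text file holding at least numel() whitespace-separated values works. Below is a minimal, hypothetical sketch of producing such a file; the filename input_x.txt and the values are made up for illustration and are not part of this change.

#include <fstream>

int main() {
  // Write six values for a hypothetical 2x3 fp32 input; SetupTensor's
  // "file" initializer reads them back one by one with `is >> value`.
  std::ofstream os("input_x.txt");
  for (int i = 0; i < 6; ++i) {
    os << 0.1f * static_cast<float>(i) << " ";
  }
  return 0;
}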
@@ -55,7 +55,7 @@ class OpTester {
   template <typename T>
   void SetupTensor(framework::LoDTensor *input,
                    const std::vector<int64_t> &shape, T lower, T upper,
-                   const std::string &initializer);
+                   const std::string &initializer, const std::string &filename);
   void RunImpl();
......
@@ -56,6 +56,9 @@ OpInputConfig::OpInputConfig(std::istream& is) {
       ParseDims(is);
     } else if (sep == "lod" || sep == "lod:") {
       ParseLoD(is);
+    } else if (sep == "filename") {
+      is >> filename;
+      EraseEndSep(&filename);
     }
   }
 }
@@ -86,7 +89,7 @@ void OpInputConfig::ParseInitializer(std::istream& is) {
   EraseEndSep(&initializer_str);

   const std::vector<std::string> supported_initializers = {"random", "natural",
-                                                           "zeros"};
+                                                           "zeros", "file"};
   if (!Has(supported_initializers, initializer_str)) {
     PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str());
   }
......
@@ -35,7 +35,8 @@ struct OpInputConfig {
   std::string name;
   std::string dtype{"fp32"};  // int32/int, int64/long, fp32/float, fp64/double
-  std::string initializer{"random"};  // random, natural
+  std::string initializer{"random"};  // random, natural, zeros, file
+  std::string filename{""};
   std::vector<int64_t> dims;
   std::vector<std::vector<size_t>> lod;
 };
......
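For context, OpInputConfig parses the config stream token by token (each key and its value are read with operator>>, and trailing separators are stripped by EraseEndSep), so an input description would presumably enable the new initializer with two extra whitespace-separated entries along these lines. The surrounding config syntax is not shown in this diff, and the path below is only a placeholder:

initializer file
filename input_x.txt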
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 #include "paddle/fluid/platform/float16.h"
......
@@ -40,25 +40,26 @@ template <typename DeviceContext, typename T>
 typename std::enable_if<
     std::is_floating_point<T>::value &&
     std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add(const framework::ExecutionContext &ctx,
-                const framework::Tensor *x, const framework::Tensor *y,
-                framework::Tensor *z) {
-  auto eigen_x = framework::EigenVector<T>::Flatten(*x);
-  auto eigen_y = framework::EigenVector<T>::Flatten(*y);
-  auto eigen_z = framework::EigenVector<T>::Flatten(*z);
-
+elementwise_add_same_dims(const framework::ExecutionContext &ctx,
+                          const framework::Tensor *x,
+                          const framework::Tensor *y, framework::Tensor *z) {
   auto blas = math::GetBlas<DeviceContext, T>(ctx);
-  blas.VADD(x->numel(), eigen_x.data(), eigen_y.data(), eigen_z.data());
+  blas.VADD(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
 }

 template <typename DeviceContext, typename T>
 typename std::enable_if<
     !std::is_floating_point<T>::value ||
     !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add(const framework::ExecutionContext &ctx,
-                const framework::Tensor *x, const framework::Tensor *y,
-                framework::Tensor *z) {
-  default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
+elementwise_add_same_dims(const framework::ExecutionContext &ctx,
+                          const framework::Tensor *x,
+                          const framework::Tensor *y, framework::Tensor *z) {
+  auto eigen_x = framework::EigenVector<T>::Flatten(*x);
+  auto eigen_y = framework::EigenVector<T>::Flatten(*y);
+  auto eigen_z = framework::EigenVector<T>::Flatten(*z);
+
+  auto &place = *ctx.template device_context<DeviceContext>().eigen_device();
+  eigen_z.device(place) = eigen_x + eigen_y;
 }

 template <typename DeviceContext, typename T>
@@ -73,7 +74,7 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
     auto dims_equal = x->dims() == y->dims();
     if (dims_equal) {
-      elementwise_add<DeviceContext, T>(ctx, x, y, z);
+      elementwise_add_same_dims<DeviceContext, T>(ctx, x, y, z);
     } else {
       default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
     }
......
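The two elementwise_add_same_dims overloads above are mutually exclusive by construction: the enable_if conditions are exact negations of each other, so for any (DeviceContext, T) pair exactly one overload participates in overload resolution. Below is a self-contained sketch of this dispatch pattern; the stand-in context types and the printed strings are invented for illustration and are not Paddle code.

#include <iostream>
#include <type_traits>

struct CPUDeviceContext {};   // stand-ins for the Paddle context types
struct CUDADeviceContext {};

// Chosen only for floating-point T on the CPU context (the BLAS VADD path).
template <typename DeviceContext, typename T>
typename std::enable_if<
    std::is_floating_point<T>::value &&
    std::is_same<DeviceContext, CPUDeviceContext>::value>::type
elementwise_add_same_dims() {
  std::cout << "BLAS VADD path\n";
}

// Chosen for every other combination (the Eigen expression path).
template <typename DeviceContext, typename T>
typename std::enable_if<
    !std::is_floating_point<T>::value ||
    !std::is_same<DeviceContext, CPUDeviceContext>::value>::type
elementwise_add_same_dims() {
  std::cout << "Eigen expression path\n";
}

int main() {
  elementwise_add_same_dims<CPUDeviceContext, float>();   // BLAS VADD path
  elementwise_add_same_dims<CPUDeviceContext, int>();     // Eigen expression path
  elementwise_add_same_dims<CUDADeviceContext, float>();  // Eigen expression path
  return 0;
}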
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
 #include "paddle/fluid/platform/float16.h"
......
@@ -38,22 +38,26 @@ template <typename DeviceContext, typename T>
 typename std::enable_if<
     std::is_floating_point<T>::value &&
     std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_mul(const framework::ExecutionContext& ctx,
-                const framework::Tensor* x, const framework::Tensor* y,
-                framework::Tensor* z) {
+elementwise_mul_same_dims(const framework::ExecutionContext& ctx,
+                          const framework::Tensor* x,
+                          const framework::Tensor* y, framework::Tensor* z) {
   auto blas = math::GetBlas<DeviceContext, T>(ctx);
-  blas.VMUL(x->numel(), x->data<T>(), y->data<T>(),
-            z->mutable_data<T>(ctx.GetPlace()));
+  blas.VMUL(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
 }

 template <typename DeviceContext, typename T>
 typename std::enable_if<
     !std::is_floating_point<T>::value ||
     !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_mul(const framework::ExecutionContext& ctx,
-                const framework::Tensor* x, const framework::Tensor* y,
-                framework::Tensor* z) {
-  default_elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+elementwise_mul_same_dims(const framework::ExecutionContext& ctx,
+                          const framework::Tensor* x,
+                          const framework::Tensor* y, framework::Tensor* z) {
+  auto eigen_x = framework::EigenVector<T>::Flatten(*x);
+  auto eigen_y = framework::EigenVector<T>::Flatten(*y);
+  auto eigen_z = framework::EigenVector<T>::Flatten(*z);
+
+  auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+  eigen_z.device(place) = eigen_x * eigen_y;
 }

 template <typename DeviceContext, typename T>
@@ -88,7 +92,7 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
     z->mutable_data<T>(ctx.GetPlace());
     if (x.numel() == y->numel()) {
-      elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
+      elementwise_mul_same_dims<DeviceContext, T>(ctx, &x, y, z);
     } else {
       default_elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
     }
......
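On the non-BLAS path, the same-dims kernels simply flatten both inputs and evaluate an elementwise Eigen expression on the context's Eigen device. Below is a minimal standalone illustration with plain Eigen (not Paddle; it assumes Eigen's unsupported CXX11 Tensor module is available) of what eigen_z.device(place) = eigen_x * eigen_y computes.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 1> x(4), y(4), z(4);
  x.setValues({1.f, 2.f, 3.f, 4.f});
  y.setValues({10.f, 20.f, 30.f, 40.f});

  Eigen::DefaultDevice place;  // CPU here; Paddle passes its own eigen_device()
  z.device(place) = x * y;     // elementwise multiply over the flattened data
  std::cout << z << std::endl; // 10 40 90 160
  return 0;
}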