Unverified commit 8015fbd6, authored by YuanRisheng, committed by GitHub

[Cherry-pick]Move sum op to PHI && Fix MetaTensor's bug when run infermeta (#49342)

* cherry-pick 45860

* [BUG FIX]Fix MetaTensor's bug when run infermeta (#46265)

* fix sum bug

* fix ci bugs

* fix ci bugs

* update code according to review comments
Parent 8aa5be90
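Before the diff: a minimal, self-contained C++ sketch of the dispatch pattern this cherry-pick moves toward. All types and names below are illustrative stand-ins, not Paddle's real classes; the actual kernel is phi::AddNKernel over phi::TensorBase inputs, shown later in the diff.

// Sketch (not Paddle code) of the pattern this PR adopts: add_n takes
// heterogeneous TensorBase* inputs and branches on the concrete type,
// instead of accepting only DenseTensor*.
#include <iostream>
#include <vector>

struct TensorBase {
  virtual ~TensorBase() = default;
  virtual const char* type_name() const = 0;
};

struct DenseTensor : TensorBase {
  const char* type_name() const override { return "DenseTensor"; }
  static bool classof(const TensorBase* t) {
    return dynamic_cast<const DenseTensor*>(t) != nullptr;
  }
  double value = 0.0;  // stand-in for real tensor storage
};

struct SelectedRows : TensorBase {
  const char* type_name() const override { return "SelectedRows"; }
  static bool classof(const TensorBase* t) {
    return dynamic_cast<const SelectedRows*>(t) != nullptr;
  }
  double value = 0.0;
};

// Mirrors the shape of the moved kernel: one loop, per-input type dispatch.
double AddN(const std::vector<const TensorBase*>& x) {
  double out = 0.0;
  for (const TensorBase* t : x) {
    if (DenseTensor::classof(t)) {
      out += static_cast<const DenseTensor*>(t)->value;
    } else if (SelectedRows::classof(t)) {
      out += static_cast<const SelectedRows*>(t)->value;
    } else {
      std::cerr << "unsupported input type: " << t->type_name() << "\n";
    }
  }
  return out;
}

int main() {
  DenseTensor a;  a.value = 1.0;
  SelectedRows b; b.value = 2.0;
  std::cout << AddN({&a, &b}) << "\n";  // prints 3
}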
@@ -87,6 +87,15 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext {
    });
  }
bool IsSelectedRowsInputs(const std::string& name) const override {
auto var_types = ctx_.GetInputsVarType(name);
return std::all_of(var_types.begin(),
var_types.end(),
[](const proto::VarType::Type& type) {
return type == proto::VarType::SELECTED_ROWS;
});
}
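The new plural IsSelectedRowsInputs checks every variable bound to a multi-input slot, while the singular form below inspects only one variable. A small sketch of that contract, with hypothetical stand-in types; note the std::all_of semantics, where an empty slot is vacuously true:

#include <algorithm>
#include <cassert>
#include <vector>

enum class VarType { kDenseTensor, kSelectedRows };

// True only when *every* variable in the slot is a SelectedRows.
bool IsSelectedRowsInputs(const std::vector<VarType>& var_types) {
  return std::all_of(var_types.begin(), var_types.end(), [](VarType t) {
    return t == VarType::kSelectedRows;
  });
}

int main() {
  assert(IsSelectedRowsInputs({VarType::kSelectedRows, VarType::kSelectedRows}));
  assert(!IsSelectedRowsInputs({VarType::kSelectedRows, VarType::kDenseTensor}));
}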
  bool IsSelectedRowsInput(const std::string& name) const override {
    auto var_type = ctx_.GetInputVarType(name);
    return var_type == proto::VarType::SELECTED_ROWS;
@@ -155,6 +164,16 @@ int64_t CompatMetaTensor::numel() const {
  }
}
bool CompatMetaTensor::is_selected_rows() const {
if (is_runtime_) {
auto* var = PADDLE_GET_CONST(Variable*, var_);
return var->IsType<phi::SelectedRows>();
} else {
auto* var = PADDLE_GET_CONST(VarDesc*, var_);
return var->GetType() == proto::VarType::SELECTED_ROWS;
}
}
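CompatMetaTensor::is_selected_rows above answers the same question in two phases: at runtime from a concrete Variable, and at graph-construction time from a VarDesc type tag. A self-contained sketch of that two-phase pattern, using invented stand-in types rather than Paddle's:

#include <cassert>
#include <variant>

enum class VarType { kDenseTensor, kSelectedRows };
struct Variable { VarType held; };  // runtime object, stand-in
struct VarDesc  { VarType type; };  // static description, stand-in

struct CompatMetaTensorSketch {
  std::variant<Variable*, VarDesc*> var;

  bool is_selected_rows() const {
    if (auto pv = std::get_if<Variable*>(&var)) {
      return (*pv)->held == VarType::kSelectedRows;  // runtime branch
    }
    // compile-time branch: only the declared type is available
    return std::get<VarDesc*>(var)->type == VarType::kSelectedRows;
  }
};

int main() {
  Variable v{VarType::kSelectedRows};
  VarDesc d{VarType::kDenseTensor};
  assert(CompatMetaTensorSketch{&v}.is_selected_rows());
  assert(!CompatMetaTensorSketch{&d}.is_selected_rows());
}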
bool CompatMetaTensor::is_dense() const {
  if (is_runtime_) {
    auto* var = PADDLE_GET_CONST(Variable*, var_);
@@ -182,7 +201,7 @@ DDim CompatMetaTensor::dims() const {
    if (var->IsType<phi::DenseTensor>()) {
      return var->Get<phi::DenseTensor>().dims();
    } else if (var->IsType<phi::SelectedRows>()) {
-      return var->Get<phi::SelectedRows>().dims();
+      return var->Get<phi::SelectedRows>().GetCompleteDims();
    } else if (var->IsType<phi::SparseCooTensor>()) {
      return var->Get<phi::SparseCooTensor>().dims();
    } else if (var->IsType<framework::LoDTensorArray>()) {
@@ -260,8 +279,7 @@ void CompatMetaTensor::set_dims(const DDim& dims) {
    auto* tensor = var->GetMutable<phi::DenseTensor>();
    phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
  } else if (var->IsType<phi::SelectedRows>()) {
-    auto* tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
-    phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
+    var->GetMutable<phi::SelectedRows>()->set_height(dims[0]);
  } else if (var->IsType<phi::SparseCooTensor>()) {
    auto* tensor = var->GetMutable<phi::SparseCooTensor>();
    phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
...
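The two changes above are two sides of the same fix: a SelectedRows' value tensor stores only the touched rows, so the logical shape must come from GetCompleteDims() (height in dim 0), and conversely set_dims only needs to record dims[0] as the height. A sketch of that relationship, assuming GetCompleteDims substitutes height() into dim 0 of the value dims (illustrative types, not phi's):

#include <cstdint>
#include <iostream>
#include <vector>

struct SelectedRowsSketch {
  int64_t height;                    // rows of the full, logical tensor
  std::vector<int64_t> rows;         // indices of the stored rows
  std::vector<int64_t> value_dims;   // dims of the in-memory value tensor

  std::vector<int64_t> complete_dims() const {
    std::vector<int64_t> d = value_dims;
    d[0] = height;  // what GetCompleteDims() is expected to return
    return d;
  }
};

int main() {
  // 100-row logical tensor, but only rows 3 and 7 are materialized.
  SelectedRowsSketch sr{/*height=*/100, /*rows=*/{3, 7}, /*value_dims=*/{2, 16}};
  for (int64_t v : sr.complete_dims()) std::cout << v << ' ';  // 100 16
  std::cout << '\n';
}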
@@ -59,6 +59,8 @@ class CompatMetaTensor : public phi::MetaTensor {
  bool initialized() const override { return initialized_; };
bool is_selected_rows() const;
  bool is_tensor_array() const;
  bool is_dense() const;
...
@@ -50,7 +50,7 @@ USE_OP_ITSELF(concat_grad);
USE_OP_ITSELF(elementwise_mul_grad);
USE_OP_ITSELF(sigmoid_grad);
USE_OP_ITSELF(tanh_grad);
-USE_OP(sum);
+USE_OP_ITSELF(sum);
USE_OP_ITSELF(slice_grad);
USE_OP_ITSELF(lookup_table_grad);
USE_OP_ITSELF(sqrt);
@@ -101,6 +101,7 @@ PD_DECLARE_KERNEL(slice_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(cross_entropy_with_softmax, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(cross_entropy_with_softmax_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sqrt, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_n, GPU, ALL_LAYOUT);
namespace paddle {
namespace framework {
...
@@ -512,6 +512,13 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext {
    });
  }
bool IsSelectedRowsInputs(const std::string& name) const override {
auto vars = ctx_.MultiInputVar(name);
return std::all_of(vars.begin(), vars.end(), [](const Variable* var) {
return var->IsType<phi::SelectedRows>();
});
}
  bool IsSelectedRowsInput(const std::string& name) const override {
    const auto* var = ctx_.InputVar(name);
    return var->IsType<phi::SelectedRows>();
...
@@ -104,6 +104,7 @@ bool PluginArgumentMappingContext::IsSelectedRowsInput(
    const std::string& name) const {
  return false;
}
bool PluginArgumentMappingContext::IsSparseCooTensorInput(
    const std::string& name) const {
  return false;
@@ -112,6 +113,11 @@ bool PluginArgumentMappingContext::IsSparseCsrTensorInput(
    const std::string& name) const {
  return false;
}
bool PluginArgumentMappingContext::IsSelectedRowsInputs(
const std::string& name) const {
return false;
}
bool PluginArgumentMappingContext::IsDenseTensorVectorInput(
    const std::string& name) const {
  return false;
...
@@ -50,6 +50,8 @@ class PluginArgumentMappingContext : public ::phi::ArgumentMappingContext {
  bool IsSparseCsrTensorInput(const std::string& name) const override;
bool IsSelectedRowsInputs(const std::string& name) const override;
  bool IsDenseTensorVectorInput(const std::string& name) const override;
  bool IsDenseTensorOutput(const std::string& name) const override;
...
@@ -24,7 +24,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace phi {
@@ -37,6 +38,9 @@ namespace operators {
using paddle::platform::MKLDNNDeviceContext;
using phi::CPUContext;
using platform::to_void_cast;
using Tensor = framework::Tensor;
using SelectedRows = phi::SelectedRows;
using LoDTensor = framework::LoDTensor;
template <typename T>
class SumMKLDNNHandler
...
@@ -9,15 +9,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/sum_op.h"
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
@@ -32,94 +34,6 @@ class SumOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "sum");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "sum");
if (ctx->IsRuntime() && ctx->GetOutputsVarType("Out")[0] ==
framework::proto::VarType::LOD_TENSOR_ARRAY) {
return; // skip runtime infershape when is tensor array;
}
auto x_var_types = ctx->GetInputsVarType("X");
auto x_dims = ctx->GetInputsDim("X");
auto N = x_dims.size();
PADDLE_ENFORCE_GT(
N,
0,
platform::errors::InvalidArgument(
"The input tensor X's dimensions of SumOp "
"should be larger than 0. But received X's dimensions %d, "
"X's shape = [%s].",
N,
&x_dims));
if (N == 1) {
VLOG(3) << "Warning: SumOp have only one input, may waste memory";
}
framework::DDim in_dim({0});
for (size_t i = 0; i < x_dims.size(); ++i) {
auto& x_dim = x_dims[i];
// x_dim.size() == 1 means the real dim of selected rows is [0]
if (x_var_types[i] == framework::proto::VarType::SELECTED_ROWS &&
x_dim.size() == 1) {
continue;
}
if (phi::product(x_dim) == 0) {
continue;
}
if (phi::product(in_dim) == 0) {
in_dim = x_dim;
} else {
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(in_dim,
x_dim,
platform::errors::InvalidArgument(
"The input tensor X of SumOp must"
" have same shape. But received X[0]'s shape = "
"[%s], X[%d]'s shape = [%s].",
in_dim,
i,
x_dim));
} else {
PADDLE_ENFORCE_EQ(
in_dim.size(),
x_dim.size(),
platform::errors::InvalidArgument(
"The input tensor X of SumOp must have same "
"dimensions. But received X[0]'s dimensions = %d, X[0]'s "
"shape = "
"[%s], X[%d]'s dimensions = %d, X[%d]'s shape = [%s].",
in_dim.size(),
in_dim,
i,
x_dim.size(),
i,
x_dim));
// if in_dim or x_dim has -1, not check equal
for (int j = 0; j < x_dim.size(); ++j) {
if (x_dim[j] == -1 || in_dim[j] == -1) {
continue;
}
PADDLE_ENFORCE_EQ(
in_dim[j],
x_dim[j],
platform::errors::InvalidArgument(
"The input tensor X of SumOp must have same shape "
"if not -1."
"But received X[0]'s shape = [%s], X[%d]'s shape = [%s].",
in_dim,
i,
x_dim));
}
}
}
}
ctx->SetOutputDim("Out", in_dim);
ctx->ShareLoD("X", /*->*/ "Out");
}
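The InferShape body removed above encodes a shape-merging rule that now lives in phi::AddNInferMeta: skip empty inputs, adopt the first non-empty shape, and at runtime require the rest to match it exactly (at compile time, -1 entries are tolerated). A standalone restatement of that rule with simplified types:

#include <cassert>
#include <cstdint>
#include <vector>

using Dim = std::vector<int64_t>;

static int64_t Product(const Dim& d) {
  int64_t p = 1;
  for (int64_t v : d) p *= v;
  return p;
}

Dim MergeAddNDims(const std::vector<Dim>& inputs, bool is_runtime) {
  Dim out{0};  // product 0, i.e. "nothing adopted yet"
  for (const Dim& d : inputs) {
    if (Product(d) == 0) continue;   // skip empty inputs
    if (Product(out) == 0) {
      out = d;                       // first non-empty input wins
    } else if (is_runtime) {
      assert(out == d);              // runtime: shapes must match exactly
    }
    // compile time: unknown (-1) entries are tolerated, as in the code above
  }
  return out;
}

int main() {
  Dim out = MergeAddNDims({{0}, {2, 3}, {2, 3}}, /*is_runtime=*/true);
  assert((out == Dim{2, 3}));
}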
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
@@ -350,18 +264,16 @@ DECLARE_INPLACE_OP_INFERER(SumInplaceInferer, {"X", "Out"});
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(sum,
AddNInferShapeFunctor,
PD_INFER_META(phi::AddNTensorArrayInferMeta));
REGISTER_OPERATOR(sum,
                  ops::SumOp,
                  ops::SumOpMaker,
                  ops::SumGradDescMaker,
                  ops::SumGradOpBaseMaker,
                  ops::SumOpVarTypeInference,
-                  ops::SumInplaceInferer);
+                  ops::SumInplaceInferer,
+                  AddNInferShapeFunctor);
REGISTER_OP_CPU_KERNEL(
sum,
ops::SumKernel<phi::CPUContext, float>,
ops::SumKernel<phi::CPUContext, double>,
ops::SumKernel<phi::CPUContext, int>,
ops::SumKernel<phi::CPUContext, paddle::platform::bfloat16>,
ops::SumKernel<phi::CPUContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/platform/device_context.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace plat = paddle::platform;
namespace paddle {
namespace operators {
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
using LoDTensor = framework::LoDTensor;
template <class T>
__global__ void Sum2CUDAKernel(const T *in_0,
const T *in_1,
T *out,
int64_t N) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
out[id] = in_0[id] + in_1[id];
id += blockDim.x * gridDim.x;
}
}
template <class T>
__global__ void SumArrayCUDAKernel(
T **in, T *out, int64_t N, size_t in_size, bool read_dst) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
T total(read_dst ? out[id] : static_cast<T>(0));
for (int i = 0; i < in_size; ++i) {
const T *tmp = in[i];
if (tmp) {
total += tmp[id];
}
}
out[id] = total;
id += blockDim.x * gridDim.x;
}
}
template <class T>
__global__ void SumSelectedRowsCUDAKernel(T **sr_in_out,
int64_t N,
size_t rows) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
for (int i = 0; i < 2 * rows; i += 2) {
const T *tmp = sr_in_out[i];
T *tmp_out = sr_in_out[i + 1];
if (tmp && tmp_out) {
tmp_out[id] += tmp[id];
}
}
id += blockDim.x * gridDim.x;
}
}
template <class T>
void SumToLoDTensor(const framework::ExecutionContext &context) {
auto in_vars = context.MultiInputVar("X");
const size_t in_num = in_vars.size();
constexpr size_t theory_sm_threads = 1024;
auto &dev_ctx = context.template device_context<phi::GPUContext>();
auto stream = dev_ctx.stream();
auto max_threads = dev_ctx.GetMaxPhysicalThreadCount();
auto sm_count = max_threads / theory_sm_threads;
size_t tile_size = 0;
dim3 grids;
dim3 blocks;
auto ComputeKernelParameter = [&](size_t length) {
if (length >= max_threads)
tile_size = 1024;
else if (length < max_threads && length > sm_count * 128)
tile_size = 512;
else if (length <= sm_count * 128)
tile_size = 256;
grids = dim3(CEIL_DIV(length, tile_size), 1, 1);
blocks = dim3(tile_size, 1, 1);
};
auto *out = context.Output<LoDTensor>("Out");
bool in_place = in_vars[0] == context.OutputVar("Out");
if (!in_place) {
auto *out_ptr = out->mutable_data<T>(context.GetPlace());
if (in_num >= 1 && in_vars[0]->IsType<framework::LoDTensor>()) {
auto &in_0_tensor = in_vars[0]->Get<framework::LoDTensor>();
if (in_0_tensor.numel() > 0) {
in_place = (in_0_tensor.data<T>() == out_ptr);
}
}
}
// Sum of two tensors
if (in_num == 2 && in_vars[0]->IsType<framework::LoDTensor>() &&
in_vars[1]->IsType<framework::LoDTensor>()) {
auto &in_0 = in_vars[0]->Get<framework::LoDTensor>();
auto &in_1 = in_vars[1]->Get<framework::LoDTensor>();
int64_t length_0 = in_0.numel();
int64_t length_1 = in_1.numel();
if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
auto in_0_e = EigenVector<T>::Flatten(in_0);
auto in_1_e = EigenVector<T>::Flatten(in_1);
result.device(place) = in_0_e + in_1_e;
} else if (length_0 && in_0.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
result.device(place) = EigenVector<T>::Flatten(in_0);
} else if (length_1 && in_1.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
result.device(place) = EigenVector<T>::Flatten(in_1);
}
return;
}
int start = in_place ? 1 : 0;
if (!in_place) {
phi::funcs::SetConstant<phi::GPUContext, T> constant_functor;
constant_functor(context.template device_context<phi::GPUContext>(),
out,
static_cast<T>(0));
}
std::vector<const T *> in_data;
std::vector<int> selectrow_index;
int64_t lod_length = 0;
bool dst_write = false;
for (int i = start; i < in_num; ++i) {
if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto &in_i = in_vars[i]->Get<framework::LoDTensor>();
lod_length = in_i.numel();
if (lod_length && in_i.IsInitialized()) {
in_data.emplace_back(in_i.data<T>());
}
} else if (in_vars[i]->IsType<phi::SelectedRows>()) {
selectrow_index.push_back(i);
}
}
// compute select rows separately.
if (!selectrow_index.empty()) {
std::vector<const T *> sr_in_out_data;
size_t rows = 0;
int64_t length = 0;
for (auto index : selectrow_index) {
auto &sr = in_vars[index]->Get<phi::SelectedRows>();
auto &sr_value = sr.value();
auto &sr_rows = sr.rows();
auto row_numel = sr_value.numel() / sr_rows.size();
auto out_dims = out->dims();
PADDLE_ENFORCE_EQ(sr.height(),
out_dims[0],
platform::errors::InvalidArgument(
"The table height of input must be same as output, "
"but received input height is %d"
", output height is %d",
sr.height(),
out_dims[0]));
PADDLE_ENFORCE_EQ(row_numel,
out->numel() / sr.height(),
platform::errors::InvalidArgument(
"The table width of input must be same as output, "
"but received input width is %d"
", output width is %d",
row_numel,
out->numel() / sr.height()));
auto *sr_data = sr_value.data<T>();
auto *sr_out_data = out->data<T>();
rows += sr_rows.size();
length = row_numel;
for (size_t i = 0; i < sr_rows.size(); ++i) {
sr_in_out_data.emplace_back(&sr_data[i * row_numel]);
sr_in_out_data.emplace_back(&sr_out_data[sr_rows[i] * row_numel]);
}
}
if (!sr_in_out_data.empty()) {
auto tmp_sr_in_out_array = memory::Alloc(
dev_ctx.GetPlace(),
sr_in_out_data.size() * sizeof(T *),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
memory::Copy(dev_ctx.GetPlace(),
tmp_sr_in_out_array->ptr(),
platform::CPUPlace(),
reinterpret_cast<void *>(sr_in_out_data.data()),
sr_in_out_data.size() * sizeof(T *),
dev_ctx.stream());
T **sr_in_out_array_data =
reinterpret_cast<T **>(tmp_sr_in_out_array->ptr());
ComputeKernelParameter(length);
SumSelectedRowsCUDAKernel<T>
<<<grids, blocks, 0, stream>>>(sr_in_out_array_data, length, rows);
dst_write = true;
}
}
  // if in_data is not empty, merge into one kernel call.
if (!in_data.empty()) {
auto tmp_in_array = memory::Alloc(
dev_ctx.GetPlace(),
in_data.size() * sizeof(T *),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
memory::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(),
platform::CPUPlace(),
reinterpret_cast<void *>(in_data.data()),
in_data.size() * sizeof(T *),
dev_ctx.stream());
T **in_array_data = reinterpret_cast<T **>(tmp_in_array->ptr());
ComputeKernelParameter(lod_length);
SumArrayCUDAKernel<T><<<grids, blocks, 0, stream>>>(in_array_data,
out->data<T>(),
lod_length,
in_data.size(),
dst_write | in_place);
}
}
template <typename T>
class SumKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto out_var = context.OutputVar("Out");
if (out_var->IsType<framework::LoDTensor>()) {
SumToLoDTensor<T>(context);
} else if (out_var->IsType<phi::SelectedRows>()) {
SelectedRowsCompute<phi::GPUContext, T>(context);
} else if (out_var->IsType<framework::LoDTensorArray>()) {
LodTensorArrayCompute<phi::GPUContext, T>(context);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Output(out) must be Tensor, SelectedRows or "
"LodTensorArray. But got "
"unsupport type: %s.",
framework::ToTypeName(out_var->Type())));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(sum,
ops::SumKernel<phi::GPUContext, float>,
ops::SumKernel<phi::GPUContext, double>,
ops::SumKernel<phi::GPUContext, int>,
ops::SumKernel<phi::GPUContext, int64_t>,
ops::SumKernel<phi::GPUContext, plat::float16>,
ops::SumKernel<phi::GPUContext, plat::bfloat16>);
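The ComputeKernelParameter lambda in the deleted file above picks a CUDA block size from the reduction length and derives the grid with CEIL_DIV. Restated as a host-only sketch (names invented, heuristic copied from the code above):

#include <cstddef>
#include <cstdio>

struct LaunchConfig { size_t grid; size_t block; };

// Same tile-size tiers as the lambda: 1024 for long reductions, 512 for
// mid-sized ones, 256 otherwise; grid = ceil(length / tile_size).
LaunchConfig ComputeLaunch(size_t length, size_t max_threads, size_t sm_count) {
  size_t tile_size = 0;
  if (length >= max_threads)
    tile_size = 1024;
  else if (length > sm_count * 128)
    tile_size = 512;
  else
    tile_size = 256;
  return {(length + tile_size - 1) / tile_size, tile_size};
}

int main() {
  LaunchConfig c = ComputeLaunch(1 << 20, 1024 * 80, 80);
  std::printf("grid=%zu block=%zu\n", c.grid, c.block);
}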
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using SelectedRows = phi::SelectedRows;
using LoDTensor = framework::LoDTensor;
template <typename T,
int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
void SelectedRowsCompute(const framework::ExecutionContext &context) {
auto in_vars = context.MultiInputVar("X");
auto out_var = context.OutputVar("Out");
bool in_place = out_var == in_vars[0];
if (in_place && in_vars.size() < 2) {
return;
}
std::vector<const phi::SelectedRows *> inputs;
SelectedRows temp_in0;
if (in_place) {
auto &in0 = in_vars[0]->Get<phi::SelectedRows>();
temp_in0.set_height(in0.height());
temp_in0.set_rows(in0.rows());
framework::TensorCopy(in0.value(),
in0.place(),
context.device_context(),
temp_in0.mutable_value());
inputs.push_back(&temp_in0);
for (size_t i = 1; i < in_vars.size(); ++i) {
auto &in = in_vars[i]->Get<phi::SelectedRows>();
if (in.rows().size() > 0) {
inputs.push_back(&in);
}
}
} else {
for (auto &in_var : in_vars) {
auto &in = in_var->Get<phi::SelectedRows>();
if (in.rows().size() > 0) {
inputs.push_back(&in_var->Get<phi::SelectedRows>());
}
}
}
auto *out = context.Output<phi::SelectedRows>("Out");
out->mutable_rows()->clear();
bool has_data = false;
for (auto &in : inputs) {
if (in->rows().size() > 0) {
has_data = true;
break;
}
}
if (has_data) {
math::scatter::MergeAdd<DeviceContext, T> merge_add;
merge_add(context.template device_context<DeviceContext>(), inputs, out);
out->SyncIndex();
} else {
    // no data, just set an empty out tensor.
out->mutable_value()->mutable_data<T>(phi::make_ddim({0}),
context.GetPlace());
}
}
template <typename DeviceContext, typename T>
void LodTensorArrayCompute(const framework::ExecutionContext &context) {
auto in_vars = context.MultiInputVar("X");
auto out_var = context.OutputVar("Out");
bool in_place = out_var == in_vars[0];
auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensorArray>(),
true,
platform::errors::InvalidArgument(
"Only support all inputs are TensorArray, "
"but inputs[%d] is not TensorArray.",
i));
auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
for (size_t i = 0; i < in_array.size(); ++i) {
if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) {
if (i >= out_array.size()) {
out_array.resize(i + 1);
}
if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) {
framework::TensorCopy(in_array[i],
in_array[i].place(),
context.device_context(),
&out_array[i]);
out_array[i].set_lod(in_array[i].lod());
} else {
PADDLE_ENFORCE_EQ(
out_array[i].lod(),
in_array[i].lod(),
platform::errors::InvalidArgument(
"The lod message between inputs[%d] and"
" outputs[%d] must be same, but now is not same.",
i,
i));
auto in = EigenVector<T>::Flatten(in_array[i]);
auto result = EigenVector<T>::Flatten(out_array[i]);
result.device(*context.template device_context<DeviceContext>()
.eigen_device()) = result + in;
}
}
}
}
}
template <typename DeviceContext, typename T>
class SumKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
VLOG(10) << "start sum kernel";
auto in_vars = context.MultiInputVar("X");
size_t in_num = in_vars.size();
auto out_var = context.OutputVar("Out");
bool in_place = out_var == in_vars[0];
if (out_var->IsType<framework::LoDTensor>()) {
auto *out = out_var->GetMutable<framework::LoDTensor>();
auto *out_ptr = out->mutable_data<T>(context.GetPlace());
if (in_num >= 1 && in_vars[0]->IsType<framework::LoDTensor>() &&
in_vars[0]->Get<framework::LoDTensor>().IsInitialized()) {
auto &in_0_tensor = in_vars[0]->Get<framework::LoDTensor>();
if (in_0_tensor.numel() > 0) {
in_place = (in_0_tensor.data<T>() == out_ptr);
}
}
auto result = EigenVector<T>::Flatten(*out);
auto &place =
*context.template device_context<DeviceContext>().eigen_device();
int start = in_place ? 1 : 0;
if (!in_place) {
if ((in_num >= 2) && in_vars[0]->IsType<framework::LoDTensor>() &&
in_vars[1]->IsType<framework::LoDTensor>() &&
in_vars[0]->Get<framework::LoDTensor>().IsInitialized() &&
in_vars[1]->Get<framework::LoDTensor>().IsInitialized()) {
auto &in_0 = in_vars[0]->Get<framework::LoDTensor>();
auto &in_1 = in_vars[1]->Get<framework::LoDTensor>();
if (in_0.numel() && in_1.numel()) {
auto in_0_e = EigenVector<T>::Flatten(in_0);
auto in_1_e = EigenVector<T>::Flatten(in_1);
result.device(place) = in_0_e + in_1_e;
start = 2;
}
}
if (start != 2) {
VLOG(10) << "Fill with constant = 0 in sum kernel.";
phi::funcs::SetConstant<DeviceContext, T> constant_functor;
constant_functor(context.template device_context<DeviceContext>(),
out,
static_cast<T>(0));
}
}
math::SelectedRowsAddToTensor<DeviceContext, T> functor;
// If in_place, just skip the first tensor
for (size_t i = start; i < in_num; i++) {
if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (!in_t.IsInitialized() || in_t.numel() == 0) {
continue;
}
auto in = EigenVector<T>::Flatten(in_t);
result.device(place) = result + in;
} else if (in_vars[i]->IsType<phi::SelectedRows>()) {
auto &in_t = in_vars[i]->Get<phi::SelectedRows>();
functor(context.template device_context<DeviceContext>(), in_t, out);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Input(X) of %d-th must be Tensor, "
"SelectedRows. But got "
"unsupport type: %s.",
framework::ToTypeName(in_vars[i]->Type())));
}
}
} else if (out_var->IsType<phi::SelectedRows>()) {
SelectedRowsCompute<DeviceContext, T>(context);
} else if (out_var->IsType<framework::LoDTensorArray>()) {
LodTensorArrayCompute<DeviceContext, T>(context);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Output(out) must be Tensor, SelectedRows, "
"LoDTensorArray. But got "
"unsupport type: %s.",
framework::ToTypeName(out_var->Type())));
}
VLOG(10) << "end sum kernel";
}
};
} // namespace operators
} // namespace paddle
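The SelectedRowsCompute path above relies on math::scatter::MergeAdd to combine sparse inputs. Roughly, merge-add gathers (row index, row data) pairs from all inputs and sums rows that share an index; a toy sketch of that idea (not Paddle's MergeAdd API):

#include <cstdint>
#include <map>
#include <vector>

using Row = std::vector<double>;

// Collapse duplicate row indices by element-wise addition.
std::map<int64_t, Row> MergeAdd(
    const std::vector<std::pair<int64_t, Row>>& rows) {
  std::map<int64_t, Row> merged;
  for (const auto& [idx, data] : rows) {
    auto [it, inserted] = merged.emplace(idx, data);
    if (!inserted) {
      for (size_t j = 0; j < data.size(); ++j) it->second[j] += data[j];
    }
  }
  return merged;
}

int main() {
  auto m = MergeAdd({{3, {1, 1}}, {7, {2, 2}}, {3, {4, 4}}});
  // row 3 becomes {5, 5}; the merged rows() are {3, 7}
  return m.size() == 2 ? 0 : 1;
}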
@@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/sum_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using SelectedRows = phi::SelectedRows;
using LoDTensor = framework::LoDTensor;
template <typename DeviceContext, typename T>
class SumMLUKernel : public framework::OpKernel<T> {
...
@@ -16,13 +16,16 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using SelectedRows = phi::SelectedRows;
using LoDTensor = framework::LoDTensor;
template <typename DeviceContext, typename T>
class SumNPUKernel : public framework::OpKernel<T> {
...
@@ -13,14 +13,16 @@ limitations under the License. */
#include <vector>
-#include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using framework::Tensor; using framework::Tensor;
using SelectedRows = phi::SelectedRows;
using LoDTensor = framework::LoDTensor;
template <typename DeviceContext, typename T>
class SumXPUKernel : public framework::OpKernel<T> {
  using XPUType = typename XPUTypeTrait<T>::Type;
...
@@ -69,10 +69,16 @@ bool ProtoArgumentMappingContext::IsDenseTensorInputs(
  return true;
}
bool ProtoArgumentMappingContext::IsSelectedRowsInputs(
const std::string& name) const {
return false;
}
bool ProtoArgumentMappingContext::IsSelectedRowsInput(
    const std::string& name) const {
  return false;
}
bool ProtoArgumentMappingContext::IsDenseTensorVectorInput(
    const std::string& name) const {
  return false;
...
@@ -45,6 +45,7 @@ class ProtoArgumentMappingContext : public ::phi::ArgumentMappingContext {
  bool IsDenseTensorInput(const std::string& name) const override;
  bool IsDenseTensorInputs(const std::string& name) const override;
  bool IsSelectedRowsInput(const std::string& name) const override;
bool IsSelectedRowsInputs(const std::string& name) const override;
  bool IsDenseTensorVectorInput(const std::string& name) const override;
  bool IsDenseTensorOutput(const std::string& name) const override;
...
@@ -34,6 +34,95 @@ namespace experimental {
////////////////// Forward api impls //////////////////////
Tensor add_n_impl(const std::vector<Tensor>& x) {
Backend kernel_backend = Backend::UNDEFINED;
DataLayout kernel_layout = DataLayout::UNDEFINED;
DataType kernel_data_type = DataType::UNDEFINED;
if (kernel_backend == Backend::UNDEFINED ||
kernel_layout == DataLayout::UNDEFINED ||
kernel_data_type == DataType::UNDEFINED) {
auto kernel_key_set = ParseKernelKeyByInputArgs(x);
auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
if (kernel_backend == Backend::UNDEFINED) {
kernel_backend = kernel_key.backend();
}
if (kernel_layout == DataLayout::UNDEFINED) {
kernel_layout = kernel_key.layout();
}
if (kernel_data_type == DataType::UNDEFINED) {
kernel_data_type = kernel_key.dtype();
}
}
bool is_sr_kernel = true;
for (auto& input : x) {
if (phi::DenseTensor::classof(input.impl().get())) {
is_sr_kernel = false;
break;
}
}
const std::string kernel_name = (is_sr_kernel ? "add_n_sr" : "add_n");
VLOG(6) << "add_n API kernel key: [" << kernel_backend << ", "
<< kernel_layout << ", " << kernel_data_type << "]";
auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
kernel_name, {kernel_backend, kernel_layout, kernel_data_type});
const auto& kernel = kernel_result.kernel;
VLOG(6) << kernel_name << " kernel: " << kernel;
auto* dev_ctx = GetDeviceContextByBackend(
kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);
Tensor api_output;
if (is_sr_kernel) {
std::vector<const phi::SelectedRows*> input_x(x.size());
for (size_t i = 0; i < input_x.size(); ++i) {
input_x[i] = static_cast<phi::SelectedRows*>(x[i].impl().get());
}
auto x_meta_vec = MakeMetaTensor(input_x);
std::vector<const phi::MetaTensor*> x_metas(x_meta_vec.size());
for (size_t i = 0; i < x_meta_vec.size(); ++i) {
x_metas[i] = &x_meta_vec[i];
}
auto kernel_out = SetSelectedRowsKernelOutput(&api_output);
phi::MetaTensor meta_out(kernel_out);
phi::AddNInferMeta(x_metas, &meta_out);
using kernel_signature =
void (*)(const platform::DeviceContext&,
const std::vector<const phi::SelectedRows*>&,
phi::SelectedRows*);
auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(*dev_ctx, input_x, kernel_out);
} else {
std::vector<const phi::TensorBase*> input_x(x.size());
for (size_t i = 0; i < input_x.size(); ++i) {
input_x[i] = x[i].impl().get();
}
auto x_meta_vec = MakeMetaTensor(input_x);
std::vector<const phi::MetaTensor*> x_metas(x_meta_vec.size());
for (size_t i = 0; i < x_meta_vec.size(); ++i) {
x_metas[i] = &x_meta_vec[i];
}
auto kernel_out = SetKernelOutput(&api_output);
phi::MetaTensor meta_out(kernel_out);
phi::AddNInferMeta(x_metas, &meta_out);
using kernel_signature =
void (*)(const platform::DeviceContext&,
const std::vector<const phi::TensorBase*>&,
phi::DenseTensor*);
auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(*dev_ctx, input_x, kernel_out);
}
return api_output;
}
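add_n_impl above selects the selected-rows kernel only when no input is a DenseTensor; any dense input forces the dense add_n kernel, whose SelectedRows inputs are then handled inside the kernel loop. The selection rule, isolated as a sketch with stand-in types:

#include <cassert>
#include <string>
#include <vector>

enum class Kind { kDense, kSelectedRows };

// Fall back to the dense kernel as soon as any input is a DenseTensor.
std::string SelectAddNKernel(const std::vector<Kind>& inputs) {
  bool is_sr_kernel = true;
  for (Kind k : inputs) {
    if (k == Kind::kDense) {
      is_sr_kernel = false;
      break;
    }
  }
  return is_sr_kernel ? "add_n_sr" : "add_n";
}

int main() {
  assert(SelectAddNKernel({Kind::kSelectedRows, Kind::kDense}) == "add_n");
  assert(SelectAddNKernel({Kind::kSelectedRows, Kind::kSelectedRows}) ==
         "add_n_sr");
}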
Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) {
  Tensor out;
  copy(x, place, blocking, &out);
...
@@ -31,6 +31,8 @@ namespace experimental {
////////////////// Forward api impls //////////////////////
Tensor add_n_impl(const std::vector<Tensor>& x);
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> batch_norm_impl(
    const Tensor& x,
    const Tensor& scale,
...
@@ -98,6 +98,16 @@ phi::MetaTensor MakeMetaTensor(const phi::TensorBase& tensor) {
  return phi::MetaTensor(tensor);
}
std::vector<phi::MetaTensor> MakeMetaTensor(
const std::vector<const phi::TensorBase*>& tensors) {
std::vector<phi::MetaTensor> meta_tensors;
meta_tensors.reserve(tensors.size());
for (const auto* t : tensors) {
meta_tensors.emplace_back(*t);
}
return meta_tensors;
}
phi::MetaTensor MakeMetaTensor(
    const paddle::optional<phi::DenseTensor>& tensor) {
  if (tensor) {
@@ -116,6 +126,16 @@ std::vector<phi::MetaTensor> MakeMetaTensor(
  return meta_tensors;
}
std::vector<phi::MetaTensor> MakeMetaTensor(
const std::vector<const phi::SelectedRows*>& tensors) {
std::vector<phi::MetaTensor> meta_tensors;
meta_tensors.reserve(tensors.size());
for (const auto* t : tensors) {
meta_tensors.emplace_back(*t);
}
return meta_tensors;
}
std::vector<phi::MetaTensor> MakeMetaTensor(
    const std::vector<phi::DenseTensor*>& tensors) {
  std::vector<phi::MetaTensor> meta_tensors;
...
@@ -67,6 +67,9 @@ std::vector<phi::MetaTensor> MakeMetaTensor(
std::vector<phi::MetaTensor> MakeMetaTensor(
    const std::vector<phi::DenseTensor*>& tensors);
std::vector<phi::MetaTensor> MakeMetaTensor(
const std::vector<const phi::SelectedRows*>& tensors);
phi::MetaTensor MakeMetaTensor(
    const paddle::optional<phi::SelectedRows>& tensor);
@@ -79,6 +82,9 @@ phi::MetaTensor MakeMetaTensor(
std::vector<phi::MetaTensor> MakeMetaTensor(
    const paddle::optional<std::vector<const phi::DenseTensor*>>& tensors);
std::vector<phi::MetaTensor> MakeMetaTensor(
const std::vector<const phi::TensorBase*>& tensors);
/* ------------------ for output ----------------------- */
phi::DenseTensor* SetKernelOutput(Tensor* out);
...
@@ -102,10 +102,7 @@
- op : add_n
  args : (Tensor[] x)
  output : Tensor
-  infer_meta :
-    func : AddNInferMeta
-  kernel :
-    func : add_n
+  invoke : add_n_impl(x)
  backward : add_n_grad
- op : addmm
...
@@ -108,8 +108,12 @@ class ArgumentMappingContext {
  virtual bool IsDenseTensorInput(const std::string& name) const = 0;
  virtual bool IsDenseTensorInputs(const std::string& name) const = 0;
  virtual bool IsSelectedRowsInput(const std::string& name) const = 0;
  virtual bool IsSparseCooTensorInput(const std::string& name) const = 0;
  virtual bool IsSparseCsrTensorInput(const std::string& name) const = 0;
virtual bool IsSelectedRowsInputs(const std::string& name) const = 0;
  // For compatibility with LoDTensorArray
  virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0;
...
@@ -100,6 +100,24 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
                            default_tensor_layout,
                            default_key.dtype(),
                            arg_type);
} else if (arg_type == std::type_index(typeid(
const std::vector<const SelectedRows*>&))) {
args_def->AppendInput(default_key.backend(),
default_tensor_layout,
default_key.dtype(),
arg_type);
} else if (arg_type == std::type_index(typeid(
const std::vector<const TensorBase*>&))) {
args_def->AppendInput(default_key.backend(),
default_tensor_layout,
default_key.dtype(),
arg_type);
} else if (arg_type == std::type_index(typeid(
const std::vector<const TensorArray*>&))) {
args_def->AppendInput(default_key.backend(),
default_tensor_layout,
default_key.dtype(),
arg_type);
    } else if (arg_type == std::type_index(typeid(const SelectedRows&))) {
      args_def->AppendInput(default_key.backend(),
                            default_tensor_layout,
...
@@ -270,6 +270,8 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
  PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor);
  PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows);
  PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor);
PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(TensorBase);
PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SelectedRows);
  PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows);
  PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_MULTI_INPUT(DenseTensor);
...
@@ -39,7 +39,11 @@ int64_t MetaTensor::numel() const {
DDim MetaTensor::dims() const {
  ValidCheck(*this);
-  return tensor_->dims();
+  if (phi::SelectedRows::classof(tensor_)) {
+    return static_cast<SelectedRows*>(tensor_)->GetCompleteDims();
+  } else {
+    return tensor_->dims();
+  }
}
DataType MetaTensor::dtype() const {
@@ -61,9 +65,7 @@ void MetaTensor::set_dims(const DDim& dims) {
    StringTensorUtils::GetMutableMeta(static_cast<StringTensor*>(tensor_))
        ->dims = dims;
  } else if (phi::SelectedRows::classof(tensor_)) {
-    DenseTensorUtils::GetMutableMeta(
-        static_cast<SelectedRows*>(tensor_)->mutable_value())
-        ->dims = dims;
+    static_cast<SelectedRows*>(tensor_)->set_height(dims[0]);
  } else if (phi::SparseCooTensor::classof(tensor_)) {
    DenseTensorUtils::GetMutableMeta(static_cast<SparseCooTensor*>(tensor_))
        ->dims = dims;
@@ -164,7 +166,13 @@ void MetaTensor::share_meta(const MetaTensor& meta_tensor) {
  }
}
TensorBase* MetaTensor::tensor() const { return tensor_; }
bool MetaTensor::is_dense() const { return DenseTensor::classof(tensor_); }
bool MetaTensor::is_selected_rows() const {
return SelectedRows::classof(tensor_);
}
bool MetaTensor::is_tensor_array() const { return false; }
void MetaTensor::share_dims(const MetaTensor& meta_tensor) {
@@ -174,7 +182,6 @@ void MetaTensor::share_dims(const MetaTensor& meta_tensor) {
  bool is_sparse_coo = phi::SparseCooTensor::classof(tensor_);
  bool is_sparse_csr = phi::SparseCsrTensor::classof(tensor_);
  if (is_dense_tensor || is_selected_rows || is_sparse_coo || is_sparse_csr) {
-    set_dims(meta_tensor.dims());
    if (is_selected_rows) {
      const auto in_tensor_base = meta_tensor.tensor();
      PADDLE_ENFORCE_EQ(
@@ -186,6 +193,11 @@ void MetaTensor::share_dims(const MetaTensor& meta_tensor) {
      auto* selected_rows_in = static_cast<SelectedRows*>(in_tensor_base);
      selected_rows_out->set_rows(selected_rows_in->rows());
      selected_rows_out->set_height(selected_rows_in->height());
DenseTensorUtils::GetMutableMeta(
static_cast<SelectedRows*>(tensor_)->mutable_value())
->dims = selected_rows_in->mutable_value()->dims();
} else {
set_dims(meta_tensor.dims());
    }
  } else {
    PADDLE_THROW(phi::errors::Unimplemented(
@@ -212,6 +224,4 @@ const LoD& MetaTensor::lod() const {
  }
}
TensorBase* MetaTensor::tensor() const { return tensor_; }
} // namespace phi
@@ -68,9 +68,9 @@ class MetaTensor {
  virtual bool initialized() const;
virtual bool is_selected_rows() const;
  virtual bool is_dense() const;
-  // TODO(YuanRisheng) This API is for compatible with
-  // Fluid
+  // TODO(YuanRisheng) This API is for compatible with Fluid
  // and it will be deleted in the future.
  virtual bool is_tensor_array() const;
...
@@ -132,10 +132,7 @@ class SelectedRows : public TensorBase,
  /// \brief Returns the dims of the tensor.
  /// \return The dims of the tensor.
-  const DDim& dims() const noexcept override {
-    return impl_->dims();
-    // return phi::make_ddim(dims);
-  }
+  const DDim& dims() const noexcept override { return impl_->dims(); }
  /// \brief Returns the data type of the tensor.
  /// \return The data type of the tensor.
...
@@ -301,6 +301,10 @@ void AddNInferMeta(const std::vector<const MetaTensor*>& x,
  phi::DDim in_dim({0});
  for (size_t i = 0; i < x.size(); ++i) {
    auto x_dim = x[i]->dims();
// x_dim.size() == 1 means the real dim of selected rows is [0]
if (x[i]->is_selected_rows() && x_dim.size() == 1) {
continue;
}
    if (phi::product(x_dim) == 0) {
      continue;
    }
@@ -355,6 +359,31 @@ void AddNInferMeta(const std::vector<const MetaTensor*>& x,
  out->share_lod(*x[0]);
}
// TODO(YuanRisheng) This InferMeta is used in Fluid
// and will be deleted in the future.
void AddNTensorArrayInferMeta(const std::vector<const MetaTensor*>& x,
MetaTensor* out,
MetaConfig config) {
int64_t max_length = 0;
bool has_tensor_array = false;
for (auto input : x) {
if (input->is_tensor_array()) {
has_tensor_array = true;
// if input is lod_tensor_array, dims() will return its size (one element)
max_length =
input->dims()[0] > max_length ? input->dims()[0] : max_length;
}
}
if (has_tensor_array) {
if (out->is_tensor_array()) {
out->set_dims(make_ddim({max_length}));
}
} else {
AddNInferMeta(x, out, config);
}
}
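For the TensorArray branch above, the output's dims()[0] is the maximum length among the array inputs; only when no input is an array does it fall through to AddNInferMeta. The same logic as a standalone sketch (MetaInput is an invented stand-in):

#include <algorithm>
#include <cstdint>
#include <vector>

struct MetaInput {
  bool is_tensor_array;
  int64_t length;  // dims()[0]: for an array input this is its size
};

int64_t InferAddNArrayLength(const std::vector<MetaInput>& x) {
  int64_t max_length = 0;
  for (const MetaInput& in : x) {
    if (in.is_tensor_array) max_length = std::max(max_length, in.length);
  }
  return max_length;  // only meaningful if some input was an array
}

int main() {
  // Arrays of lengths 3 and 5 plus a dense input: output array length 5.
  std::vector<MetaInput> x{{true, 3}, {true, 5}, {false, 128}};
  return InferAddNArrayLength(x) == 5 ? 0 : 1;
}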
void AucInferMeta(const MetaTensor& input,
                  const MetaTensor& label,
                  const MetaTensor& stat_pos,
...
@@ -123,6 +123,10 @@ void AddNInferMeta(const std::vector<const MetaTensor*>& x,
                   MetaTensor* out,
                   MetaConfig config = MetaConfig());
void AddNTensorArrayInferMeta(const std::vector<const MetaTensor*>& x,
MetaTensor* out,
MetaConfig config);
void AucInferMeta(const MetaTensor& input,
                  const MetaTensor& label,
                  const MetaTensor& stat_pos,
...
@@ -15,12 +15,20 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/tensor_array.h"
namespace phi {
// Note(YuanRisheng): std::vector<const TensorBase*> shouldn't be widely used in
// PHI. Here, we use it to be compatible with Fluid.
template <typename T, typename Context>
void AddNKernel(const Context& dev_ctx,
-                const std::vector<const DenseTensor*>& x,
+                const std::vector<const TensorBase*>& x,
                DenseTensor* out);
template <typename T, typename Context>
void AddNArrayKernel(const Context& dev_ctx,
const std::vector<const TensorArray*>& x,
TensorArray* out);
} // namespace phi
@@ -12,24 +12,21 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/add_n_kernel.h" #include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void AddNKernel(const Context& dev_ctx,
-                const std::vector<const DenseTensor*>& x,
+                const std::vector<const TensorBase*>& x,
                DenseTensor* out) {
  size_t in_num = x.size();
-  bool in_place = out == x[0];
-  auto* out_ptr = dev_ctx.template Alloc<T>(out);
-  if (in_num >= 1 && x[0]->initialized()) {
-    if (x[0]->numel() > 0) {
-      in_place = (x[0]->data<T>() == out_ptr);
+  dev_ctx.template Alloc<T>(out);
+  bool in_place = false;
+  if (x.size() > 0 && x[0]->initialized() && DenseTensor::classof(x[0])) {
+    if ((static_cast<const DenseTensor*>(x[0]))->Holder() == out->Holder()) {
+      in_place = true;
    }
  }
@@ -37,9 +34,11 @@ void AddNKernel(const Context& dev_ctx,
  auto& place = *dev_ctx.eigen_device();
  int start = in_place ? 1 : 0;
  if (!in_place) {
-    if ((in_num >= 2) && x[0]->initialized() && x[1]->initialized()) {
-      auto& in_0 = *x[0];
-      auto& in_1 = *x[1];
+    if ((in_num >= 2) && DenseTensor::classof(x[0]) &&
+        DenseTensor::classof(x[1]) && x[0]->initialized() &&
+        x[1]->initialized()) {
+      auto& in_0 = *(static_cast<const DenseTensor*>(x[0]));
+      auto& in_1 = *(static_cast<const DenseTensor*>(x[1]));
      if (in_0.numel() && in_1.numel()) {
        auto in_0_e = EigenVector<T>::Flatten(in_0);
        auto in_1_e = EigenVector<T>::Flatten(in_1);
@@ -49,20 +48,33 @@
    }
    if (start != 2) {
      VLOG(10) << "Fill with constant = 0 in sum kernel.";
-      funcs::SetConstant<Context, T> constant_functor;
+      phi::funcs::SetConstant<Context, T> constant_functor;
      constant_functor(dev_ctx, out, static_cast<T>(0));
    }
  }
+  paddle::operators::math::SelectedRowsAddToTensor<Context, T> functor;
  // If in_place, just skip the first tensor
  for (size_t i = start; i < in_num; i++) {
-    auto& in_t = *x[i];
-    if (!in_t.initialized() || in_t.numel() == 0) {
-      continue;
+    if (DenseTensor::classof(x[i])) {
+      auto& in_t = *(static_cast<const DenseTensor*>(x[i]));
+      if (!in_t.initialized() || in_t.numel() == 0) {
+        continue;
+      }
+      auto in = EigenVector<T>::Flatten(in_t);
+      result.device(place) = result + in;
+    } else if (SelectedRows::classof(x[i])) {
+      auto& in_t = *(static_cast<const SelectedRows*>(x[i]));
+      functor(dev_ctx, in_t, out);
+    } else {
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "Expected type of Input(X) of %d-th must be Tensor, "
+          "SelectedRows. But got "
+          "unsupport type: %s.",
+          x[i]->type_info().name()));
    }
-    auto in = EigenVector<T>::Flatten(in_t);
-    result.device(place) = result + in;
  }
+  VLOG(10) << "end add_n kernel";
}
} // namespace phi
@@ -76,3 +88,13 @@ PD_REGISTER_KERNEL(add_n,
                   int,
                   phi::dtype::bfloat16,
                   int64_t) {}
PD_REGISTER_KERNEL(add_n_array,
CPU,
ALL_LAYOUT,
phi::AddNArrayKernel,
float,
double,
int,
phi::dtype::bfloat16,
int64_t) {}
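Conceptually, each PD_REGISTER_KERNEL invocation above contributes one factory entry per listed dtype. A toy sketch of such a (name, dtype)-keyed registry; this is not phi's actual KernelFactory API:

#include <functional>
#include <iostream>
#include <map>
#include <string>

using KernelFn = std::function<void()>;
using KernelKey = std::pair<std::string, std::string>;  // {name, dtype}

std::map<KernelKey, KernelFn>& Registry() {
  static std::map<KernelKey, KernelFn> registry;
  return registry;
}

int main() {
  // Registering "add_n_array" for two dtypes, as the macro above does for
  // float/double/int/bfloat16/int64_t.
  Registry()[{"add_n_array", "float"}] = [] { std::cout << "float kernel\n"; };
  Registry()[{"add_n_array", "int64_t"}] = [] { std::cout << "int64 kernel\n"; };
  Registry()[{"add_n_array", "float"}]();  // lookup + dispatch
}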
@@ -14,16 +14,27 @@
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
template <class T>
__global__ void Sum2CUDAKernel(const T *in_0,
const T *in_1,
T *out,
int64_t N) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
out[id] = in_0[id] + in_1[id];
id += blockDim.x * gridDim.x;
}
}
template <class T>
__global__ void SumArrayCUDAKernel(
    T **in, T *out, int64_t N, size_t in_size, bool read_dst) {
@@ -41,9 +52,26 @@ __global__ void SumArrayCUDAKernel(
  }
}
template <class T>
__global__ void SumSelectedRowsCUDAKernel(T **sr_in_out,
int64_t N,
size_t rows) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
for (int i = 0; i < 2 * rows; i += 2) {
const T *tmp = sr_in_out[i];
T *tmp_out = sr_in_out[i + 1];
if (tmp && tmp_out) {
tmp_out[id] += tmp[id];
}
}
id += blockDim.x * gridDim.x;
}
}
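SumSelectedRowsCUDAKernel above walks an interleaved pointer array: even slots are source row pointers, odd slots the matching destination rows in the dense output (the packing happens on the host further down in this file). A host-side sketch of that packing, with illustrative names:

#include <cstdint>
#include <vector>

std::vector<const double*> PackRowPairs(const double* sr_data,
                                        const double* out_data,
                                        const std::vector<int64_t>& rows,
                                        int64_t row_numel) {
  std::vector<const double*> sr_in_out;
  for (size_t i = 0; i < rows.size(); ++i) {
    sr_in_out.push_back(&sr_data[i * row_numel]);         // source row i
    sr_in_out.push_back(&out_data[rows[i] * row_numel]);  // its target row
  }
  return sr_in_out;  // copied to device, then walked two entries at a time
}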
template <typename T, typename Context>
void AddNKernel(const Context &dev_ctx,
-                const std::vector<const DenseTensor *> &x,
+                const std::vector<const TensorBase *> &x,
                DenseTensor *out) {
  const size_t in_num = x.size();
@@ -66,36 +94,38 @@ void AddNKernel(const Context &dev_ctx,
    grids = dim3(CEIL_DIV(length, tile_size), 1, 1);
    blocks = dim3(tile_size, 1, 1);
  };
auto *out_ptr = dev_ctx.template Alloc<T>(out);
bool in_place = false;
if (x.size() > 0 && x[0]->initialized() && DenseTensor::classof(x[0])) {
if ((static_cast<const DenseTensor *>(x[0]))->data() == out->data()) {
in_place = true;
}
}
bool in_place = x[0] == out; if (!in_place && in_num >= 1 && DenseTensor::classof(x[0])) {
auto &in_0_tensor = *(static_cast<const DenseTensor *>(x[0]));
if (!in_place) { if (in_0_tensor.numel() > 0) {
auto *out_ptr = dev_ctx.template Alloc<T>(out); in_place = (in_0_tensor.data<T>() == out_ptr);
if (in_num >= 1) {
auto &in_0_tensor = *x[0];
if (in_0_tensor.numel() > 0) {
in_place = (in_0_tensor.data<T>() == out_ptr);
}
} }
} }
// Sum of two tensors // Sum of two tensors
if (in_num == 2) { if (in_num == 2 && DenseTensor::classof(x[0]) && DenseTensor::classof(x[1])) {
auto &in_0 = *x[0]; auto &in_0 = *(static_cast<const DenseTensor *>(x[0]));
auto &in_1 = *x[1]; auto &in_1 = *(static_cast<const DenseTensor *>(x[1]));
int64_t length_0 = in_0.numel(); int64_t length_0 = in_0.numel();
int64_t length_1 = in_1.numel(); int64_t length_1 = in_1.numel();
if (length_0 && length_1 && in_0.initialized() && in_1.initialized()) { if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out); auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device(); auto &place = *dev_ctx.eigen_device();
auto in_0_e = EigenVector<T>::Flatten(in_0); auto in_0_e = EigenVector<T>::Flatten(in_0);
auto in_1_e = EigenVector<T>::Flatten(in_1); auto in_1_e = EigenVector<T>::Flatten(in_1);
result.device(place) = in_0_e + in_1_e; result.device(place) = in_0_e + in_1_e;
} else if (length_0 && in_0.initialized()) { } else if (length_0 && in_0.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out); auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device(); auto &place = *dev_ctx.eigen_device();
result.device(place) = EigenVector<T>::Flatten(in_0); result.device(place) = EigenVector<T>::Flatten(in_0);
} else if (length_1 && in_1.initialized()) { } else if (length_1 && in_1.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out); auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device(); auto &place = *dev_ctx.eigen_device();
result.device(place) = EigenVector<T>::Flatten(in_1); result.device(place) = EigenVector<T>::Flatten(in_1);
...@@ -105,27 +135,90 @@ void AddNKernel(const Context &dev_ctx, ...@@ -105,27 +135,90 @@ void AddNKernel(const Context &dev_ctx,
int start = in_place ? 1 : 0; int start = in_place ? 1 : 0;
if (!in_place) { if (!in_place) {
funcs::SetConstant<Context, T> constant_functor; phi::funcs::SetConstant<phi::GPUContext, T> constant_functor;
constant_functor(dev_ctx, out, static_cast<T>(0)); constant_functor(dev_ctx, out, static_cast<T>(0));
} }
std::vector<const T *> in_data; std::vector<const T *> in_data;
std::vector<int> selectrow_index;
int64_t lod_length = 0; int64_t lod_length = 0;
bool dst_write = false; bool dst_write = false;
for (int i = start; i < in_num; ++i) { for (int i = start; i < in_num; ++i) {
auto &in_i = *x[i]; if (DenseTensor::classof(x[i])) {
lod_length = in_i.numel(); auto &in_i = *(static_cast<const DenseTensor *>(x[i]));
if (lod_length && in_i.initialized()) { lod_length = in_i.numel();
in_data.emplace_back(in_i.data<T>()); if (lod_length && in_i.IsInitialized()) {
in_data.emplace_back(in_i.data<T>());
}
} else if (SelectedRows::classof(x[i])) {
selectrow_index.push_back(i);
} }
} }
// compute select rows separately.
if (!selectrow_index.empty()) {
std::vector<const T *> sr_in_out_data;
size_t rows = 0;
int64_t length = 0;
for (auto index : selectrow_index) {
auto &sr = *(static_cast<const SelectedRows *>(x[index]));
auto &sr_value = sr.value();
auto &sr_rows = sr.rows();
auto row_numel = sr_value.numel() / sr_rows.size();
auto out_dims = out->dims();
PADDLE_ENFORCE_EQ(sr.height(),
out_dims[0],
errors::InvalidArgument(
"The table height of input must be same as output, "
"but received input height is %d"
", output height is %d",
sr.height(),
out_dims[0]));
PADDLE_ENFORCE_EQ(row_numel,
out->numel() / sr.height(),
errors::InvalidArgument(
"The table width of input must be same as output, "
"but received input width is %d"
", output width is %d",
row_numel,
out->numel() / sr.height()));
auto *sr_data = sr_value.data<T>();
auto *sr_out_data = out->data<T>();
rows += sr_rows.size();
length = row_numel;
for (size_t i = 0; i < sr_rows.size(); ++i) {
sr_in_out_data.emplace_back(&sr_data[i * row_numel]);
sr_in_out_data.emplace_back(&sr_out_data[sr_rows[i] * row_numel]);
}
}
if (!sr_in_out_data.empty()) {
auto tmp_sr_in_out_array = paddle::memory::Alloc(
dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_sr_in_out_array->ptr(),
phi::CPUPlace(),
reinterpret_cast<void *>(sr_in_out_data.data()),
sr_in_out_data.size() * sizeof(T *),
dev_ctx.stream());
T **sr_in_out_array_data =
reinterpret_cast<T **>(tmp_sr_in_out_array->ptr());
ComputeKernelParameter(length);
SumSelectedRowsCUDAKernel<T>
<<<grids, blocks, 0, stream>>>(sr_in_out_array_data, length, rows);
dst_write = true;
}
}
// if indata not null, merge into one kernel call. // if indata not null, merge into one kernel call.
if (!in_data.empty()) { if (!in_data.empty()) {
auto tmp_in_array = paddle::memory::Alloc( auto tmp_in_array =
dev_ctx.GetPlace(), paddle::memory::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *));
in_data.size() * sizeof(T *),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(), paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(), tmp_in_array->ptr(),
...@@ -153,6 +246,17 @@ PD_REGISTER_KERNEL(add_n, ...@@ -153,6 +246,17 @@ PD_REGISTER_KERNEL(add_n,
float, float,
double, double,
int, int,
int64_t,
phi::dtype::bfloat16, phi::dtype::bfloat16,
phi::dtype::float16) {} phi::dtype::float16,
int64_t) {}
PD_REGISTER_KERNEL(add_n_array,
GPU,
ALL_LAYOUT,
phi::AddNArrayKernel,
float,
double,
int,
phi::dtype::bfloat16,
phi::dtype::float16,
int64_t) {}
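The device-side `SumSelectedRowsCUDAKernel` above consumes an interleaved pointer array: even slots hold source row pointers, odd slots the matching destination rows. Here is a host-only sketch of how that layout is built and consumed, in plain C++ instead of a device launch (the names and toy data are illustrative, not from the real code):

// Sketch of the interleaved in/out pointer layout used by the kernel.
// Even slots: input row pointers; odd slots: output row pointers.
#include <cstdio>
#include <vector>

int main() {
  const int row_numel = 4;                      // elements per row
  std::vector<float> src(2 * row_numel, 1.0f);  // two source rows of ones
  std::vector<float> dst(3 * row_numel, 0.0f);  // dense output with 3 rows
  std::vector<int> rows = {0, 2};               // target row indices

  std::vector<float *> in_out;                  // interleaved pointer pairs
  for (size_t i = 0; i < rows.size(); ++i) {
    in_out.push_back(&src[i * row_numel]);         // even: input row
    in_out.push_back(&dst[rows[i] * row_numel]);   // odd: output row
  }

  // Equivalent of one grid-stride "thread" sweeping all columns on the host.
  for (int id = 0; id < row_numel; ++id) {
    for (size_t i = 0; i < in_out.size(); i += 2) {
      in_out[i + 1][id] += in_out[i][id];
    }
  }
  printf("dst row 2, col 0 = %.1f\n", dst[2 * row_numel]);  // prints 1.0
}

The real kernel copies this pointer array to device memory first; the pairing scheme is what lets a single launch scatter-add every SelectedRows input at once.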
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace phi {
template <typename T, typename Context>
void AddNArrayKernel(const Context& dev_ctx,
const std::vector<const TensorArray*>& x,
TensorArray* out) {
for (auto& ele : *out) {
dev_ctx.template Alloc<T>(&ele);
}
bool in_place = true;
if (x.size() > 0 && x[0]->size() == out->size()) {
for (size_t i = 0; i < out->size(); i++) {
if (x[0]->at(i).IsInitialized() &&
out->at(i).data() != x[0]->at(i).data()) {
in_place = false;
break;
}
}
} else {
in_place = false;
}
for (size_t i = in_place ? 1 : 0; i < x.size(); ++i) {
auto* in_array = x.at(i);
for (size_t j = 0; j < in_array->size(); ++j) {
if (in_array->at(j).IsInitialized() && (in_array->at(j).numel() != 0)) {
if (j >= out->size()) {
out->resize(j + 1);
}
if (!out->at(j).IsInitialized() || (out->at(j).numel() == 0)) {
Copy<Context>(dev_ctx,
in_array->at(j),
in_array->at(j).place(),
false,
&out->at(j));
out->at(j).set_lod(in_array->at(j).lod());
} else {
PADDLE_ENFORCE_EQ(
out->at(j).lod(),
in_array->at(j).lod(),
phi::errors::InvalidArgument(
"The lod message between inputs[%d] and"
" outputs[%d] must be same, but now is not same.",
j,
j));
auto in = EigenVector<T>::Flatten(in_array->at(j));
auto result = EigenVector<T>::Flatten(out->at(j));
result.device(*dev_ctx.eigen_device()) = result + in;
}
}
}
}
}
} // namespace phi
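`AddNArrayKernel` applies a copy-or-accumulate rule per array slot: the first initialized input for a slot is copied into the output, later ones are added elementwise. A compact sketch of that rule, with `std::vector<float>` standing in for a tensor and an empty vector modelling an uninitialized slot (the LoD check and in-place skip are omitted here):

// Copy-or-accumulate per slot, mirroring the loop in AddNArrayKernel.
#include <cassert>
#include <vector>

using Tensor = std::vector<float>;
using TensorArray = std::vector<Tensor>;

void AddNArray(const std::vector<const TensorArray*>& x, TensorArray* out) {
  for (const TensorArray* arr : x) {
    for (size_t j = 0; j < arr->size(); ++j) {
      const Tensor& in = (*arr)[j];
      if (in.empty()) continue;                  // skip uninitialized inputs
      if (j >= out->size()) out->resize(j + 1);  // grow the output array
      Tensor& dst = (*out)[j];
      if (dst.empty()) {
        dst = in;                                // first writer copies
      } else {
        assert(dst.size() == in.size());         // shapes must match
        for (size_t k = 0; k < in.size(); ++k) dst[k] += in[k];
      }
    }
  }
}

int main() {
  TensorArray a = {{1, 2}, {}};
  TensorArray b = {{3, 4}, {5}};
  TensorArray out;
  AddNArray({&a, &b}, &out);  // out[0] == {4, 6}, out[1] == {5}
}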
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/selected_rows.h"
namespace phi {
namespace sr {
template <typename T, typename Context>
void AddNKernel(const Context& dev_ctx,
const std::vector<const SelectedRows*>& x,
SelectedRows* out);
} // namespace sr
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h"
PD_REGISTER_KERNEL(add_n_sr,
CPU,
ALL_LAYOUT,
phi::sr::AddNKernel,
float,
double,
int,
phi::dtype::bfloat16,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h"
PD_REGISTER_KERNEL(add_n_sr,
GPU,
ALL_LAYOUT,
phi::sr::AddNKernel,
float,
double,
int,
phi::dtype::bfloat16,
phi::dtype::float16,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/selected_rows/add_n_kernel.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
namespace sr {
template <typename T, typename Context>
void AddNKernel(const Context &dev_ctx,
const std::vector<const SelectedRows *> &x,
SelectedRows *out) {
dev_ctx.template Alloc<T>(out->mutable_value());
bool in_place = false;
if (x.size() > 0 && x[0]->value().Holder() == out->value().Holder()) {
in_place = true;
}
if (in_place && x.size() < 2) {
return;
}
std::vector<const phi::SelectedRows *> inputs;
SelectedRows temp_in0;
if (in_place) {
auto &in0 = *x[0];
temp_in0.set_height(in0.height());
temp_in0.set_rows(in0.rows());
Copy<Context>(
dev_ctx, in0.value(), in0.place(), false, temp_in0.mutable_value());
inputs.push_back(&temp_in0);
for (size_t i = 1; i < x.size(); ++i) {
auto &in = *x[i];
if (in.rows().size() > 0) {
inputs.push_back(&in);
}
}
} else {
for (auto in_var : x) {
auto &in = *in_var;
if (in.rows().size() > 0) {
inputs.push_back(in_var);
}
}
}
out->mutable_rows()->clear();
bool has_data = false;
for (auto &in : inputs) {
if (in->rows().size() > 0) {
has_data = true;
break;
}
}
if (has_data) {
paddle::operators::math::scatter::MergeAdd<Context, T> merge_add;
merge_add(dev_ctx, inputs, out);
out->SyncIndex();
} else {
// no data, just set a empty out tensor.
auto *out_dense = out->mutable_value();
out_dense->clear();
out_dense->Resize(phi::make_ddim({0}));
dev_ctx.template Alloc<T>(out_dense);
}
}
} // namespace sr
} // namespace phi
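The heavy lifting in the SelectedRows path above is `MergeAdd`, which sums rows that share an index across all inputs into a single output row. The toy illustration below conveys that semantics only; `ToySelectedRows` is a deliberately simplified stand-in for `phi::SelectedRows`, not the real functor:

// Conceptual MergeAdd over SelectedRows: rows sharing an index are summed.
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

struct ToySelectedRows {
  std::vector<int64_t> rows;               // one index per stored row
  std::vector<std::vector<float>> values;  // row data, same order as rows
};

ToySelectedRows MergeAdd(const std::vector<const ToySelectedRows*>& ins) {
  std::map<int64_t, std::vector<float>> acc;  // index -> summed row
  for (const ToySelectedRows* in : ins) {
    for (size_t i = 0; i < in->rows.size(); ++i) {
      auto& row = acc[in->rows[i]];
      if (row.empty()) row.assign(in->values[i].size(), 0.0f);
      for (size_t k = 0; k < row.size(); ++k) row[k] += in->values[i][k];
    }
  }
  ToySelectedRows out;
  for (auto& kv : acc) {
    out.rows.push_back(kv.first);
    out.values.push_back(kv.second);
  }
  return out;
}

int main() {
  ToySelectedRows a{{0, 2}, {{1, 1}, {2, 2}}};
  ToySelectedRows b{{2}, {{3, 3}}};
  ToySelectedRows out = MergeAdd({&a, &b});
  printf("row 2 -> %.0f %.0f\n", out.values[1][0], out.values[1][1]);  // 5 5
}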
@@ -27,6 +27,15 @@ KernelSignature MemcpyD2HOpArgumentMapping(const ArgumentMappingContext& ctx) {
   return KernelSignature("memcpy_d2h", {"X"}, {"dst_place_type"}, {"Out"});
 }

+KernelSignature MemcpyOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    return KernelSignature("memcpy", {"X"}, {"dst_place_type"}, {"Out"});
+  }
+  return KernelSignature("unregistered", {}, {}, {});
+}
+
 }  // namespace phi

 PD_REGISTER_ARG_MAPPING_FN(memcpy_d2h, phi::MemcpyD2HOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(memcpy, phi::MemcpyOpArgumentMapping);
@@ -18,10 +18,13 @@
 namespace phi {

 KernelSignature SumOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  if (ctx.IsDenseTensorInputs("X")) {
+  if (ctx.IsSelectedRowsInputs("X")) {
+    return KernelSignature("add_n_sr", {"X"}, {}, {"Out"});
+  } else if (ctx.IsDenseTensorVectorInput("X")) {
+    return KernelSignature("add_n_array", {"X"}, {}, {"Out"});
+  } else {
     return KernelSignature("add_n", {"X"}, {}, {"Out"});
   }
-  return KernelSignature("unregistered", {}, {}, {});
 }

 }  // namespace phi
......
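With this mapping, the legacy `sum` op fans out to three PHI kernels depending on what `X` holds: `add_n_sr` for SelectedRows inputs, `add_n_array` for a vector of DenseTensors, and `add_n` otherwise. A toy sketch of just that decision order — the enum and function below are hypothetical; the real code queries an `ArgumentMappingContext` and returns a `KernelSignature`:

// Hypothetical input classification standing in for the ctx.Is*Input queries.
#include <iostream>
#include <string>

enum class InputKind { kSelectedRows, kDenseTensorVector, kDense };

std::string PickAddNKernel(InputKind kind) {
  if (kind == InputKind::kSelectedRows) return "add_n_sr";
  if (kind == InputKind::kDenseTensorVector) return "add_n_array";
  return "add_n";  // plain DenseTensor inputs
}

int main() {
  std::cout << PickAddNKernel(InputKind::kSelectedRows) << "\n";  // add_n_sr
  std::cout << PickAddNKernel(InputKind::kDense) << "\n";         // add_n
}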
@@ -109,3 +109,7 @@ cc_test(
   test_strings_lower_upper_api
   SRCS test_strings_lower_upper_api.cc
   DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_add_n_api
+  SRCS test_add_n_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/api/lib/api_custom_impl.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/selected_rows.h"
PD_DECLARE_KERNEL(add_n_sr, CPU, ALL_LAYOUT);
namespace paddle {
namespace tests {
TEST(API, add_n) {
// 1. create tensor
std::vector<int64_t> rows = {0, 1, 2, 3, 4, 5, 6};
int64_t row_numel = 12;
auto x_sr = std::make_shared<phi::SelectedRows>(rows, 10);
auto x_meta = phi::DenseTensorMeta(
phi::DataType::FLOAT32,
phi::make_ddim({static_cast<int64_t>(rows.size()), row_numel}),
phi::DataLayout::NCHW);
x_sr->mutable_value()->set_meta(x_meta);
x_sr->AllocateFrom(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get(),
phi::DataType::FLOAT32);
auto* dense_x_data = x_sr->mutable_value()->data<float>();
auto y_sr = std::make_shared<phi::SelectedRows>(rows, 10);
y_sr->mutable_value()->set_meta(x_meta);
y_sr->AllocateFrom(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get(),
phi::DataType::FLOAT32);
auto* dense_y_data = y_sr->mutable_value()->data<float>();
float sum[84] = {0.0};
for (size_t i = 0; i < 7; ++i) {
for (size_t j = 0; j < 12; ++j) {
dense_x_data[i * 12 + j] = (i * 4 + j);
dense_y_data[i * 12 + j] = (i * 4 + j);
sum[i * 12 + j] += (i * 4 + j) * 2;
}
}
paddle::experimental::Tensor x(x_sr);
paddle::experimental::Tensor y(y_sr);
auto out = paddle::experimental::add_n_impl({x, y});
// check slice result
ASSERT_EQ(
static_cast<int>(std::dynamic_pointer_cast<phi::SelectedRows>(out.impl())
->rows()
.size()),
7);
for (int64_t i = 0; i < 84; ++i) {
ASSERT_EQ(sum[i],
std::dynamic_pointer_cast<phi::SelectedRows>(out.impl())
->value()
.data<float>()[i]);
}
}
} // namespace tests
} // namespace paddle
@@ -77,6 +77,10 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext {
     return selected_rows_inputs.count(name) > 0;
   }

+  bool IsSelectedRowsInputs(const std::string& name) const override {
+    return selected_rows_inputs.count(name) > 0;
+  }
+
   // add member if needed
   bool IsDenseTensorVectorInput(const std::string& name) const override {
     return false;
......
@@ -1197,9 +1197,7 @@ class Optimizer(object):
         assert regularization_term is not None

         if framework.in_dygraph_mode():
-            if grad.is_dense() and regularization_term.is_dense():
-                return _C_ops.add_n([grad, regularization_term])
-            return _legacy_C_ops.sum([grad, regularization_term])
+            return _C_ops.add_n([grad, regularization_term])
         elif framework._in_legacy_dygraph():
             return _legacy_C_ops.sum([grad, regularization_term])
......
@@ -1525,9 +1525,6 @@ def add_n(inputs, name=None):
     if in_dygraph_mode():
         if isinstance(inputs, Variable):
             inputs = [inputs]
-        for x in inputs:
-            if not x.is_dense():
-                return _legacy_C_ops.sum(inputs, 'use_mkldnn', False)
         return _C_ops.add_n(inputs)

     if _in_legacy_dygraph():
         if isinstance(inputs, Variable):
......