Unverified commit 5caa6fc5, authored by chentianyu03, committed by GitHub

[PTen] Add variable transform to/from ptenTensor and add cast kernel (#36916)

* add cast kernel

* add cast cuda kernel

* add cast kernel

* make cast kernel output dtype undefined

* get cast dtype from vardesc

* move cast to manipulation and add test case

* add castinfershape

* avoid reinitializing variable

* InitializeVariable support datatype

* merge develop branch

* fix merge bug

* revert modify initializeVariable

* revert modify on InitializeVariable

* revert modify on InitializeVariable

* mutable support reset dtype

* enable make pten tensor from variable when def_arg.type is undefined

* fix build pten ctx start_idx error

* copy pten out tensor to variable

* merge develop branch

* fix non-pten kernel cast failure

* add reset allocation place for remake tensor

* fix inplace realloc error

* add mutable on pten kernels and remove unused cast files

* rename function names

* fix output type error

* fix conflict with develop branch

* set data type to variable with pten's dtype

* fix test_cast_api type mismatch

* DenseTensor mutable_data supports 0-byte values

* fix the inplace bug of reshape kernel

* fix pten.backend != variable.place when moving storage, place mismatch bug

* fix conflict with develop branch

* Fix bug of paddle::experimental::MovesStorage

* fix ReMakePtenDenseTensor place mismatch bug

* Revert "fix ReMakePtenDenseTensor place mismatch bug"

This reverts commit 86336032f60b8a15eacd2c1ff2fa513f5d8dfd1a.

* fix ReMakePtenDenseTensor place mismatch bug

* reverts the set_lod interface, test=develop

* modify by the review options

* modify error message

* add & for const input arguments

* add reference in params

* elementwise_sub add mutable_data

* fix ResetHolderWithType check size bug

* add dependence pten_tensor to test_cast_api object

* remove unused code to pass ci coverage
Co-authored-by: Chen Weihang <chenweihang@baidu.com>
Co-authored-by: YuanRisheng <yuanrisheng@baidu.com>
Co-authored-by: shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
Parent 075c22f6
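
For orientation before the diff: this commit adds a cast kernel to pten and a high-level paddle::experimental::cast API. Below is a minimal sketch of calling that API from C++; it mirrors the test_cast_api.cc test added later in this diff and assumes a CPU build with the pten CPU manipulation kernels registered (the helper function name is illustrative only).

#include "paddle/pten/api/include/manipulation.h"  // paddle::experimental::cast
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/core/dense_tensor.h"

// Build a float32 DenseTensor on CPU, wrap it in a high-level Tensor, and
// cast it to float64 through the new API added by this commit.
paddle::experimental::Tensor CastFloat32ToFloat64Example() {
  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
  auto dense_x = std::make_shared<pten::DenseTensor>(
      alloc,
      pten::DenseTensorMeta(pten::DataType::FLOAT32,
                            paddle::framework::make_ddim({3, 4}),
                            pten::DataLayout::NCHW));
  auto* dense_x_data = dense_x->mutable_data<float>();
  for (int64_t i = 0; i < dense_x->numel(); ++i) {
    dense_x_data[i] = static_cast<float>(i);
  }
  paddle::experimental::Tensor x(dense_x);
  // The kernel is selected from x's backend/layout/dtype; out_dtype is passed
  // to the kernel as a pten::DataType attribute.
  return paddle::experimental::cast(x, pten::DataType::FLOAT64);
}
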
......@@ -1183,6 +1183,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
}
BuildPtenKernelContext(*runtime_ctx, dev_ctx);
(*pt_kernel_)(pt_kernel_context_.get());
WriteBackToOutputs(runtime_ctx);
pt_kernel_context_->ClearData();
} else {
(*kernel_func_)(
......@@ -1808,50 +1811,98 @@ void OperatorWithKernel::BuildPtenKernelContext(
for (size_t i = 0; i < input_names.size(); ++i) {
auto& in_def = input_defs.at(i);
auto& ins_vector = ctx.inputs.at(input_names[i]);
if (pt_kernel_context_->InputsSize() <= i) {
// calculate the start and end index of the input tensors
size_t start_idx =
(i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second);
size_t end_idx = start_idx + ins_vector.size();
// The current input/output size of pt_kernel_context_ is at least equal to
// start_idx. Since the inputs/outputs already allocated in pt_kernel_context_
// are reused, the current size can be greater than the index the tensor
// should be set at; in that case ReMakePtenDenseTensorFromVar is used to
// remake the pten tensor in place.
if (pt_kernel_context_->InputsSize() == start_idx) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
for (auto* var : ins_vector) {
tmp_inputs.emplace_back(
experimental::MakePtenTensorBaseFromVar(*var, in_def));
}
pt_kernel_context_->EmplaceBackInputs(std::move(tmp_inputs));
} else {
} else if (pt_kernel_context_->InputsSize() > start_idx) {
size_t input_size = pt_kernel_context_->InputsSize();
for (size_t j = 0; j < ins_vector.size(); ++j) {
if (input_size > i + j) {
if (input_size > start_idx + j) {
experimental::ReMakePtenDenseTensorFromVar(
*ins_vector[j], in_def,
pt_kernel_context_->MutableInputAt<pten::DenseTensor>(i + j));
pt_kernel_context_->MutableInputAt<pten::DenseTensor>(start_idx +
j));
// TODO(chentianyu03): Enable this code when multi-input kernels are supported
/*
} else {
pt_kernel_context_->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(*ins_vector[j],
in_def));
*/
}
// TODO(chenweihang): adapt multi-input case later
}
pt_kernel_context_->MutableInputRangeAt(i) =
std::make_pair(i, i + ins_vector.size());
std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.inputs.size() is "
"`%d`.",
start_idx, pt_kernel_context_->InputsSize()));
}
}
for (size_t i = 0; i < output_names.size(); ++i) {
auto& out_def = output_defs.at(i);
auto& outs_vector = ctx.outputs.at(output_names[i]);
if (pt_kernel_context_->OutputsSize() <= i) {
size_t start_idx =
(i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second);
size_t end_idx = start_idx + outs_vector.size();
// The current input/output size of pt_kernel_context_ is at least equal to
// start_idx. Since the inputs/outputs already allocated in pt_kernel_context_
// are reused, the current size can be greater than the index the tensor
// should be set at; in that case ReMakePtenDenseTensorFromVar is used to
// remake the pten tensor in place.
if (pt_kernel_context_->OutputsSize() == start_idx) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
for (auto* var : outs_vector) {
tmp_outputs.emplace_back(
experimental::MakePtenTensorBaseFromVar(var, out_def));
}
pt_kernel_context_->EmplaceBackOutputs(std::move(tmp_outputs));
} else {
} else if (pt_kernel_context_->OutputsSize() > start_idx) {
size_t output_size = pt_kernel_context_->OutputsSize();
for (size_t j = 0; j < outs_vector.size(); ++j) {
if (output_size > i + j) {
if (output_size > start_idx + j) {
experimental::ReMakePtenDenseTensorFromVar(
outs_vector[j], out_def,
pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(i + j));
pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(start_idx +
j));
// TODO(chentianyu03): Enable this code when multi-output kernels are supported
/*
} else {
pt_kernel_context_->EmplaceBackOutputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(outs_vector[j],
out_def));
*/
}
// TODO(chenweihang): adapt multi-output case later
}
pt_kernel_context_->MutableOutputRangeAt(i) =
std::make_pair(i, i + outs_vector.size());
std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.outputs.size() is "
"`%d`.",
start_idx, pt_kernel_context_->OutputsSize()));
}
}
......@@ -1883,14 +1934,23 @@ void OperatorWithKernel::BuildPtenKernelContext(
} else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
} else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<int64_t>)) &&
std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int>))) {
// Emplace Back Attr according to the type of Pten_Kernel args.
const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
const std::vector<int64_t> vector_int64_attr(vector_int_attr.begin(),
vector_int_attr.end());
pt_kernel_context_->EmplaceBackAttr(vector_int64_attr);
std::type_index(typeid(pten::DataType))) {
auto data_type = pten::TransToPtenDataType(
static_cast<framework::proto::VarType::Type>(
BOOST_GET_CONST(int, attr)));
pt_kernel_context_->EmplaceBackAttr(data_type);
} else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<int64_t>))) {
if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int>))) {
// Emplace Back Attr according to the type of Pten_Kernel args.
const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
const std::vector<int64_t> vector_int64_attr(vector_int_attr.begin(),
vector_int_attr.end());
pt_kernel_context_->EmplaceBackAttr(vector_int64_attr);
}
// TODO(YuanRisheng): Need to support vector<int64_t> attr
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"unsupported cast op attribute `%s` when construct "
......@@ -1901,5 +1961,26 @@ void OperatorWithKernel::BuildPtenKernelContext(
}
}
void OperatorWithKernel::WriteBackToOutputs(RuntimeContext* ctx) const {
// auto& input_names = std::get<0>(pt_kernel_signature_->args);
// auto& attr_names = std::get<1>(pt_kernel_signature_->args);
auto& output_names = std::get<2>(pt_kernel_signature_->args);
// pt_kernel_context_
for (size_t i = 0; i < output_names.size(); ++i) {
auto& outs_vector = ctx->outputs.at(output_names[i]);
auto& range_pair = pt_kernel_context_->OutputRangeAt(i);
auto pten_outs =
pt_kernel_context_->MutableOutputBetween<pten::DenseTensor>(
range_pair.first, range_pair.second);
for (size_t j = 0; j < pten_outs.size(); ++j) {
experimental::MakeVariableFromPtenTensor(pten_outs[j], outs_vector[j]);
}
}
}
} // namespace framework
} // namespace paddle
......@@ -589,6 +589,8 @@ class OperatorWithKernel : public OperatorBase {
void BuildPtenKernelContext(const RuntimeContext& ctx,
platform::DeviceContext* dev_ctx) const;
void WriteBackToOutputs(RuntimeContext* ctx) const;
protected:
mutable std::unique_ptr<OpKernelType> kernel_type_;
mutable std::unique_ptr<OpKernelFunc> kernel_func_;
......
......@@ -204,10 +204,12 @@ void Tensor::ResetHolder(std::shared_ptr<memory::Allocation> holder) {
}
void Tensor::ResetHolderWithType(std::shared_ptr<memory::Allocation> holder,
const proto::VarType::Type type) {
ResetHolder(holder);
const proto::VarType::Type& type) {
type_ = type;
ResetHolder(holder);
}
void Tensor::set_type(const proto::VarType::Type& type) { type_ = type; }
} // namespace framework
} // namespace paddle
......@@ -271,7 +271,9 @@ class Tensor {
void ResetHolder(std::shared_ptr<memory::Allocation> holder);
void ResetHolderWithType(std::shared_ptr<memory::Allocation> holder,
const proto::VarType::Type type);
const proto::VarType::Type& type);
void set_type(const proto::VarType::Type& type);
TensorInplaceVersion& InplaceVersionCounter() {
return *inplace_version_counter_;
......
......@@ -295,7 +295,16 @@ static void BuildDygraphPtenKernelContext(
for (size_t i = 0; i < input_names.size(); ++i) {
auto& in_def = input_defs.at(i);
auto& ins_vector = ins.at(input_names[i]);
if (kernel_ctx->InputsSize() <= i) {
size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second);
size_t end_idx = start_idx + ins_vector.size();
// The current input/output size of pt_kernel_context_ is at least equal to
// start_idx. Since the inputs/outputs already allocated in pt_kernel_context_
// are reused, the current size can be greater than the index the tensor
// should be set at; in that case ReMakePtenDenseTensorFromVar is used to
// remake the pten tensor in place.
if (kernel_ctx->InputsSize() == start_idx) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
for (const auto& var : ins_vector) {
const auto& variable = var->Var();
......@@ -303,25 +312,45 @@ static void BuildDygraphPtenKernelContext(
experimental::MakePtenTensorBaseFromVar(variable, in_def));
}
kernel_ctx->EmplaceBackInputs(std::move(tmp_inputs));
} else {
} else if (kernel_ctx->InputsSize() > start_idx) {
size_t input_size = kernel_ctx->InputsSize();
for (size_t j = 0; j < ins_vector.size(); ++j) {
if (input_size > i + j) {
if (input_size > start_idx + j) {
experimental::ReMakePtenDenseTensorFromVar(
ins_vector[j]->Var(), in_def,
kernel_ctx->MutableInputAt<pten::DenseTensor>(i + j));
kernel_ctx->MutableInputAt<pten::DenseTensor>(start_idx + j));
// TODO(chentianyu03): Enable this code when multi-input kernels are supported
/*
} else {
kernel_ctx->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(ins_vector[j]->Var(),
in_def));
*/
}
// TODO(chenweihang): adapt multi-input case later
}
kernel_ctx->MutableInputRangeAt(i) =
std::make_pair(i, i + ins_vector.size());
kernel_ctx->MutableInputRangeAt(i) = std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.inputs.size() is "
"`%d`.",
start_idx, kernel_ctx->InputsSize()));
}
}
for (size_t i = 0; i < output_names.size(); ++i) {
auto& out_def = output_defs.at(i);
auto& outs_vector = outs.at(output_names[i]);
if (kernel_ctx->OutputsSize() <= i) {
size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second);
size_t end_idx = start_idx + outs_vector.size();
// The current input/output size of pt_kernel_context_ is at least equal to
// start_idx. Since the inputs/outputs already allocated in pt_kernel_context_
// are reused, the current size can be greater than the index the tensor
// should be set at; in that case ReMakePtenDenseTensorFromVar is used to
// remake the pten tensor in place.
if (kernel_ctx->OutputsSize() == start_idx) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
for (auto& var : outs_vector) {
auto* variable = var->MutableVar();
......@@ -329,18 +358,29 @@ static void BuildDygraphPtenKernelContext(
experimental::MakePtenTensorBaseFromVar(variable, out_def));
}
kernel_ctx->EmplaceBackOutputs(std::move(tmp_outputs));
} else {
} else if (kernel_ctx->OutputsSize() > start_idx) {
size_t output_size = kernel_ctx->OutputsSize();
for (size_t j = 0; j < outs_vector.size(); ++j) {
if (output_size > i + j) {
experimental::ReMakePtenDenseTensorFromVar(
outs_vector[j]->MutableVar(), out_def,
kernel_ctx->MutableOutputAt<pten::DenseTensor>(i + j));
// TODO(chentianyu03): Enable this code when multi-output kernels are supported
/*
} else {
kernel_ctx->EmplaceBackOutputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(
outs_vector[j]->MutableVar(), out_def));
*/
}
// TODO(chenweihang): adapt multi-output case later
}
kernel_ctx->MutableOutputRangeAt(i) =
std::make_pair(i, i + outs_vector.size());
kernel_ctx->MutableOutputRangeAt(i) = std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.outputs.size() is "
"`%d`.",
start_idx, kernel_ctx->OutputsSize()));
}
}
......@@ -372,14 +412,22 @@ static void BuildDygraphPtenKernelContext(
} else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
} else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<int64_t>)) &&
std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int>))) {
// Emplace Back Attr according to the type of Pten_Kernel args.
const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
const std::vector<int64_t> vector_int64_attr(vector_int_attr.begin(),
vector_int_attr.end());
kernel_ctx->EmplaceBackAttr(vector_int64_attr);
std::type_index(typeid(pten::DataType))) {
auto data_type = pten::TransToPtenDataType(
static_cast<framework::proto::VarType::Type>(
BOOST_GET_CONST(int, attr)));
kernel_ctx->EmplaceBackAttr(data_type);
} else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<int64_t>))) {
if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int>))) {
// Emplace Back Attr according to the type of Pten_Kernel args.
const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
const std::vector<int64_t> vector_int64_attr(vector_int_attr.begin(),
vector_int_attr.end());
kernel_ctx->EmplaceBackAttr(vector_int64_attr);
}
// TODO(YuanRisheng): Need to support vector<int64_t> attr
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"unsupported cast op attribute `%s` when construct "
......@@ -390,6 +438,26 @@ static void BuildDygraphPtenKernelContext(
}
}
template <typename VarType>
static void WriteBackToOutputs(
const framework::KernelSignature& pt_kernel_signature,
const NameVarMap<VarType>& outs, pten::KernelContext* kernel_ctx) {
auto& output_names = std::get<2>(pt_kernel_signature.args);
for (size_t i = 0; i < output_names.size(); ++i) {
auto& outs_vector = outs.at(output_names[i]);
auto& range_pair = kernel_ctx->OutputRangeAt(i);
auto pten_outs = kernel_ctx->MutableOutputBetween<pten::DenseTensor>(
range_pair.first, range_pair.second);
for (size_t j = 0; j < pten_outs.size(); ++j) {
experimental::MakeVariableFromPtenTensor(pten_outs[j],
outs_vector[j]->MutableVar());
}
}
}
template <typename VarType>
static void PreparedOpRunImpl(
const framework::OperatorBase& op, const framework::RuntimeContext& ctx,
......@@ -414,19 +482,6 @@ static void PreparedOpRunImpl(
op.Type(), outs, dev_ctx->GetPlace());
}
/*For profiling/benchmark only*/
if (FLAGS_benchmark) {
dev_ctx->Wait();
#if defined(PADDLE_WITH_CUDA)
PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error";
#endif
#if defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError());
VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error";
#endif
}
/**
* [ Why need handle complex gradient to real gradient? ]
*
......@@ -463,6 +518,20 @@ static void PreparedOpRunPtImpl(
pt_kernel(pt_kernel_context);
if (FLAGS_benchmark) {
dev_ctx->Wait();
#if defined(PADDLE_WITH_CUDA)
PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error";
#endif
#if defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError());
VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error";
#endif
}
WriteBackToOutputs<VarType>(pt_kernel_signature, outs, pt_kernel_context);
// Ensure that it does not affect the VarBase life cycle management
pt_kernel_context->ClearData();
......
......@@ -18,6 +18,10 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/transform.h"
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/include/core.h"
#include "paddle/pten/include/manipulation.h"
namespace paddle {
namespace operators {
......@@ -53,11 +57,26 @@ class CastOpKernel : public framework::OpKernel<InT> {
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
framework::VisitDataType(
static_cast<framework::proto::VarType::Type>(
context.Attr<int>("out_dtype")),
CastOpFunctor<DeviceContext, InT>(
in, out, context.template device_context<DeviceContext>()));
auto out_dtype = context.Attr<int>("out_dtype");
// TODO: in_dtype is currently unused
auto in_dtype = context.Attr<int>("in_dtype");
auto& dev_ctx = context.device_context<DeviceContext>();
out->mutable_data(dev_ctx.GetPlace(),
static_cast<framework::proto::VarType::Type>(out_dtype));
auto pt_x = paddle::experimental::MakePtenDenseTensor(*in);
auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
auto pt_out_dtype = pten::TransToPtenDataType(
static_cast<framework::proto::VarType::Type>(out_dtype));
auto pt_in_dtype = pten::TransToPtenDataType(
static_cast<framework::proto::VarType::Type>(in_dtype));
// call new kernel
pten::Cast<InT>(dev_ctx, *pt_x.get(), pt_out_dtype, pt_in_dtype,
pt_out.get());
}
};
......
......@@ -552,14 +552,13 @@ class Reshape2Op : public ReshapeOp {
const framework::ExecutionContext &ctx) const override {
auto multi_inputs = ctx.MultiInput<framework::Tensor>("ShapeTensor");
if (multi_inputs.size() > 0) {
return framework::KernelSignature(
"reshape2.mulhost.mid", {"X", "ShapeTensor"}, {}, {"XShape", "Out"});
return framework::KernelSignature("reshape2.mulhost",
{"X", "ShapeTensor"}, {}, {"Out"});
} else if (ctx.HasInput("Shape")) {
return framework::KernelSignature("reshape2.host.mid", {"X", "Shape"}, {},
{"XShape", "Out"});
return framework::KernelSignature("reshape2.host", {"X", "Shape"}, {},
{"Out"});
} else {
return framework::KernelSignature("reshape2.mid", {"X"}, {"shape"},
{"XShape", "Out"});
return framework::KernelSignature("reshape2", {"X"}, {"shape"}, {"Out"});
}
}
};
......
......@@ -195,4 +195,50 @@ namespace paddle {
// TODO(chenweihang): Add more Macros in the future if needed
#define PD_VISIT_ALL_TYPES(TYPE, NAME, ...) \
[&] { \
const auto& __dtype__ = TYPE; \
switch (__dtype__) { \
PD_PRIVATE_CASE_TYPE(NAME, ::pten::DataType::BOOL, bool, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::pten::DataType::INT8, int8_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::pten::DataType::UINT8, uint8_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::pten::DataType::INT16, int16_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::pten::DataType::UINT16, uint16_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::pten::DataType::INT32, int32_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::pten::DataType::UINT32, uint32_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::pten::DataType::INT64, int64_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::pten::DataType::UINT64, uint64_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, \
::pten::DataType::BFLOAT16, \
paddle::experimental::bfloat16, \
__VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, \
::pten::DataType::FLOAT16, \
paddle::experimental::float16, \
__VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::pten::DataType::FLOAT32, float, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::pten::DataType::FLOAT64, double, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, \
::pten::DataType::COMPLEX64, \
paddle::experimental::complex64, \
__VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, \
::pten::DataType::COMPLEX128, \
paddle::experimental::complex128, \
__VA_ARGS__) \
default: \
PADDLE_THROW(paddle::platform::errors::InvalidArgument( \
"Invalid enum data type `%d`.", static_cast<int>(__dtype__))); \
} \
}()
} // namespace paddle
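
The PD_VISIT_ALL_TYPES macro above expands to a switch over the runtime pten::DataType and runs the supplied lambda with the alias data_t bound to the matching C++ type; the new cast kernels later in this diff use it to dispatch on out_dtype. A minimal sketch of that pattern follows (ElementSize is a hypothetical helper, not part of this commit):

// Hypothetical helper: element size in bytes for a runtime pten::DataType,
// resolved through the dispatch macro added above.
size_t ElementSize(pten::DataType dtype) {
  size_t size = 0;
  PD_VISIT_ALL_TYPES(dtype, "ElementSize", ([&] {
                       // Inside the lambda, data_t is the C++ type selected
                       // for `dtype` (e.g. float for FLOAT32).
                       size = sizeof(data_t);
                     }));
  return size;
}
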
......@@ -21,6 +21,8 @@ namespace experimental {
PD_DLL_DECL Tensor flatten(const Tensor& x, int start_axis, int stop_axis);
PD_DLL_DECL Tensor cast(const Tensor& x, DataType out_dtype);
PD_DLL_DECL Tensor reshape(const Tensor& x, const std::vector<int64_t>& shape);
} // namespace experimental
} // namespace paddle
......@@ -60,6 +60,40 @@ PD_DLL_DECL Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
return out;
}
PD_DLL_DECL Tensor cast(const Tensor& x, DataType out_dtype) {
// 1. Get kernel signature and kernel
auto kernel_key_set = ParseKernelKeyByInputArgs(x);
auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
"cast", kernel_key);
// 2. Get Device Context
auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
auto kernel_context = pten::KernelContext(dev_ctx);
// 3. Auto data transform
auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
kernel_context.EmplaceBackInput(dense_x);
kernel_context.EmplaceBackAttr(out_dtype);
kernel_context.EmplaceBackAttr(dense_x->meta().dtype);
// 4. InferShape
auto out_meta = CastInferMeta(dense_x->meta(), out_dtype);
// 5. Prepare outputs
Tensor out;
const auto allocator = std::make_shared<DefaultAllocator>(
pten::TransToFluidPlace(kernel_key.backend()));
auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
kernel_context.EmplaceBackOutput(dense_out);
out.set_impl(dense_out);
// 6. Call kernel
kernel(&kernel_context);
return out;
}
PD_DLL_DECL Tensor reshape(const Tensor& x, const std::vector<int64_t>& shape) {
// 1. Get kernel signature and kernel
auto kernel_key_set = ParseKernelKeyByInputArgs(x);
......
......@@ -76,10 +76,6 @@ class SharedStorage : public pten::Storage {
// system, we need to allow the SharedStorage realloc,
// and it can be removed after the compatibility phase is over in the future
void Realloc(size_t n) override {
if (data() != nullptr) {
PADDLE_THROW(paddle::platform::errors::Unavailable(
"The external shared storage cannot be reallocated."));
}
ResetAllocation(paddle::memory::AllocShared(place(), n), 0);
}
......@@ -109,9 +105,16 @@ class SharedStorage : public pten::Storage {
size_ = allocation->size();
}
// Temporary method: for compatibility with fluid Tensor and to improve performance
void ResetAllocationPlace(const paddle::platform::Place& place) {
data_ = pten::Allocation(nullptr, place);
}
// Temporary method: for compatibility with fluid Tensor and to improve performance
void Reset() {
allocation_.reset();
if (allocation_ != nullptr) {
allocation_.reset();
}
data_.Clear();
size_ = 0;
}
......
......@@ -54,6 +54,49 @@ std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
std::move(meta));
}
std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
const paddle::framework::Tensor& tensor,
const pten::TensorArgDef& arg_def) {
pten::DenseTensorMeta meta{arg_def.dtype,
tensor.dims(),
pten::TransToPtenDataLayout(tensor.layout())};
if (tensor.IsInitialized() &&
tensor.place() == pten::TransToFluidPlace(arg_def.backend)) {
auto shared_storage =
pten::make_intrusive<SharedStorage>(tensor.Holder(), tensor.offset());
return std::make_unique<pten::DenseTensor>(std::move(shared_storage),
std::move(meta));
} else {
return std::make_unique<pten::DenseTensor>(
std::move(pten::make_intrusive<SharedStorage>(
pten::TransToFluidPlace(arg_def.backend))),
std::move(meta));
}
}
std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
const paddle::framework::LoDTensor& tensor,
const pten::TensorArgDef& arg_def) {
pten::DenseTensorMeta meta{arg_def.dtype,
tensor.dims(),
pten::TransToPtenDataLayout(tensor.layout()),
pten::TransToPtenLoD(tensor.lod())};
if (tensor.IsInitialized() &&
tensor.place() == pten::TransToFluidPlace(arg_def.backend)) {
auto shared_storage =
pten::make_intrusive<SharedStorage>(tensor.Holder(), tensor.offset());
return std::make_unique<pten::DenseTensor>(std::move(shared_storage),
std::move(meta));
} else {
return std::make_unique<pten::DenseTensor>(
std::move(pten::make_intrusive<SharedStorage>(
pten::TransToFluidPlace(arg_def.backend))),
std::move(meta));
}
}
std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar(
const framework::Variable& variable, const pten::TensorArgDef& arg_def) {
auto expected_place = pten::TransToFluidPlace(arg_def.backend);
......@@ -93,17 +136,12 @@ std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar(
// KernelContext to original tensor
if (variable->template IsType<framework::LoDTensor>()) {
auto* tensor = variable->template GetMutable<framework::LoDTensor>();
tensor->mutable_data(pten::TransToFluidPlace(arg_def.backend),
pten::TransToProtoVarType(arg_def.dtype));
return MakePtenDenseTensor(*tensor);
return MakePtenDenseTensor(*tensor, arg_def);
} else if (variable->template IsType<framework::SelectedRows>()) {
auto* tensor = variable->template GetMutable<framework::SelectedRows>();
tensor->mutable_value()->mutable_data(
pten::TransToFluidPlace(arg_def.backend),
pten::TransToProtoVarType(arg_def.dtype));
// TODO(chenweihang): adapt SelectedRows by xiaowei's design,
// here the rows and height will be lost in the output!
return MakePtenDenseTensor(tensor->value());
return MakePtenDenseTensor(tensor->value(), arg_def);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported shared output `%s` type now when call pt kernel.",
......@@ -122,6 +160,7 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) {
platform::errors::InvalidArgument(
"The destination Tensor is nullptr when move storage."));
dst->Resize(src->dims());
dst->set_type(pten::TransToProtoVarType(src->dtype()));
auto storage = src->release();
std::shared_ptr<paddle::memory::allocation::Allocation> holder(
new TensorStorage(std::move(storage)));
......@@ -142,40 +181,53 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) {
}
void ReMakePtenDenseTensor(const paddle::framework::Tensor& src,
const pten::TensorArgDef& arg_def,
pten::DenseTensor* dst) {
auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst);
meta->dims = src.dims();
// Since the type of DenseTensorMeta is const, const_cast must be used
const_cast<DataType&>(meta->dtype) = pten::TransToPtenDataType(src.type());
const_cast<DataType&>(meta->dtype) = arg_def.dtype;
// Since the type of DenseTensorMeta is const, const_cast must be used
const_cast<DataLayout&>(meta->layout) =
pten::TransToPtenDataLayout(src.layout());
auto* shared_storage = static_cast<SharedStorage*>(
pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst));
PADDLE_ENFORCE_NOT_NULL(
shared_storage,
platform::errors::NotFound(
"Target DenseTensor's shared storage is nullptr."));
shared_storage->ResetAllocation(src.Holder(), src.offset());
if (src.IsInitialized()) {
shared_storage->ResetAllocation(src.Holder(), src.offset());
}
}
void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src,
const pten::TensorArgDef& arg_def,
pten::DenseTensor* dst) {
auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst);
meta->dims = src.dims();
// Since the type of DenseTensorMeta is const, const_cast must be used
const_cast<DataType&>(meta->dtype) = pten::TransToPtenDataType(src.type());
const_cast<DataType&>(meta->dtype) = arg_def.dtype;
// Since the type of DenseTensorMeta is const, const_cast must be used
const_cast<DataLayout&>(meta->layout) =
pten::TransToPtenDataLayout(src.layout());
SetLoD(&(meta->lod), src.lod());
auto* shared_storage = static_cast<SharedStorage*>(
pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst));
PADDLE_ENFORCE_NOT_NULL(
shared_storage,
platform::errors::NotFound(
"Target DenseTensor's shared storage is nullptr."));
shared_storage->ResetAllocation(src.Holder(), src.offset());
if (src.IsInitialized() &&
src.place() == pten::TransToFluidPlace(arg_def.backend)) {
shared_storage->ResetAllocation(src.Holder(), src.offset());
} else {
shared_storage->ResetAllocationPlace(
pten::TransToFluidPlace(arg_def.backend));
}
}
void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
......@@ -188,9 +240,9 @@ void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
if (!platform::is_same_place(tensor.place(), expected_place)) {
framework::LoDTensor tmp_tensor;
framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
ReMakePtenDenseTensor(tmp_tensor, dst);
ReMakePtenDenseTensor(tmp_tensor, arg_def, dst);
} else {
ReMakePtenDenseTensor(tensor, dst);
ReMakePtenDenseTensor(tensor, arg_def, dst);
}
} else if (variable.IsType<framework::SelectedRows>()) {
// TODO(chenweihang): now we don't deal with row and height
......@@ -200,9 +252,9 @@ void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
framework::Tensor tmp_tensor;
TensorCopySync(tensor.value(), expected_place, &tmp_tensor);
// TODO(chenweihang): adapt SelectedRows by xiaowei's design
ReMakePtenDenseTensor(tmp_tensor, dst);
ReMakePtenDenseTensor(tmp_tensor, arg_def, dst);
} else {
ReMakePtenDenseTensor(tensor.value(), dst);
ReMakePtenDenseTensor(tensor.value(), arg_def, dst);
}
} else {
PADDLE_THROW(platform::errors::Unimplemented(
......@@ -218,18 +270,12 @@ void ReMakePtenDenseTensorFromVar(framework::Variable* variable,
// KernelContext to original tensor
if (variable->template IsType<framework::LoDTensor>()) {
auto* tensor = variable->template GetMutable<framework::LoDTensor>();
// TODO(chenweihang): use original var type if arg_def.dtype is UNDEFINED
tensor->mutable_data(pten::TransToFluidPlace(arg_def.backend),
pten::TransToProtoVarType(arg_def.dtype));
ReMakePtenDenseTensor(*tensor, dst);
ReMakePtenDenseTensor(*tensor, arg_def, dst);
} else if (variable->template IsType<framework::SelectedRows>()) {
auto* tensor = variable->template GetMutable<framework::SelectedRows>();
tensor->mutable_value()->mutable_data(
pten::TransToFluidPlace(arg_def.backend),
pten::TransToProtoVarType(arg_def.dtype));
// TODO(chenweihang): adapt SelectedRows by xiaowei's design,
// here the rows and height will be lost in the output!
ReMakePtenDenseTensor(tensor->value(), dst);
ReMakePtenDenseTensor(tensor->value(), arg_def, dst);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported shared output `%s` type now when call pt kernel.",
......@@ -237,5 +283,53 @@ void ReMakePtenDenseTensorFromVar(framework::Variable* variable,
}
}
static bool IsSameAllocation(const std::shared_ptr<memory::Allocation>& a,
const std::shared_ptr<memory::Allocation>& b) {
return a->ptr() == b->ptr() && a->size() == b->size() &&
platform::is_same_place(a->place(), b->place());
}
void MakeVariableFromPtenTensor(pten::DenseTensor* src,
framework::Variable* variable) {
if (variable->IsType<framework::LoDTensor>()) {
auto* tensor = variable->GetMutable<framework::LoDTensor>();
auto dtype = pten::TransToProtoVarType(src->dtype());
tensor->Resize(src->dims());
SetLoD(tensor->mutable_lod(), src->lod());
// here dynamic_cast is slow
auto* storage = static_cast<SharedStorage*>(
pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src));
if (!tensor->IsInitialized() ||
(tensor->IsInitialized() &&
!IsSameAllocation(tensor->Holder(), storage->GetAllocation()))) {
tensor->ResetHolderWithType(std::move(storage->GetAllocation()), dtype);
} else {
// Even if the pten tensor and the Variable hold the same Allocation (the
// same pointer address, size and place), they may still have different
// data types, so here we set the variable's type to the pten tensor's dtype.
tensor->set_type(dtype);
}
} else if (variable->IsType<framework::SelectedRows>()) {
auto* tensor = variable->GetMutable<framework::SelectedRows>();
auto dtype = pten::TransToProtoVarType(src->dtype());
if (!tensor->value().IsInitialized()) {
auto storage = dynamic_cast<SharedStorage*>(
pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src));
tensor->mutable_value()->ResetHolderWithType(
std::move(storage->GetAllocation()), dtype);
}
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported shared input `%s` type now when call pt kernel.",
framework::ToTypeName(variable->Type())));
}
}
} // namespace experimental
} // namespace paddle
......@@ -55,9 +55,11 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst);
*/
void ReMakePtenDenseTensor(const paddle::framework::Tensor& src,
const pten::TensorArgDef& arg_def,
pten::DenseTensor* dst);
void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src,
const pten::TensorArgDef& arg_def,
pten::DenseTensor* dst);
void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
......@@ -68,5 +70,8 @@ void ReMakePtenDenseTensorFromVar(framework::Variable* variable,
const pten::TensorArgDef& arg_def,
pten::DenseTensor* dst);
void MakeVariableFromPtenTensor(pten::DenseTensor* src,
framework::Variable* variable);
} // namespace experimental
} // namespace paddle
......@@ -42,8 +42,10 @@ class CompatibleDenseTensorUtils {
// can only deal with SharedStorage now
static void ClearStorage(DenseTensor* tensor) {
// use static_cast to improve performance; replace with dynamic_cast later
static_cast<paddle::experimental::SharedStorage*>(tensor->storage_.get())
->Reset();
if (tensor->storage_ != nullptr) {
static_cast<paddle::experimental::SharedStorage*>(tensor->storage_.get())
->Reset();
}
}
static DenseTensor Slice(DenseTensor* tensor,
......
......@@ -160,4 +160,24 @@ paddle::framework::DataLayout TransToFluidDataLayout(const DataLayout& layout) {
}
}
paddle::framework::LoD TransToFluidLoD(const pten::LoD& lod) {
paddle::framework::LoD out;
out.reserve(lod.size());
for (auto& elem : lod) {
out.emplace_back(elem);
}
return out;
}
pten::LoD TransToPtenLoD(const paddle::framework::LoD& lod) {
pten::LoD out;
out.reserve(lod.size());
for (auto& elem : lod) {
out.emplace_back(elem);
}
return out;
}
} // namespace pten
......@@ -17,10 +17,12 @@ limitations under the License. */
#include "paddle/pten/common/backend.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/common/layout.h"
#include "paddle/pten/core/tensor_meta.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/place.h"
// TODO(chenweihang): this file may need to be removed
......@@ -40,4 +42,7 @@ paddle::framework::proto::VarType::Type TransToProtoVarType(
const DataType& dtype);
paddle::framework::DataLayout TransToFluidDataLayout(const DataLayout& layout);
paddle::framework::LoD TransToFluidLoD(const pten::LoD& lod);
pten::LoD TransToPtenLoD(const paddle::framework::LoD& lod);
} // namespace pten
......@@ -69,7 +69,9 @@ void* DenseTensor::mutable_data(size_t request_bytes) {
bytes));
bytes = request_bytes;
}
if (storage_->size() < bytes) {
if (storage_->size() < bytes || storage_->size() == 0) {
VLOG(10) << "mutbale data realloc, original size: " << storage_->size()
<< ", new size: " << bytes;
storage_->Realloc(bytes);
}
return storage_->data();
......@@ -81,6 +83,8 @@ T* DenseTensor::mutable_data() {
// execution system, we have to reset the datatype in mutable_data<T>.
// When the compatibility phase is over in the future, we can delete it
if (meta_.dtype == DataType::UNDEFINED) {
VLOG(10) << "change data type in mutbale_data, target dtype - "
<< paddle::experimental::CppTypeToDataType<T>::Type();
const_cast<DataType&>(meta_.dtype) =
paddle::experimental::CppTypeToDataType<T>::Type();
}
......@@ -120,12 +124,13 @@ void DenseTensor::set_meta(DenseTensorMeta&& meta) {
meta_ = std::move(meta);
}
void DenseTensor::Resize(const DDim& dims, const LoD& lod) {
void DenseTensor::Resize(const DDim& dims) {
meta_.dims = dims;
meta_.lod = lod;
mutable_data();
}
void DenseTensor::ResetLoD(const LoD& lod) { meta_.lod = lod; }
#define DATA_MEMBER_FUNC_INSTANTIATION(dtype) \
template dtype* DenseTensor::mutable_data(); \
template const dtype* DenseTensor::data() const;
......
......@@ -127,7 +127,11 @@ class DenseTensor : public TensorBase,
/// larger than the original value, the storage area will be reallocated.
/// \param dims The new dims of the dense tensor.
/// \param lod The new lod of the dense tensor.
void Resize(const DDim& dims, const LoD& lod = {});
void Resize(const DDim& dims);
/// \brief Change the lod information in the metadata.
/// \param lod The new lod of the dense tensor.
void ResetLoD(const LoD& lod);
/// \brief Returns the actual storage size occupied by tensor, may be larger
/// than its shape dims.
......
......@@ -58,6 +58,10 @@ class KernelContext {
input_range_.emplace_back(std::pair<int, int>(index, index + 1));
}
void EmplaceBackInputWithoutSetRange(std::shared_ptr<TensorBase> input) {
inputs_.emplace_back(std::move(input));
}
void EmplaceBackInputs(
paddle::SmallVector<std::shared_ptr<TensorBase>> inputs) {
int index = inputs_.size();
......@@ -76,6 +80,10 @@ class KernelContext {
output_range_.emplace_back(std::pair<int, int>(index, index + 1));
}
void EmplaceBackOutputWithoutSetRange(std::shared_ptr<TensorBase> output) {
outputs_.emplace_back(std::move(output));
}
void EmplaceBackOutputs(
paddle::SmallVector<std::shared_ptr<TensorBase>> outputs) {
int index = outputs_.size();
......@@ -171,9 +179,6 @@ class KernelContext {
size_t OutputsSize() const { return outputs_.size(); }
size_t AttrsSize() const { return attrs_.size(); }
private:
bool IsDuplicable() const { return input_range_.size() != inputs_.size(); }
private:
// DeviceContext base class
DeviceContext* dev_ctx_;
......
......@@ -207,6 +207,7 @@ struct KernelImpl<Return (*)(Args...), kernel_fn> {
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(paddle::platform::float16);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int64_t>&);
/* Output Helpers */
......
......@@ -37,6 +37,20 @@ DenseTensor Flatten(const ContextT& dev_ctx,
return dense_out;
}
template <typename T, typename ContextT>
DenseTensor Cast(const ContextT& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DataType in_dtype) {
auto out_meta = CastInferMeta(x.meta(), out_dtype);
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
Cast<T>(dev_ctx, x, out_dtype, in_dtype, &dense_out);
return dense_out;
}
template <typename T, typename ContextT>
DenseTensor Reshape(const ContextT& dev_ctx,
const DenseTensor& x,
......
......@@ -74,6 +74,12 @@ DenseTensorMeta FlattenInferShape(const DenseTensorMeta& x_meta,
return return_meta;
}
DenseTensorMeta CastInferMeta(const DenseTensorMeta& x_meta,
const DataType out_dtype) {
DenseTensorMeta out_meta(out_dtype, x_meta.dims, x_meta.layout);
return out_meta;
}
DenseTensorMeta FullLikeInferShape(const DenseTensorMeta& x_meta,
DataType dtype,
DataLayout layout) {
......
......@@ -40,6 +40,8 @@ DenseTensorMeta ReductionInferShape(const DenseTensorMeta& x_meta);
DenseTensorMeta FlattenInferShape(const DenseTensorMeta& x_meta,
int start_axis,
int stop_axis);
DenseTensorMeta CastInferMeta(const DenseTensorMeta& x_meta,
const DataType out_dtype);
DenseTensorMeta FullLikeInferShape(const DenseTensorMeta& x_meta,
DataType dtype,
......
......@@ -13,9 +13,11 @@
// limitations under the License.
#include "paddle/pten/kernels/cpu/manipulation.h"
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/infermeta/unary.h"
#include "paddle/pten/kernels/cpu/utils.h"
#include "paddle/pten/kernels/functions/general/manipulation.h"
#include "paddle/pten/kernels/functions/math/cast_func.h"
namespace pten {
......@@ -44,27 +46,17 @@ void FlattenWithXShape(const CPUContext& dev_ctx,
general::SetXShape(x, xshape);
}
void ReshapeFromVectorValImpl(const CPUContext& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& shape,
DenseTensor* out,
bool set_lod) {
auto out_meta = InferShapeFromVecValue(x.meta(), shape);
if (&x != out) {
pten::Copy(dev_ctx, x, false, out);
}
if (set_lod) {
out->Resize(out_meta.dims, out_meta.lod);
} else {
out->Resize(out_meta.dims);
}
}
void ReshapeFromVectorVal(const CPUContext& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& shape,
DenseTensor* out) {
ReshapeFromVectorValImpl(dev_ctx, x, shape, out, false);
auto out_meta = InferShapeFromVecValue(x.meta(), shape);
if (&x == out) {
out->Resize(out_meta.dims);
return;
}
pten::Copy(dev_ctx, x, false, out);
out->Resize(out_meta.dims);
}
void ReshapeFromVectorValWithXShape(const CPUContext& dev_ctx,
......@@ -72,8 +64,8 @@ void ReshapeFromVectorValWithXShape(const CPUContext& dev_ctx,
const std::vector<int64_t>& shape,
DenseTensor* xshape,
DenseTensor* out) {
ReshapeFromVectorVal(dev_ctx, x, shape, out);
general::SetXShape(x, xshape);
ReshapeFromVectorVal(dev_ctx, x, shape, out);
}
void ReshapeFromDT(const CPUContext& dev_ctx,
......@@ -83,7 +75,8 @@ void ReshapeFromDT(const CPUContext& dev_ctx,
auto* shape_data = shape.data<int>();
auto vector_shape =
std::vector<int64_t>(shape_data, shape_data + shape.numel());
ReshapeFromVectorValImpl(dev_ctx, x, vector_shape, out, true);
ReshapeFromVectorVal(dev_ctx, x, vector_shape, out);
out->ResetLoD(x.lod());
}
void ReshapeFromDTWithXShape(const CPUContext& dev_ctx,
......@@ -91,8 +84,8 @@ void ReshapeFromDTWithXShape(const CPUContext& dev_ctx,
const DenseTensor& shape,
DenseTensor* xshape,
DenseTensor* out) {
ReshapeFromDT(dev_ctx, x, shape, out);
general::SetXShape(x, xshape);
ReshapeFromDT(dev_ctx, x, shape, out);
}
void ReshapeFromVectorDT(const CPUContext& dev_ctx,
......@@ -119,8 +112,20 @@ void ReshapeFromVectorDTWithXShape(const CPUContext& dev_ctx,
const std::vector<DenseTensor>& shape,
DenseTensor* xshape,
DenseTensor* out) {
ReshapeFromVectorDT(dev_ctx, x, shape, out);
general::SetXShape(x, xshape);
ReshapeFromVectorDT(dev_ctx, x, shape, out);
}
template <typename T>
void Cast(const CPUContext& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DataType in_dtype,
DenseTensor* out) {
PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
math::CastKernelImpl<CPUContext, T, data_t>(
dev_ctx, x, out);
}));
}
} // namespace pten
......@@ -151,6 +156,23 @@ PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
int8_t,
int,
int64_t) {}
PT_REGISTER_KERNEL("cast",
CPU,
ANY,
pten::Cast,
float,
double,
int,
int64_t,
int16_t,
bool,
uint8_t,
paddle::platform::float16,
paddle::platform::bfloat16,
paddle::platform::complex<float>,
paddle::platform::complex<double>) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
// TODO(yuanrisheng): "reshape2" is compatible with old kernel
// architecture, kernel_name should be "reshape".
......
......@@ -29,6 +29,13 @@ void Flatten(const CPUContext& dev_ctx,
int stop_axis,
DenseTensor* out);
template <typename T>
void Cast(const CPUContext& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DataType in_dtype,
DenseTensor* out);
void ReshapeFromDT(const CPUContext& dev_ctx,
const DenseTensor& x,
const DenseTensor& shape,
......
......@@ -70,6 +70,9 @@ void ElementwiseAdd(const CPUContext& dev_ctx,
const DenseTensor& y,
int axis,
DenseTensor* out) {
// allocate memory for out
out->mutable_data<T>();
if (x.dims() == y.dims()) {
SameDimsElementwiseCompute<general::SameDimsAddFunctor<CPUContext, T>>()(
dev_ctx, x, y, out);
......@@ -92,6 +95,9 @@ void ElementwiseSub(const CPUContext& dev_ctx,
const DenseTensor& y,
int axis,
DenseTensor* out) {
// allocate memory for out
out->mutable_data<T>();
if (x.dims() == y.dims()) {
SameDimsElementwiseCompute<general::SameDimsSubFunctor<CPUContext, T>>()(
dev_ctx, x, y, out);
......
......@@ -12,10 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/infermeta/unary.h"
#include "paddle/pten/kernels/cuda/manipulation.h"
#include "paddle/pten/kernels/cuda/utils.h"
#include "paddle/pten/kernels/functions/general/manipulation.h"
#include "paddle/pten/kernels/functions/math/cast_func.h"
namespace pten {
......@@ -44,27 +46,17 @@ void FlattenWithXShape(const CUDAContext& dev_ctx,
general::SetXShape(x, xshape);
}
void ReshapeFromVectorValImpl(const CUDAContext& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& shape,
DenseTensor* out,
bool set_lod) {
auto out_meta = InferShapeFromVecValue(x.meta(), shape);
if (&x != out) {
pten::Copy(dev_ctx, x, false, out);
}
if (set_lod) {
out->Resize(out_meta.dims, out_meta.lod);
} else {
out->Resize(out_meta.dims);
}
}
void ReshapeFromVectorVal(const CUDAContext& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& shape,
DenseTensor* out) {
ReshapeFromVectorValImpl(dev_ctx, x, shape, out, false);
auto out_meta = InferShapeFromVecValue(x.meta(), shape);
if (&x == out) {
out->Resize(out_meta.dims);
return;
}
pten::Copy(dev_ctx, x, false, out);
out->Resize(out_meta.dims);
}
void ReshapeFromVectorValWithXShape(const CUDAContext& dev_ctx,
......@@ -72,8 +64,8 @@ void ReshapeFromVectorValWithXShape(const CUDAContext& dev_ctx,
const std::vector<int64_t>& shape,
DenseTensor* xshape,
DenseTensor* out) {
ReshapeFromVectorVal(dev_ctx, x, shape, out);
general::SetXShape(x, xshape);
ReshapeFromVectorVal(dev_ctx, x, shape, out);
}
void ReshapeFromDT(const CUDAContext& dev_ctx,
......@@ -83,7 +75,8 @@ void ReshapeFromDT(const CUDAContext& dev_ctx,
auto* shape_data = shape.data<int>();
auto vector_shape =
std::vector<int64_t>(shape_data, shape_data + shape.numel());
ReshapeFromVectorValImpl(dev_ctx, x, vector_shape, out, true);
ReshapeFromVectorVal(dev_ctx, x, vector_shape, out);
out->ResetLoD(x.lod());
}
void ReshapeFromDTWithXShape(const CUDAContext& dev_ctx,
......@@ -91,8 +84,8 @@ void ReshapeFromDTWithXShape(const CUDAContext& dev_ctx,
const DenseTensor& shape,
DenseTensor* xshape,
DenseTensor* out) {
ReshapeFromDT(dev_ctx, x, shape, out);
general::SetXShape(x, xshape);
ReshapeFromDT(dev_ctx, x, shape, out);
}
void ReshapeFromVectorDT(const CUDAContext& dev_ctx,
......@@ -119,8 +112,20 @@ void ReshapeFromVectorDTWithXShape(const CUDAContext& dev_ctx,
const std::vector<DenseTensor>& shape,
DenseTensor* xshape,
DenseTensor* out) {
ReshapeFromVectorDT(dev_ctx, x, shape, out);
general::SetXShape(x, xshape);
ReshapeFromVectorDT(dev_ctx, x, shape, out);
}
template <typename T>
void Cast(const CUDAContext& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DataType in_dtype,
DenseTensor* out) {
PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
math::CastKernelImpl<CUDAContext, T, data_t>(
dev_ctx, x, out);
}));
}
} // namespace pten
......@@ -153,6 +158,23 @@ PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
int8_t,
int,
int64_t) {}
// TODO: HIP needs to support bfloat16
PT_REGISTER_KERNEL("cast",
CUDA,
ANY,
pten::Cast,
float,
double,
int,
int64_t,
int16_t,
bool,
uint8_t,
paddle::platform::float16,
paddle::platform::complex<float>,
paddle::platform::complex<double>) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2",
CUDA,
......
......@@ -33,6 +33,13 @@ void Flatten(const CUDAContext& dev_ctx,
int stop_axis,
DenseTensor* out);
template <typename T>
void Cast(const CUDAContext& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DataType in_dtype,
DenseTensor* out);
void ReshapeFromDT(const CUDAContext& dev_ctx,
const DenseTensor& x,
const DenseTensor& shape,
......
......@@ -134,6 +134,8 @@ void ElementwiseAdd(const CUDAContext& dev_ctx,
std::vector<DenseTensor*> outputs;
inputs.emplace_back(&x);
inputs.emplace_back(&y);
// allocate memory for out
out->mutable_data<T>();
outputs.emplace_back(out);
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, inputs, &outputs, axis, general::AddFunctor<T>());
......@@ -149,6 +151,8 @@ void ElementwiseSub(const CUDAContext& dev_ctx,
std::vector<DenseTensor*> outputs;
inputs.emplace_back(&x);
inputs.emplace_back(&y);
// allocate memory for out
out->mutable_data<T>();
outputs.emplace_back(out);
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, inputs, &outputs, axis, general::SubFunctor<T>());
......
......@@ -147,6 +147,7 @@ void ElementwiseCompute(const paddle::platform::CPUDeviceContext &dev_ctx,
int axis,
Functor func,
DenseTensor *z) {
z->mutable_data<OutType>();
auto x_dims = x.dims();
auto y_dims = y.dims();
bool is_xsize_larger = true;
......
......@@ -28,6 +28,7 @@ void Dot(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
out->mutable_data<T>();
if (1 == out->dims().size()) {
auto eigen_out = pten::EigenScalar<T>::From(*out);
auto eigen_x = pten::EigenVector<T>::Flatten(x);
......
......@@ -25,6 +25,7 @@ void ElementwiseAdd(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
out->mutable_data<T>();
auto eigen_x = pten::EigenVector<T>::Flatten(x);
auto eigen_y = pten::EigenVector<T>::Flatten(y);
auto eigen_z = pten::EigenVector<T>::Flatten(*out);
......
......@@ -28,6 +28,7 @@ void Mean(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
// TODO(chenweihang): if we design new tensor, we should support
// the low-level calc functors using the new tensor as input,
// which may be a big project!
out->mutable_data<T>();
auto eigen_x = pten::EigenVector<T>::Flatten(x);
auto eigen_out = pten::EigenScalar<T>::From(*out);
......
......@@ -26,7 +26,8 @@ inline void SetXShape(const DenseTensor& x, DenseTensor* xshape) {
for (int i = 0; i < in_dims.size(); ++i) {
xshape_dims[i + 1] = in_dims[i];
}
xshape->Resize(paddle::framework::make_ddim(xshape_dims), x.meta().lod);
xshape->Resize(paddle::framework::make_ddim(xshape_dims));
xshape->ResetLoD(x.meta().lod);
}
} // namespace general
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/transform.h"
#include "paddle/pten/core/dense_tensor.h"
namespace pten {
namespace math {
template <typename InT, typename OutT>
struct CastOpTransformFunctor {
HOSTDEVICE OutT operator()(InT in) const { return static_cast<OutT>(in); }
};
template <typename DeviceContext, typename InT, typename OutT>
void CastKernelImpl(const DeviceContext& dev_ctx,
const DenseTensor& x,
DenseTensor* out) {
auto* in_begin = x.data<InT>();
auto numel = x.numel();
auto* in_end = in_begin + numel;
auto* out_begin = out->mutable_data<OutT>();
paddle::platform::Transform<DeviceContext> trans;
trans(dev_ctx,
in_begin,
in_end,
out_begin,
CastOpTransformFunctor<InT, OutT>());
}
} // namespace math
} // namespace pten
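
The functor above applies static_cast element-wise through paddle::platform::Transform. A minimal sketch of invoking the typed implementation directly for a fixed (InT, OutT) pair on CPU follows, assuming `out` already carries FLOAT64 meta with the same dims as `x` (pten::Cast normally reaches this through the PD_VISIT_ALL_TYPES dispatch so the output type is chosen at runtime; the function name is illustrative only):

#include "paddle/pten/kernels/functions/math/cast_func.h"

// Cast a float32 DenseTensor to float64 on CPU by calling the typed kernel
// implementation directly; out must be pre-shaped to x's dims.
void CastFloatToDoubleSketch(const paddle::platform::CPUDeviceContext& dev_ctx,
                             const pten::DenseTensor& x,
                             pten::DenseTensor* out) {
  pten::math::CastKernelImpl<paddle::platform::CPUDeviceContext, float, double>(
      dev_ctx, x, out);
}
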
......@@ -47,7 +47,8 @@ void FlattenWithXShape(const XPUContext& dev_ctx,
for (int i = 0; i < in_dims.size(); ++i) {
xshape_dims[i + 1] = in_dims[i];
}
xshape->Resize(paddle::framework::make_ddim(xshape_dims), x.meta().lod);
xshape->Resize(paddle::framework::make_ddim(xshape_dims));
xshape->ResetLoD(x.lod());
}
void ReshapeFromVectorVal(const XPUContext& dev_ctx,
......
......@@ -7,13 +7,13 @@ endif()
cc_test(test_pten_exception SRCS test_pten_exception.cc DEPS gtest)
cc_test(test_framework_storage SRCS test_storage.cc DEPS pten_api_utils)
cc_test(test_framework_tensor_utils SRCS test_tensor_utils.cc DEPS pten_api_utils)
cc_test(test_mean_api SRCS test_mean_api.cc DEPS pten_tensor pten_api pten_api_utils)
cc_test(test_dot_api SRCS test_dot_api.cc DEPS pten_tensor pten_api pten_api_utils)
cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS pten_tensor pten_api pten_api_utils)
cc_test(test_fill_api SRCS test_fill_api.cc DEPS pten_tensor pten_api pten_api_utils)
cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS pten_tensor pten_api pten_api_utils)
cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS pten_tensor pten_api pten_api_utils)
cc_test(test_cast_api SRCS test_cast_api.cc DEPS pten_tensor pten_api pten_api_utils)
cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS pten_tensor pten_api pten_api_utils)
cc_test(test_to_api SRCS test_to_api.cc DEPS pten_tensor pten_api pten_api_utils)
cc_test(test_slice_api SRCS test_slice_api.cc DEPS pten_tensor pten_api pten_api_utils)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "paddle/pten/api/include/manipulation.h"
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h"
PT_DECLARE_MODULE(ManipulationCPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_MODULE(ManipulationCUDA);
#endif
namespace framework = paddle::framework;
using DDim = paddle::framework::DDim;
// TODO(chenweihang): Remove this test after the API is used in the dygraph
TEST(API, cast) {
// 1. create tensor
const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc,
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
auto* dense_x_data = dense_x->mutable_data<float>();
for (int i = 0; i < dense_x->numel(); i++) {
dense_x_data[i] = i;
}
paddle::experimental::Tensor x(dense_x);
pten::DataType out_dtype = pten::DataType::FLOAT64;
// 2. test API
auto out = paddle::experimental::cast(x, out_dtype);
// 3. check result
std::vector<int> expect_shape = {3, 4};
ASSERT_EQ(out.shape().size(), size_t(2));
ASSERT_EQ(out.shape()[0], expect_shape[0]);
ASSERT_EQ(out.shape()[1], expect_shape[1]);
ASSERT_EQ(out.numel(), 12);
ASSERT_EQ(out.is_cpu(), true);
ASSERT_EQ(out.type(), pten::DataType::FLOAT64);
ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
ASSERT_EQ(out.initialized(), true);
auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
auto* dense_out_data = dense_out->data<double>();
for (int i = 0; i < dense_x->numel(); i++) {
ASSERT_NEAR(dense_out_data[i], static_cast<double>(dense_x_data[i]), 1e-6f);
}
}
......@@ -4,5 +4,6 @@ cc_test(test_fill_dev_api SRCS test_fill_dev_api.cc DEPS pten pten_api_utils)
cc_test(test_flatten_dev_api SRCS test_flatten_dev_api.cc DEPS pten pten_api_utils)
cc_test(test_mean_dev_api SRCS test_mean_dev_api.cc DEPS pten pten_api_utils)
cc_test(test_scale_dev_api SRCS test_scale_dev_api.cc DEPS pten pten_api_utils)
cc_test(test_cast_dev_api SRCS test_cast_dev_api.cc DEPS pten pten_api_utils)
cc_test(test_elementwise_dev_api SRCS test_elementwise_dev_api.cc DEPS pten pten_api_utils)
cc_test(test_reshape_dev_api SRCS test_reshape_dev_api.cc DEPS pten pten_api_utils)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "paddle/pten/include/manipulation.h"
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h"
PT_DECLARE_MODULE(ManipulationCPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_MODULE(ManipulationCUDA);
#endif
namespace framework = paddle::framework;
using DDim = paddle::framework::DDim;
TEST(DEV_API, cast) {
// 1. create tensor
const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
pten::DenseTensor dense_x(alloc,
pten::DenseTensorMeta(pten::DataType::FLOAT32,
framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
auto* dense_x_data = dense_x.mutable_data<float>();
float sum = 0.0;
for (size_t i = 0; i < 12; ++i) {
dense_x_data[i] = i * 1.0;
sum += i * 1.0;
}
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
pten::DataType out_dtype = pten::DataType::FLOAT64;
pten::DataType in_dtype = pten::DataType::FLOAT32;
// 2. test API
auto out = pten::Cast<float>(
*(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
dense_x,
out_dtype,
in_dtype);
// 3. check result
ASSERT_EQ(out.dims().size(), 2);
ASSERT_EQ(out.dims()[0], 3);
ASSERT_EQ(out.dims()[1], 4);
ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT64);
ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
auto actual_result = out.data<double>();
for (size_t i = 0; i < 12; ++i) {
ASSERT_NEAR(actual_result[i], static_cast<double>(dense_x_data[i]), 1e-6f);
}
}