Unverified commit 5a000900 authored by Chen Weihang, committed by GitHub

[PTen] Add copy_to and to method for Tensor (#37262)

* add copy_to and to method for Tensor

* polish msg format

* fix details error

* fix copy_to test compile failed

* fix typo
Parent 39012536
add_subdirectory(lib)
cc_library(pten_api SRCS all.cc DEPS linalg_api math_api creation_api manipulation_api)
cc_library(pten_api SRCS all.cc DEPS linalg_api math_api creation_api manipulation_api utils_api)
......@@ -30,6 +30,7 @@ limitations under the License. */
#include "paddle/pten/api/include/manipulation.h"
#include "paddle/pten/api/include/math.h"
#include "paddle/pten/api/include/tensor.h"
#include "paddle/pten/api/include/utils.h"
// pten common headers
#include "paddle/pten/common/backend.h"
......@@ -51,3 +52,4 @@ PT_DECLARE_API(Creation);
PT_DECLARE_API(Linalg);
PT_DECLARE_API(Manipulation);
PT_DECLARE_API(Math);
PT_DECLARE_API(Utils);
......@@ -31,6 +31,7 @@ using gpuStream_t = hipStream_t;
#include "paddle/pten/api/ext/dll_decl.h"
#include "paddle/pten/api/ext/place.h"
#include "paddle/pten/common/backend.h"
#include "paddle/pten/common/data_type.h"
#include "paddle/pten/common/layout.h"
......@@ -317,9 +318,11 @@ class PD_DLL_DECL Tensor final {
/**
* @brief Copy the current Tensor data to the specified device
* and return the new Tensor.
* It's usually used to set the input tensor data.
* This is a deprecated method and may be removed in the future!
* and return the new Tensor. It's usually used to set the input tensor data.
* Note: The Tensor's `copy_to` method is deprecated since version 2.3 and
* will be removed in version 2.4; please use the `to` method instead.
* Reason: copying a Tensor to another device does not require specifying
* the data type template argument.
*
* @tparam T
* @param target_place, the target place to which the tensor will be copied.
......@@ -334,7 +337,9 @@ class PD_DLL_DECL Tensor final {
* @param place, the target place to which the tensor will be copied.
* @return Tensor
*/
Tensor to(const PlaceType& place) const;
// TODO(chenweihang): replace Backend with the new Place; dtype and layout
// arguments may be appended in the future
Tensor to(Backend backend, bool blocking) const;
/**
* @brief Cast datatype from one to another
......
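For orientation, a minimal usage sketch of the member APIs declared above; it mirrors the tests added later in this commit, and MakeInput() is a hypothetical stand-in for the test file's CreateInputTensor() helper.
// Minimal sketch, assuming a CUDA-enabled build; MakeInput() is hypothetical.
paddle::experimental::Tensor x = MakeInput();
// New API: copy to another backend; blocking == true makes the copy synchronous.
auto on_gpu = x.to(pten::Backend::CUDA, /*blocking=*/false);
auto on_cpu = on_gpu.to(pten::Backend::CPU, /*blocking=*/true);
// Deprecated API, kept for compatibility; it now logs a warning and forwards to `to`.
auto copied = x.copy_to<int64_t>(paddle::PlaceType::kCPU);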
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/api/include/tensor.h"
#include "paddle/pten/common/backend.h"
namespace paddle {
namespace experimental {
// TODO(chenweihang): Replace backend with place when place is ready
// TODO(chenweihang): Add layout and dtype arguments if needed
PD_DLL_DECL Tensor to(const Tensor& x, Backend backend, bool blocking);
} // namespace experimental
} // namespace paddle
......@@ -18,3 +18,4 @@ cc_library(math_api SRCS math.cc DEPS pten_tensor pten kernel_dispatch)
cc_library(linalg_api SRCS linalg.cc DEPS pten_tensor pten kernel_dispatch)
cc_library(creation_api SRCS creation.cc DEPS pten_tensor pten kernel_dispatch)
cc_library(manipulation_api SRCS manipulation.cc DEPS pten_tensor pten kernel_dispatch)
cc_library(utils_api SRCS utils.cc DEPS pten_tensor pten kernel_dispatch)
......@@ -18,7 +18,7 @@ limitations under the License. */
namespace paddle {
namespace experimental {
platform::Place ConvertExtPlaceToInnerPlace(const PlaceType& p) {
platform::Place ConvertExtPlaceToInnerPlace(PlaceType p) {
if (p == PlaceType::kCPU) {
return platform::Place(platform::CPUPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......@@ -50,5 +50,21 @@ PlaceType ConvertInnerPlaceToExtPlace(const platform::Place& p) {
return PlaceType::kUNK;
}
Backend ConvertExtPlaceToBackend(PlaceType p) {
switch (p) {
case PlaceType::kCPU:
return Backend::CPU;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
case PlaceType::kGPU:
return Backend::CUDA;
#endif
default:
PADDLE_THROW(
platform::errors::Unimplemented("Unsupported place type `%s` when "
"casting enum place to backend.",
static_cast<int>(p)));
}
}
} // namespace experimental
} // namespace paddle
......@@ -16,13 +16,16 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/api/ext/place.h"
#include "paddle/pten/common/backend.h"
namespace paddle {
namespace experimental {
platform::Place ConvertExtPlaceToInnerPlace(const PlaceType& p);
platform::Place ConvertExtPlaceToInnerPlace(PlaceType p);
PlaceType ConvertInnerPlaceToExtPlace(const platform::Place& p);
Backend ConvertExtPlaceToBackend(PlaceType p);
} // namespace experimental
} // namespace paddle
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <vector>
#include "glog/logging.h"
#include "paddle/pten/api/include/utils.h"
#include "paddle/pten/api/lib/ext_compat_utils.h"
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/api/lib/utils/storage.h"
......@@ -279,10 +280,12 @@ gpuStream_t Tensor::stream() const {
template <typename T>
Tensor Tensor::copy_to(const PlaceType &target_place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"The copy_to operation is not supported now, "
"and it will be implemented by calling the copy kernel later."));
return Tensor();
LOG(WARNING) << "The Tensor's `copy_to` method is deprecated since version "
"2.3, and will be removed in version 2.4, please use `to` "
"method instead. "
"reason: copying a Tensor to another device does not need "
"to specify the data type template argument.";
return to(ConvertExtPlaceToBackend(target_place), /*blocking=*/false);
}
template PD_DLL_DECL Tensor
......@@ -308,11 +311,8 @@ template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex<double>>(
template PD_DLL_DECL Tensor
Tensor::copy_to<paddle::platform::float16>(const PlaceType &target_place) const;
Tensor Tensor::to(const PlaceType &target_place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"The to operation is not supported now, "
"and it will be implemented by calling the copy kernel later."));
return Tensor();
Tensor Tensor::to(Backend backend, bool blocking) const {
return experimental::to(*this, backend, blocking);
}
Tensor Tensor::cast(const DataType &target_type) const {
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/api/include/utils.h"
#include <memory>
#include "glog/logging.h"
#include "paddle/pten/api/include/registry.h"
#include "paddle/pten/api/lib/kernel_dispatch.h"
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/include/core.h"
#include "paddle/pten/include/infershape.h"
namespace paddle {
namespace experimental {
PD_DLL_DECL Tensor to(const Tensor& x, Backend backend, bool blocking) {
// 1. Get kernel signature and kernel
auto kernel_key_set = ParseKernelKeyByInputArgs(x);
kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend);
auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
"copy", kernel_key);
VLOG(0) << "to API kernel key: " << kernel_key;
VLOG(0) << "to API kernel: " << kernel;
// 2. Get Device Context
auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
auto kernel_context = pten::KernelContext(dev_ctx);
// 3. Auto data transform
auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
kernel_context.EmplaceBackInput(dense_x);
kernel_context.EmplaceBackAttr(blocking);
// 4. InferMeta
auto out_meta = UnchangedInferShape(dense_x->meta());
// 5. Prepare outputs
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
pten::TransToFluidPlace(backend));
auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
kernel_context.EmplaceBackOutput(dense_out);
Tensor out;
out.set_impl(dense_out);
// 6. Call kernel
kernel(&kernel_context);
return out;
}
} // namespace experimental
} // namespace paddle
PT_REGISTER_API(Utils);
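A brief sketch of calling the free-function form registered above, assuming a CPU-only build; it runs through steps 1-6 of the implementation, and x is assumed to be an initialized CPU Tensor such as the one built by CreateInputTensor() in the test file below.
// Minimal sketch: x is an initialized CPU Tensor (see the test helper below).
auto out = paddle::experimental::to(x, pten::Backend::CPU, /*blocking=*/false);
// out wraps a new DenseTensor allocated for the target backend and filled by the `copy` kernel.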
......@@ -26,7 +26,7 @@ void Flatten(const CPUContext& dev_ctx,
int stop_axis,
DenseTensor* out) {
auto out_dims = out->dims();
pten::Copy(dev_ctx, x, out);
pten::Copy(dev_ctx, x, false, out);
out->Resize(out_dims);
}
......@@ -51,7 +51,7 @@ void ReshapeFromVectorValImpl(const CPUContext& dev_ctx,
bool set_lod) {
auto out_meta = InferShapeFromVecValue(x.meta(), shape);
if (&x != out) {
pten::Copy(dev_ctx, x, out);
pten::Copy(dev_ctx, x, false, out);
}
if (set_lod) {
out->Resize(out_meta.dims, out_meta.lod);
......
......@@ -19,7 +19,11 @@ limitations under the License. */
namespace pten {
void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst) {
// NOTE(chenweihang): blocking has no effect in the CPU kernel
void Copy(const CPUContext& dev_ctx,
const DenseTensor& src,
bool blocking,
DenseTensor* dst) {
auto* src_ptr = src.data();
const auto& src_place = src.place();
const auto& dst_place = dst->place();
......
......@@ -23,6 +23,9 @@ namespace pten {
using CPUContext = paddle::platform::CPUDeviceContext;
void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst);
void Copy(const CPUContext& dev_ctx,
const DenseTensor& src,
bool blocking,
DenseTensor* dst);
} // namespace pten
......@@ -22,7 +22,7 @@ namespace pten {
void Copy(const CUDAContext& dev_ctx,
const DenseTensor& src,
bool is_sync,
bool blocking,
DenseTensor* dst) {
auto* src_ptr = src.data();
const auto& src_place = src.place();
......@@ -97,10 +97,10 @@ void Copy(const CUDAContext& dev_ctx,
src_gpu_place,
ctx_gpu_place));
auto stream =
is_sync ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
paddle::memory::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_cpu_place(src_place) && // NOLINT
......@@ -125,10 +125,10 @@ void Copy(const CUDAContext& dev_ctx,
dst_gpu_place,
ctx_gpu_place));
auto stream =
is_sync ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
......@@ -155,10 +155,10 @@ void Copy(const CUDAContext& dev_ctx,
src_gpu_place.device,
ctx_gpu_place.device));
auto stream =
is_sync ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
paddle::memory::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT
......@@ -185,10 +185,10 @@ void Copy(const CUDAContext& dev_ctx,
dst_gpu_place.device,
ctx_gpu_place.device));
auto stream =
is_sync ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
......@@ -205,10 +205,10 @@ void Copy(const CUDAContext& dev_ctx,
"Context place error, excepted GPUPlace, but actually %s.",
ctx_place));
auto stream =
is_sync ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
blocking ? nullptr
: reinterpret_cast<const paddle::platform::CUDADeviceContext&>(
dev_ctx)
.stream();
if (paddle::platform::is_same_place(src_place, dst_place)) {
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
......
......@@ -28,7 +28,7 @@ using CUDAContext = paddle::platform::CUDADeviceContext;
void Copy(const CUDAContext& dev_ctx,
const DenseTensor& src,
bool is_sync,
bool blocking,
DenseTensor* dst);
} // namespace pten
......
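Note on the renamed flag: as the hunks above show, the CUDA kernel passes a null stream when blocking is true (synchronous copy) and the context's stream otherwise, while the CPU kernel ignores the flag. Below is a minimal sketch of calling the device-level kernel directly, mirroring the DEV_API copy test at the end of this diff; src and dst are hypothetical pten::DenseTensor objects.
// Minimal sketch: direct call to the device-level copy kernel with the new blocking argument.
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(paddle::platform::CPUPlace());
pten::Copy(*dev_ctx, src, /*blocking=*/false, &dst);  // on CPU the flag has no effect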
......@@ -26,7 +26,7 @@ void Flatten(const XPUContext& dev_ctx,
int stop_axis,
DenseTensor* out) {
auto out_dims = out->dims();
pten::Copy(dev_ctx, x, out);
pten::Copy(dev_ctx, x, false, out);
out->Resize(out_dims);
}
......@@ -59,7 +59,7 @@ void ReshapeFromVectorVal(const XPUContext& dev_ctx,
out->Resize(out_meta.dims);
return;
}
pten::Copy(dev_ctx, x, out);
pten::Copy(dev_ctx, x, false, out);
out->Resize(out_meta.dims);
}
......
......@@ -21,6 +21,7 @@ namespace pten {
void Copy(const XPUDeviceContext& dev_ctx,
const DenseTensor& src,
bool blocking,
DenseTensor* dst) {
auto* src_ptr = src.data();
auto* dst_ptr = dst->mutable_data();
......
......@@ -27,6 +27,7 @@ using XPUDeviceContext = paddle::platform::XPUDeviceContext;
void Copy(const XPUDeviceContext& dev_ctx,
const DenseTensor& src,
bool blocking,
DenseTensor* dst);
} // namespace pten
......
if(WITH_ROCM)
hip_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor glog)
hip_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor utils_api glog)
else()
cc_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor glog)
cc_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor utils_api glog)
endif()
cc_test(test_pten_exception SRCS test_pten_exception.cc DEPS gtest)
......@@ -15,4 +15,5 @@ cc_test(test_fill_api SRCS test_fill_api.cc DEPS pten_tensor pten_api pten_api_u
cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS pten_tensor pten_api pten_api_utils)
cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS pten_tensor pten_api pten_api_utils)
cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS pten_tensor pten_api pten_api_utils)
cc_test(test_to_api SRCS test_to_api.cc DEPS pten_tensor pten_api pten_api_utils)
cc_test(test_slice_api SRCS test_slice_api.cc DEPS pten_tensor pten_api pten_api_utils)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "paddle/pten/api/include/utils.h"
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h"
PT_DECLARE_MODULE(UtilsCPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_MODULE(UtilsCUDA);
#endif
namespace pten {
namespace tests {
namespace framework = paddle::framework;
using DDim = paddle::framework::DDim;
paddle::experimental::Tensor CreateInputTensor() {
const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc,
pten::DenseTensorMeta(pten::DataType::INT64,
framework::make_ddim({3, 4}),
pten::DataLayout::NCHW));
auto* dense_x_data = dense_x->mutable_data<int64_t>();
for (int64_t i = 0; i < 12; ++i) {
dense_x_data[i] = i;
}
return paddle::experimental::Tensor(dense_x);
}
void CheckOutputResult(const paddle::experimental::Tensor& out) {
ASSERT_EQ(out.dims().size(), 2);
ASSERT_EQ(out.dims()[0], 3);
ASSERT_EQ(out.dims()[1], 4);
ASSERT_EQ(out.is_cpu(), true);
ASSERT_EQ(out.type(), pten::DataType::INT64);
ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
ASSERT_EQ(out.initialized(), true);
for (int64_t i = 0; i < 12; ++i) {
ASSERT_EQ(out.data<int64_t>()[i], i);
}
}
TEST(API, to) {
// 1. create tensor
auto x = CreateInputTensor();
// 2. test API
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto tmp = paddle::experimental::to(x, pten::Backend::CUDA, false);
auto out = paddle::experimental::to(tmp, pten::Backend::CPU, true);
#else
auto out = paddle::experimental::to(x, pten::Backend::CPU, false);
#endif
// 3. check result
CheckOutputResult(out);
}
TEST(Tensor, to) {
// 1. create tensor
auto x = CreateInputTensor();
// 2. test API
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto tmp = x.to(pten::Backend::CUDA, false);
auto out = tmp.to(pten::Backend::CPU, true);
#else
auto out = x.to(pten::Backend::CPU, false);
#endif
// 3. check result
CheckOutputResult(out);
}
TEST(Tensor, copy_to) {
// 1. create tensor
auto x = CreateInputTensor();
// 2. test API
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto tmp = x.copy_to<int64_t>(paddle::PlaceType::kGPU);
auto out = tmp.copy_to<int64_t>(paddle::PlaceType::kCPU);
#else
auto out = x.copy_to<int64_t>(paddle::PlaceType::kCPU);
#endif
// 3. check result
CheckOutputResult(out);
}
} // namespace tests
} // namespace pten
......@@ -54,7 +54,7 @@ TEST(DEV_API, copy) {
// 2. test API
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(paddle::platform::CPUPlace());
pten::Copy(*dev_ctx, *(dense_src.get()), dense_dst.get());
pten::Copy(*dev_ctx, *(dense_src.get()), false, dense_dst.get());
// 3. check result
for (int64_t i = 0; i < dense_src->numel(); i++) {
......