From 5631da9cc2ca284727c512c663e23f426cbcc8cf Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 27 Jan 2022 16:42:24 +0800 Subject: [PATCH] [PTen]Support AllocateFrom in Tensor and Alloc/HostAlloc in Context (#39022) * Support allocate_from in Tensor and allocate_data in Context * fix #ifdef CUDA * fix cycle depends * fix test_xxx_dev_api failed * fix windows compiling error * fix unittest * modify into PImpl * fix selected rows * add TODO comment * refine interface according reviewer --- .../memory/allocation/allocator_facade.cc | 22 +++ .../memory/allocation/allocator_facade.h | 8 + paddle/fluid/platform/device_context.cc | 39 ++++- paddle/pten/core/dense_tensor.cc | 39 +++++ paddle/pten/core/dense_tensor.h | 6 + paddle/pten/core/device_context.cc | 165 +++++++++++++++--- paddle/pten/core/device_context.h | 48 +++-- paddle/pten/core/selected_rows.cc | 6 + paddle/pten/core/selected_rows.h | 4 + paddle/pten/core/tensor_base.h | 11 ++ paddle/pten/kernels/cpu/cast_kernel.cc | 2 +- paddle/pten/kernels/cpu/copy_kernel.cc | 2 +- paddle/pten/kernels/cpu/dot_kernel.cc | 2 +- paddle/pten/kernels/cpu/elementwise.h | 82 ++++----- paddle/pten/kernels/cpu/math_kernel.cc | 4 +- paddle/pten/kernels/cpu/reduce.h | 4 +- paddle/pten/kernels/cpu/scale_kernel.cc | 2 +- paddle/pten/kernels/empty_kernel.cc | 2 +- paddle/pten/kernels/funcs/elementwise_base.h | 8 +- paddle/pten/kernels/funcs/transpose.cc | 2 +- paddle/pten/kernels/funcs/transpose.cu | 2 +- paddle/pten/kernels/gpu/cast_kernel.cu | 2 +- paddle/pten/kernels/gpu/dot_kernel.cu | 2 +- paddle/pten/kernels/gpu/elementwise.h | 58 +++--- paddle/pten/kernels/gpu/math_kernel.cu | 2 +- paddle/pten/kernels/gpu/scale_kernel.cu | 2 +- .../pten/kernels/impl/complex_kernel_impl.h | 2 +- .../pten/kernels/impl/dot_grad_kernel_impl.h | 84 ++++----- paddle/pten/kernels/impl/full_kernel_impl.h | 2 +- .../kernels/impl/matmul_grad_kernel_impl.h | 8 +- paddle/pten/kernels/impl/matmul_kernel_impl.h | 36 ++-- paddle/pten/kernels/impl/sign_kernel_impl.h | 2 +- paddle/pten/kernels/reshape_kernel.cc | 2 +- paddle/pten/kernels/xpu/copy_kernel.cc | 2 +- .../pten/tests/kernels/test_cast_dev_api.cc | 6 + .../pten/tests/kernels/test_concat_dev_api.cc | 5 + .../pten/tests/kernels/test_conj_dev_api.cc | 5 + .../pten/tests/kernels/test_copy_dev_api.cc | 5 + .../tests/kernels/test_creation_dev_api.cc | 17 ++ paddle/pten/tests/kernels/test_dot_dev_api.cc | 5 + .../tests/kernels/test_elementwise_dev_api.cc | 17 ++ .../tests/kernels/test_flatten_dev_api.cc | 5 + .../pten/tests/kernels/test_matmul_dev_api.cc | 5 + .../pten/tests/kernels/test_mean_dev_api.cc | 5 + .../tests/kernels/test_reshape_dev_api.cc | 5 + .../pten/tests/kernels/test_scale_dev_api.cc | 9 + paddle/pten/tests/kernels/test_sum_dev_api.cc | 6 +- .../interpreter/test_standalone_executor.py | 1 + 48 files changed, 554 insertions(+), 206 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 7cdac0de61..0f725a454c 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -840,6 +840,28 @@ void* AllocatorFacade::GetBasePtr( return m_->GetBasePtr(allocation); } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +const std::shared_ptr& AllocatorFacade::GetAllocator( + const platform::Place& place, const gpuStream_t& stream) { + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && + FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_->GetAllocator(place, + /* A non-zero num to choose allocator_ */ 1); + } +#endif + return m_->GetAllocator(place, stream, /*create_if_not_found=*/true); + } + return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); +} +#endif + +const std::shared_ptr& AllocatorFacade::GetZeroAllocator( + const platform::Place& place) { + return m_->GetAllocator(place, /* zero size */ 0); +} + std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size) { return std::shared_ptr(Alloc(place, size)); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index a9b92e1801..f4aea98003 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -53,6 +53,14 @@ class AllocatorFacade { void* GetBasePtr(const std::shared_ptr& allocation); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + const std::shared_ptr& GetAllocator(const platform::Place& place, + const gpuStream_t& stream); +#endif + + const std::shared_ptr& GetZeroAllocator( + const platform::Place& place); + // Allocate a shared allocation. std::shared_ptr AllocShared(const platform::Place& place, size_t size); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 142e30d161..fdd9883c2c 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -23,6 +23,7 @@ limitations under the License. */ #endif #include "glog/logging.h" #include "paddle/fluid/framework/expect.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -136,11 +137,39 @@ inline void EmplaceDeviceContext( map_ptr, platform::Place p) { using PtrType = std::unique_ptr; - map_ptr->emplace(p, std::async(std::launch::deferred, [=] { - // lazy evaluation. i.e., only create device context at - // first `Get` - return PtrType(new DevCtx(p)); - })); + map_ptr->emplace( + p, std::async(std::launch::deferred, [=] { + // lazy evaluation. i.e., only create device context at + // first `Get` + auto* dev_ctx = new DevCtx(p); + if (is_gpu_place(p)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + auto* cuda_ctx = dynamic_cast(dev_ctx); + PADDLE_ENFORCE_NOT_NULL( + cuda_ctx, + platform::errors::InvalidArgument( + "Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); + dev_ctx->SetDeviceAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p, cuda_ctx->context()->RawStream()) + .get()); +#endif + } else { + dev_ctx->SetDeviceAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p) + .get()); + } + dev_ctx->SetHostAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CPUPlace()) + .get()); + dev_ctx->SetZeroAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(p) + .get()); + return PtrType(dev_ctx); + })); } DeviceContextPool::DeviceContextPool( diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index 15f9f0bda3..7373ba79c0 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -68,6 +68,45 @@ bool DenseTensor::IsSharedWith(const DenseTensor& b) const { return holder_ && holder_ == b.Holder(); } +void* DenseTensor::AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size) { + PADDLE_ENFORCE_NOT_NULL( + allocator, + paddle::platform::errors::InvalidArgument( + "Required allocator shall not be nullptr, but received nullptr.")); + if (this->dtype() != dtype) { + VLOG(10) << "change data type in mutbale_data, target dtype - " << dtype; + meta_.dtype = dtype; + } + PADDLE_ENFORCE( + valid(), + paddle::platform::errors::PreconditionNotMet( + "The meta data must be valid when call the mutable data function.")); + size_t bytes = numel() * SizeOf(this->dtype()); + if (requested_size) { + PADDLE_ENFORCE_GE(requested_size, + bytes, + paddle::platform::errors::InvalidArgument( + "The reserved size %d should be enough to meet the " + "volume required by metadata %d.", + requested_size, + bytes)); + bytes = requested_size; + } + // TODO(paddle-dev): In case of the allocator of storage_ is different with + // the incoming allocator, we should re-alloc data using the incoming + // allocator. + if (!holder_ || holder_->size() < bytes + meta_.offset) { + meta_.offset = 0; + VLOG(10) << "Allocate data with bytes: " << bytes; + ResetHolder(allocator->Allocate(bytes)); + } + + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + meta_.offset); +} + template const T* DenseTensor::data() const { check_memory_size(); diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 2823441f97..fbecbcf0a1 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -124,6 +124,12 @@ class DenseTensor : public TensorBase, /// return Whether the storage is allocated. bool initialized() const override { return holder_ && holder_->ptr(); } + /// \brief Allocate memory with requested size from allocator. + /// \return The mutable data pointer value of type T. + void* AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size = 0) override; + /// \brief Check if storage is shared with other objects. /// \return Whether the storage is shared with other objects. bool IsSharedWith(const DenseTensor& b) const; diff --git a/paddle/pten/core/device_context.cc b/paddle/pten/core/device_context.cc index 7566b351bf..d6e01c5c6e 100644 --- a/paddle/pten/core/device_context.cc +++ b/paddle/pten/core/device_context.cc @@ -13,45 +13,119 @@ // limitations under the License. #include "paddle/pten/core/device_context.h" -#include "paddle/pten/api/ext/exception.h" +#include "paddle/pten/core/enforce.h" +#include "paddle/pten/core/tensor_base.h" namespace pten { +using DataType = paddle::experimental::DataType; struct DeviceContext::Impl { Impl() = default; ~Impl() = default; - void SetDeviceAllocator(Allocator* allocator) { + void SetDeviceAllocator(const Allocator* allocator) { + PADDLE_ENFORCE_NOT_NULL( + allocator, + pten::errors::InvalidArgument( + "Required allocator shall not be nullptr, but received nullptr.")); device_allocator_ = allocator; } - void SetHostAllocator(Allocator* allocator) { host_allocator_ = allocator; } + void SetHostAllocator(const Allocator* allocator) { + PADDLE_ENFORCE_NOT_NULL( + allocator, + pten::errors::InvalidArgument( + "Required allocator shall not be nullptr, but received nullptr.")); + host_allocator_ = allocator; + } + + void SetZeroAllocator(const Allocator* allocator) { + PADDLE_ENFORCE_NOT_NULL( + allocator, + pten::errors::InvalidArgument( + "Required allocator shall not be nullptr, but received nullptr.")); + zero_allocator_ = allocator; + } const Allocator& GetDeviceAllocator() const { - PD_CHECK(device_allocator_ != nullptr, "the device_allocator is nullptr."); + PADDLE_ENFORCE_NOT_NULL( + device_allocator_, + pten::errors::InvalidArgument("Required device_allocator_ shall not be " + "nullptr, but received nullptr.")); return *device_allocator_; } const Allocator& GetHostAllocator() const { - PD_CHECK(host_allocator_ != nullptr, "the host_allocator is nullptr."); + PADDLE_ENFORCE_NOT_NULL( + host_allocator_, + pten::errors::InvalidArgument("Required host_allocator_ shall not be " + "nullptr, but received nullptr.")); return *host_allocator_; } - // TODO(Wilber): Add impl. It seems that tensorbase not have interface to - // communicate with allocator. - void HostAlloc(TensorBase* tensor) {} - void DeviceAlloc(TensorBase* tensor) {} + const Allocator& GetZeroAllocator() const { + PADDLE_ENFORCE_NOT_NULL( + zero_allocator_, + pten::errors::InvalidArgument("Required host_allocator_ shall not be " + "nullptr, but received nullptr.")); + return *zero_allocator_; + } + + void* Alloc(TensorBase* tensor, + DataType dtype = DataType::UNDEFINED, + size_t requested_size = 0) const { + PADDLE_ENFORCE_NOT_NULL( + tensor, + pten::errors::InvalidArgument( + "Required tensor shall not be nullptr, but received nullptr.")); + if (dtype == DataType::UNDEFINED) { + dtype = tensor->dtype(); + } + auto* allocator = + tensor->numel() == 0 ? zero_allocator_ : device_allocator_; + return tensor->AllocateFrom( + const_cast(allocator), dtype, requested_size); + } + + template + T* Alloc(TensorBase* tensor, size_t requested_size = 0) const { + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); + return static_cast(Alloc(tensor, dtype, requested_size)); + } - Allocator* device_allocator_{nullptr}; - Allocator* host_allocator_{nullptr}; + void* HostAlloc(TensorBase* tensor, + DataType dtype = DataType::UNDEFINED, + size_t requested_size = 0) const { + PADDLE_ENFORCE_NOT_NULL( + tensor, + pten::errors::InvalidArgument( + "Required tensor shall not be nullptr, but received nullptr.")); + if (dtype == DataType::UNDEFINED) { + dtype = tensor->dtype(); + } + auto* allocator = tensor->numel() == 0 ? zero_allocator_ : host_allocator_; + return tensor->AllocateFrom( + const_cast(allocator), dtype, requested_size); + } + + template + T* HostAlloc(pten::TensorBase* tensor, size_t requested_size = 0) const { + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); + return static_cast(HostAlloc(tensor, dtype, requested_size)); + } + + private: + const Allocator* device_allocator_{nullptr}; + const Allocator* host_allocator_{nullptr}; + const Allocator* zero_allocator_{nullptr}; }; DeviceContext::DeviceContext() { impl_ = std::make_unique(); } DeviceContext::DeviceContext(const DeviceContext& other) { - impl_->SetDeviceAllocator( - const_cast(&other.GetDeviceAllocator())); - impl_->SetHostAllocator(const_cast(&other.GetHostAllocator())); + impl_->SetHostAllocator(&other.GetHostAllocator()); + impl_->SetDeviceAllocator(&other.GetDeviceAllocator()); + impl_->SetZeroAllocator(&other.GetZeroAllocator()); } DeviceContext::DeviceContext(DeviceContext&& other) { @@ -60,26 +134,71 @@ DeviceContext::DeviceContext(DeviceContext&& other) { DeviceContext::~DeviceContext() = default; -void DeviceContext::SetHostAllocator(Allocator* allocator) { - impl_->SetHostAllocator(allocator); +void DeviceContext::SetDeviceAllocator(const Allocator* allocator) { + impl_->SetDeviceAllocator(allocator); } -void DeviceContext::SetDeviceAllocator(Allocator* allocator) { - impl_->SetDeviceAllocator(allocator); +const Allocator& DeviceContext::GetDeviceAllocator() const { + return impl_->GetDeviceAllocator(); +} + +void DeviceContext::SetHostAllocator(const Allocator* allocator) { + impl_->SetHostAllocator(allocator); } const Allocator& DeviceContext::GetHostAllocator() const { return impl_->GetHostAllocator(); } -const Allocator& DeviceContext::GetDeviceAllocator() const { - return impl_->GetDeviceAllocator(); +void DeviceContext::SetZeroAllocator(const Allocator* allocator) { + impl_->SetZeroAllocator(allocator); } -void DeviceContext::HostAlloc(TensorBase* tensor) { impl_->HostAlloc(tensor); } +const Allocator& DeviceContext::GetZeroAllocator() const { + return impl_->GetZeroAllocator(); +} -void DeviceContext::DeviceAlloc(TensorBase* tensor) { - impl_->DeviceAlloc(tensor); +void* DeviceContext::Alloc(TensorBase* tensor, + DataType dtype, + size_t requested_size) const { + return impl_->Alloc(tensor, dtype, requested_size); } +template +T* DeviceContext::Alloc(TensorBase* tensor, size_t requested_size) const { + return impl_->Alloc(tensor, requested_size); +} + +void* DeviceContext::HostAlloc(TensorBase* tensor, + DataType dtype, + size_t requested_size) const { + return impl_->HostAlloc(tensor, dtype, requested_size); +} + +template +T* DeviceContext::HostAlloc(TensorBase* tensor, size_t requested_size) const { + return impl_->HostAlloc(tensor, requested_size); +} + +#define DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(dtype) \ + template dtype* DeviceContext::Alloc(TensorBase* tensor, \ + size_t requested_size) const; \ + template dtype* DeviceContext::HostAlloc(TensorBase* tensor, \ + size_t requested_size) const; + +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(bool) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(int8_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(uint8_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(int16_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(int32_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(int64_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(float) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(double) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::bfloat16) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::float16) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128) + +#undef DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION + } // namespace pten diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h index c658a24c35..30be5cd22d 100644 --- a/paddle/pten/core/device_context.h +++ b/paddle/pten/core/device_context.h @@ -19,6 +19,7 @@ limitations under the License. */ // TODO(wilber): Do we need to use place in pten kernel? #include "paddle/pten/common/place.h" +#include "paddle/pten/common/data_type.h" #include "paddle/pten/core/allocator.h" namespace pten { @@ -31,6 +32,8 @@ class TensorBase; * DeviceContext. */ class DeviceContext { + using DataType = paddle::experimental::DataType; + public: /** * @brief Default construct. @@ -53,42 +56,61 @@ class DeviceContext { virtual ~DeviceContext(); /** - * @brief Set the deveice-releated Allocator object. + * @brief Set the device-related Allocator object. * * @param allocator */ - void SetDeviceAllocator(Allocator*); + void SetDeviceAllocator(const Allocator*); /** - * @brief Get the const deveice-releated Allocator object. + * @brief Set the host Allocator object. * - * @return Allocator + * @param allocator */ - const Allocator& GetDeviceAllocator() const; + void SetHostAllocator(const Allocator*); /** - * @brief Allocate device memory for tensor. - */ - void DeviceAlloc(pten::TensorBase*); + * @brief Set the zero-size Allocator object. + * + * @param allocator + */ + void SetZeroAllocator(const Allocator*); /** - * @brief Set the host Allocator object. + * @brief Get the const Allocator object. * - * @param allocator + * @return Allocator */ - void SetHostAllocator(Allocator*); + const Allocator& GetDeviceAllocator() const; /** - * @brief Get the const host Allocator object. + * @brief Get the const device-related Allocator object. * * @return Allocator */ const Allocator& GetHostAllocator() const; + const Allocator& GetZeroAllocator() const; + + /** + * @brief Allocate device memory for tensor. + */ + void* Alloc(TensorBase*, + DataType dtype = DataType::UNDEFINED, + size_t requested_size = 0) const; + + template + T* Alloc(TensorBase* tensor, size_t requested_size = 0) const; + /** * @brief Allocate host memory for tensor. */ - void HostAlloc(pten::TensorBase*); + void* HostAlloc(TensorBase* tensor, + DataType dtype = DataType::UNDEFINED, + size_t requested_size = 0) const; + + template + T* HostAlloc(TensorBase* tensor, size_t requested_size = 0) const; // TODO(wilber): Just for the convenience of migrating the code, it will be // modified or removed later. diff --git a/paddle/pten/core/selected_rows.cc b/paddle/pten/core/selected_rows.cc index 1dfcfa4934..7578faf614 100644 --- a/paddle/pten/core/selected_rows.cc +++ b/paddle/pten/core/selected_rows.cc @@ -91,6 +91,12 @@ struct TensorFillVisitor { int64_t size_; }; +void* SelectedRows::AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size) { + return value_->AllocateFrom(allocator, dtype, requested_size); +} + bool SelectedRows::HasKey(int64_t key) const { return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? false : true; diff --git a/paddle/pten/core/selected_rows.h b/paddle/pten/core/selected_rows.h index e12f59d02f..2f224e42ea 100644 --- a/paddle/pten/core/selected_rows.h +++ b/paddle/pten/core/selected_rows.h @@ -113,6 +113,10 @@ class SelectedRows : public TensorBase, bool auto_grown = false, bool is_test = false); + void* AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size = 0) override; + /* * @brief Get the index of the key from id_to_index_ map. If the key not * exist, diff --git a/paddle/pten/core/tensor_base.h b/paddle/pten/core/tensor_base.h index 662553cbcb..7a5e42da49 100644 --- a/paddle/pten/core/tensor_base.h +++ b/paddle/pten/core/tensor_base.h @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/pten/common/backend.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/common/layout.h" +#include "paddle/pten/core/allocator.h" #include "paddle/pten/core/ddim.h" #include "paddle/pten/core/storage.h" #include "paddle/pten/core/utils/type_registry.h" @@ -61,6 +62,16 @@ class TensorBase { /// return Whether the storage is allocated. virtual bool initialized() const = 0; + // TODO(Aurelius84): This interface is under intermediate state now. + // We will remove DataType argument in the future. Please DO NOT + // rely on Datatype to much when design and implement other feature. + + /// \brief Allocate memory with requested size from allocator. + /// \return The mutable data pointer value of type T. + virtual void* AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size = 0) = 0; + /// \brief Return the type information of the derived class to support /// safely downcast in non-rtti environment. /// return The type information of the derived class. diff --git a/paddle/pten/kernels/cpu/cast_kernel.cc b/paddle/pten/kernels/cpu/cast_kernel.cc index edb8f59e26..24371ca769 100644 --- a/paddle/pten/kernels/cpu/cast_kernel.cc +++ b/paddle/pten/kernels/cpu/cast_kernel.cc @@ -36,7 +36,7 @@ void CastKernelImpl(const CPUContext& dev_ctx, auto numel = x.numel(); auto* in_end = in_begin + numel; - auto* out_begin = out->mutable_data(dev_ctx.GetPlace()); + auto* out_begin = dev_ctx.Alloc(out); paddle::platform::Transform trans; trans(dev_ctx, diff --git a/paddle/pten/kernels/cpu/copy_kernel.cc b/paddle/pten/kernels/cpu/copy_kernel.cc index be5170f4d0..0892e3974f 100644 --- a/paddle/pten/kernels/cpu/copy_kernel.cc +++ b/paddle/pten/kernels/cpu/copy_kernel.cc @@ -37,7 +37,7 @@ void Copy(const Context& dev_ctx, << src_place; dst->Resize(src.dims()); - auto* dst_ptr = dst->mutable_data(src_place); + auto* dst_ptr = dev_ctx.Alloc(dst); if (src_ptr == dst_ptr) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc index e6ffd3b500..5cef8d0bdd 100644 --- a/paddle/pten/kernels/cpu/dot_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_kernel.cc @@ -29,7 +29,7 @@ void DotKernel(const Context& dev_ctx, DenseTensor* out) { auto const *x_ptr = x.data(), *x_ptr_ = &x_ptr[0]; auto const *y_ptr = y.data(), *y_ptr_ = &y_ptr[0]; - auto* z = out->mutable_data(dev_ctx.GetPlace()); + T* z = dev_ctx.template Alloc(out); // Loop over the total N elements of both operands while sum-reducing every // B pairs along the way where B is the dimension of the least ordered axis diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h index 179a188118..2d717414d7 100644 --- a/paddle/pten/kernels/cpu/elementwise.h +++ b/paddle/pten/kernels/cpu/elementwise.h @@ -45,10 +45,8 @@ struct SameDimsAddFunctor< const DenseTensor& y, DenseTensor* z) { auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VADD(x.numel(), - x.data(), - y.data(), - z->mutable_data(dev_ctx.GetPlace())); + blas.VADD( + x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); } }; @@ -61,7 +59,7 @@ struct SameDimsAddFunctor< const DenseTensor& x, const DenseTensor& y, DenseTensor* z) { - z->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(z); auto eigen_x = pten::EigenVector::Flatten(x); auto eigen_y = pten::EigenVector::Flatten(y); auto eigen_z = pten::EigenVector::Flatten(*z); @@ -89,10 +87,8 @@ struct SameDimsSubtractFunctor< const DenseTensor& y, DenseTensor* z) { auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VSUB(x.numel(), - x.data(), - y.data(), - z->mutable_data(dev_ctx.GetPlace())); + blas.VSUB( + x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); } }; @@ -147,10 +143,8 @@ struct SameDimsDivideFunctor< const DenseTensor& y, DenseTensor* z) { auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VDIV(x.numel(), - x.data(), - y.data(), - z->mutable_data(dev_ctx.GetPlace())); + blas.VDIV( + x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); } }; @@ -173,10 +167,8 @@ struct SameDimsMultiplyFunctor< const DenseTensor& y, DenseTensor* z) { auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VMUL(x.numel(), - x.data(), - y.data(), - z->mutable_data(dev_ctx.GetPlace())); + blas.VMUL( + x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); } }; @@ -241,8 +233,8 @@ void CommonGradBroadcastCPU(const DenseTensor& x, const T* y_data = y.data(); const Tout* out_data = out.data(); const Tout* dout_data = dout.data(); - T* dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); - T* dy_data = dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()); + T* dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); + T* dy_data = dy == nullptr ? nullptr : ctx.Alloc(dy); if (dx_data != nullptr) { memset(dx_data, 0, dx->numel() * sizeof(T)); } @@ -292,7 +284,7 @@ void CommonForwardBroadcastCPU(const DenseTensor& x, PADDLE_ENFORCE_NOT_NULL(y_data, paddle::platform::errors::InvalidArgument( "The input Y should not be empty.")); - OutType* out_data = z->mutable_data(ctx.GetPlace()); + OutType* out_data = ctx.Alloc(z); const int out_size = std::accumulate( out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); @@ -373,7 +365,7 @@ void ElementwiseCompute(const CPUContext& dev_ctx, int axis, Functor func, DenseTensor* z) { - z->mutable_data(dev_ctx.GetPlace()); + dev_ctx.Alloc(z); auto x_dims = x.dims(); auto y_dims = y.dims(); bool is_xsize_larger = true; @@ -677,32 +669,30 @@ void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, return; } if (post == 1) { - ElemwiseGradBroadcast1CPU( - x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast1CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); } else { - ElemwiseGradBroadcast2CPU( - x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - post, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast2CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); } } diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index 6d76626605..d4987e7a36 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -37,7 +37,7 @@ namespace pten { const DenseTensor& y, \ int axis, \ DenseTensor* out) { \ - out->mutable_data(dev_ctx.GetPlace()); \ + dev_ctx.template Alloc(out); \ if (x.dims() == y.dims()) { \ SameDimsElementwiseCompute>()( \ dev_ctx, x, y, out); \ @@ -85,7 +85,7 @@ void DivideRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { // allocate memory for out - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); if (x.dims() == y.dims() && std::is_floating_point::value) { SameDimsElementwiseCompute>()( dev_ctx, x, y, out); diff --git a/paddle/pten/kernels/cpu/reduce.h b/paddle/pten/kernels/cpu/reduce.h index 8f84bd0515..2b0659ac2e 100644 --- a/paddle/pten/kernels/cpu/reduce.h +++ b/paddle/pten/kernels/cpu/reduce.h @@ -119,7 +119,7 @@ void GetShuffledInput(const DeviceContext& dev_ctx, GetShuffledDim(input.dims(), &shuffled_dims, dims, &perm_axis); shuffled_input->ResizeAndAllocate(shuffled_dims); - shuffled_input->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(shuffled_input); pten::math::TransposeNormal trans; trans(dev_ctx, input, shuffled_input, perm_axis); @@ -158,7 +158,7 @@ void ReduceKernelImpl(const DeviceContext& dev_ctx, const std::vector& dims, bool keep_dim, bool reduce_all) { - output->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(output); if (reduce_all) { // Flatten and reduce 1-D tensor diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc index 774d3891b0..4f999ac4d1 100644 --- a/paddle/pten/kernels/cpu/scale_kernel.cc +++ b/paddle/pten/kernels/cpu/scale_kernel.cc @@ -33,7 +33,7 @@ void ScaleKernel(const Context& dev_ctx, bool bias_after_scale, DenseTensor* out) { // calc - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); auto eigen_out = pten::EigenVector::Flatten(*out); auto eigen_x = pten::EigenVector::Flatten(x); auto& dev = *dev_ctx.eigen_device(); diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index 6ce4998287..ecb058d35b 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -29,7 +29,7 @@ void EmptyKernel(const Context& dev_ctx, template void EmptyLikeKernel(const Context& dev_ctx, DenseTensor* out) { - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); } } // namespace pten diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 9ea27fd9c5..3f1651eeb2 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -229,7 +229,7 @@ class TransformFunctor { const bool is_xsize_larger = true) : x_(x.data()), y_(y.data()), - z_(z->mutable_data(ctx.GetPlace())), + z_(ctx.template Alloc(z)), nx_(x.numel()), ctx_(ctx), func_(func), @@ -425,8 +425,8 @@ void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, dout.data(), dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(dev_ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(dev_ctx.GetPlace())}); + dx == nullptr ? nullptr : dev_ctx.template Alloc(dx), + dy == nullptr ? nullptr : dev_ctx.template Alloc(dy)}); } inline void ElementwiseGradPreProcess(const DenseTensor &dout, @@ -631,7 +631,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx, ins_data[i] = ins[i]->data(); } for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = (*outs)[i]->mutable_data(ctx.GetPlace()); + outs_data[i] = ctx.Alloc((*outs)[i]); } #ifdef PADDLE_WITH_XPU2 int block_size = 64; diff --git a/paddle/pten/kernels/funcs/transpose.cc b/paddle/pten/kernels/funcs/transpose.cc index 13cfaedb33..7d4dc3c7ce 100644 --- a/paddle/pten/kernels/funcs/transpose.cc +++ b/paddle/pten/kernels/funcs/transpose.cc @@ -36,7 +36,7 @@ struct TransposeNormal { auto in_stride = pten::framework::stride(in.dims()); auto out_stride = pten::framework::stride(out->dims()); const T* in_ptr = in.data(); - T* out_ptr = out->mutable_data(dev_ctx.GetPlace()); + T* out_ptr = dev_ctx.template Alloc(out); auto transpose_helper = [&](int64_t beg, int64_t end) { for (int64_t out_idx = beg; out_idx < end; ++out_idx) { diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu index 24d72ca3d8..a7b7184487 100644 --- a/paddle/pten/kernels/funcs/transpose.cu +++ b/paddle/pten/kernels/funcs/transpose.cu @@ -61,7 +61,7 @@ struct TransposeNormal { auto in_stride = pten::framework::stride(in.dims()); auto out_stride = pten::framework::stride(out->dims()); auto* in_ptr = in.data(); - auto* out_ptr = out->mutable_data(dev_ctx.GetPlace()); + T* out_ptr = dev_ctx.template Alloc(out); // copy in_stride, out_stride, axis to gpu device const paddle::platform::CUDAPlace& cuda_place = dev_ctx.GetPlace(); diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index 12f246c323..81d09ef164 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -43,7 +43,7 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx, std::vector outputs; inputs.emplace_back(&x); outputs.emplace_back(out); - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.Alloc(out); pten::funcs::LaunchSameDimsElementwiseCudaKernel( diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu index 75aacc8d3d..24bd034fb1 100644 --- a/paddle/pten/kernels/gpu/dot_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_kernel.cu @@ -29,7 +29,7 @@ void DotKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, DenseTensor* out) { - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); if (1 == out->dims().size()) { auto eigen_out = pten::EigenScalar::From(*out); auto eigen_x = pten::EigenVector::Flatten(x); diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index 9a3ae7f12d..6f744212cd 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -352,7 +352,7 @@ void LaunchKernel(const KPDevice &ctx, pten::framework::Array<_ptr_ OutT *, NumOuts> outs_data; for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = (*outs)[i]->mutable_data(ctx.GetPlace()); + outs_data[i] = ctx.Alloc((*outs)[i]); } for (int i = 0; i < Arity; i++) { @@ -1264,8 +1264,8 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, const T *y_data = y.data(); const Tout *out_data = out.data(); const Tout *dout_data = dout.data(); - T *dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); - T *dy_data = dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()); + T *dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); + T *dy_data = dy == nullptr ? nullptr : ctx.Alloc(dy); std::vector x_one_indexs; std::vector y_one_indexs; @@ -1923,34 +1923,32 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, return; } if (post == 1) { - ElemwiseGradBroadcast1CUDA( - ctx.stream(), - x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast1CUDA(ctx.stream(), + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); } else { - ElemwiseGradBroadcast2CUDA( - ctx.stream(), - x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - post, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast2CUDA(ctx.stream(), + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); } } diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 996d85d3f4..37104c46a4 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -47,7 +47,7 @@ namespace pten { inputs.emplace_back(&x); \ inputs.emplace_back(&y); \ outputs.emplace_back(out); \ - out->mutable_data(dev_ctx.GetPlace()); \ + dev_ctx.template Alloc(out); \ LaunchElementwiseCudaKernel( \ dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ } diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index dd7c2f242e..5aba001267 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -54,7 +54,7 @@ void ScaleKernel(const Context& dev_ctx, std::vector outputs; inputs.emplace_back(&x); outputs.emplace_back(out); - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); pten::funcs::LaunchSameDimsElementwiseCudaKernel( diff --git a/paddle/pten/kernels/impl/complex_kernel_impl.h b/paddle/pten/kernels/impl/complex_kernel_impl.h index aa878f7e9e..7e4c4f0d66 100644 --- a/paddle/pten/kernels/impl/complex_kernel_impl.h +++ b/paddle/pten/kernels/impl/complex_kernel_impl.h @@ -26,7 +26,7 @@ void ConjKernel(const Context& dev_ctx, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - auto* out_data = out->mutable_data(dev_ctx.GetPlace()); + auto* out_data = dev_ctx.template Alloc(out); paddle::platform::ForRange for_range(dev_ctx, numel); paddle::operators::math::ConjFunctor functor(x_data, numel, out_data); diff --git a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h index d0c6cf6793..d4ea9fc944 100644 --- a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h @@ -73,7 +73,7 @@ struct DotGradFunction::From(*tensor_dout); if (tensor_dx) { - tensor_dx->mutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dx); auto y = EigenMatrix::From(*tensor_y); auto& dev = *ctx.eigen_device(); Eigen::DSizes size(1, tensor_dx->dims()[1]); @@ -85,7 +85,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dy); auto x = EigenMatrix::From(*tensor_x); auto& dev = *ctx.eigen_device(); Eigen::DSizes size(1, tensor_dy->dims()[1]); @@ -100,7 +100,7 @@ struct DotGradFunctiondata(); if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); + auto* data_dx = ctx.template Alloc(tensor_dx); const auto* data_y = tensor_y->data(); const DDim& dim = tensor_x->dims(); size_t N = static_cast(pten::framework::product(dim)); @@ -115,7 +115,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + auto* data_dy = ctx.template Alloc(tensor_dy); const auto* data_x = tensor_x->data(); const DDim& dim = tensor_y->dims(); size_t N = static_cast(pten::framework::product(dim)); @@ -164,7 +164,7 @@ struct DotGradFunction::From(*tensor_dout); if (tensor_dx) { - tensor_dx->mutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dx); auto y = EigenMatrix::From(*tensor_y); auto dx = EigenMatrix::From(*tensor_dx); auto& dev = *ctx.eigen_device(); @@ -173,7 +173,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dy); auto x = EigenMatrix::From(*tensor_x); auto dy = EigenMatrix::From(*tensor_dy); auto& dev = *ctx.eigen_device(); @@ -189,7 +189,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + auto* dx = ctx.template Alloc(tensor_dx); for (auto j = 0; j < N / B; ++j) { auto const ss = dz[j]; for (auto i = 0; i < B; ++i) *dx++ = *y++ * ss; @@ -197,7 +197,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + auto* dy = ctx.template Alloc(tensor_dy); for (auto j = 0; j < N / B; ++j) { auto const ss = dz[j]; for (auto i = 0; i < B; i++) *dy++ = *x++ * ss; @@ -272,7 +272,7 @@ struct DotDoubleGradFunctiondata(); if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); + auto* data_dx = ctx.template Alloc(tensor_dx); const auto* data_ddy = tensor_ddy->data(); const DDim& dim = tensor_dx->dims(); size_t N = static_cast(product(dim)); @@ -287,7 +287,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_dy = ctx.template Alloc(tensor_dy); const auto* data_ddx = tensor_ddx->data(); const DDim& dim = tensor_dy->dims(); size_t N = static_cast(product(dim)); @@ -302,7 +302,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_ddout = ctx.template Alloc(tensor_ddout); auto* data_x = tensor_x->data(); auto* data_y = tensor_y->data(); auto* data_ddx = tensor_ddx->data(); @@ -351,7 +351,7 @@ struct DotDoubleGradFunction::Flatten(*tensor_dout); if (tensor_dx) { - tensor_dx->mutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dx); auto ddy = EigenVector::Flatten(*tensor_ddy); Eigen::DSizes size(tensor_ddy->numel()); auto dx = EigenVector::Flatten(*tensor_dx); @@ -359,7 +359,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dy); auto ddx = EigenVector::Flatten(*tensor_ddx); Eigen::DSizes size(tensor_ddx->numel()); @@ -368,7 +368,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_ddout); auto x = EigenVector::Flatten(*tensor_x); auto y = EigenVector::Flatten(*tensor_y); auto ddx = EigenVector::Flatten(*tensor_ddx); @@ -381,7 +381,7 @@ struct DotDoubleGradFunctiondata(); if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); + auto* data_dx = ctx.template Alloc(tensor_dx); const auto* data_ddy = tensor_ddy->data(); const DDim& dim = tensor_dx->dims(); size_t N = static_cast(product(dim)); @@ -396,7 +396,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_dy = ctx.template Alloc(tensor_dy); const auto* data_ddx = tensor_ddx->data(); const DDim& dim = tensor_dy->dims(); size_t N = static_cast(product(dim)); @@ -411,7 +411,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_ddout = ctx.template Alloc(tensor_ddout); auto* data_x = tensor_x->data(); auto* data_y = tensor_y->data(); auto* data_ddx = tensor_ddx->data(); @@ -552,7 +552,7 @@ struct DotTripleGradFunctiondata(); if (out_tensor_d_x) { - auto* data_d_x = out_tensor_d_x->mutable_data(ctx.GetPlace()); + auto* data_d_x = ctx.template Alloc(out_tensor_d_x); const auto* data_ddy = in_tensor_ddy->data(); const DDim& dim = out_tensor_d_x->dims(); @@ -567,7 +567,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_y = ctx.template Alloc(out_tensor_d_y); const auto* data_ddx = in_tensor_ddx->data(); const DDim& dim = out_tensor_d_y->dims(); @@ -582,7 +582,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_dout = ctx.template Alloc(out_tensor_d_dout); auto* data_ddx = in_tensor_ddx->data(); auto* data_ddy = in_tensor_ddy->data(); auto* data_d_dx = in_tensor_d_dx->data(); @@ -613,7 +613,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_ddx = ctx.template Alloc(out_tensor_d_ddx); auto* data_dout = in_tensor_dout->data(); auto* data_d_dy = in_tensor_d_dy->data(); auto* data_y = in_tensor_y->data(); @@ -633,7 +633,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_ddy = ctx.template Alloc(out_tensor_d_ddy); auto* data_dout = in_tensor_dout->data(); auto* data_d_dx = in_tensor_d_dx->data(); auto* data_x = in_tensor_x->data(); @@ -678,7 +678,7 @@ struct DotTripleGradFunction::Flatten(*in_tensor_d_ddout); if (out_tensor_d_x) { - out_tensor_d_x->mutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_x); auto ddy = EigenVector::Flatten(*in_tensor_ddy); Eigen::DSizes size(in_tensor_ddy->numel()); auto d_x = EigenVector::Flatten(*out_tensor_d_x); @@ -686,7 +686,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_y); auto ddx = EigenVector::Flatten(*in_tensor_ddx); Eigen::DSizes size(in_tensor_ddx->numel()); @@ -695,7 +695,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_dout); auto ddx = EigenVector::Flatten(*in_tensor_ddx); auto ddy = EigenVector::Flatten(*in_tensor_ddy); auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); @@ -705,7 +705,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_ddx); auto dout = EigenVector::Flatten(*in_tensor_dout); auto y = EigenVector::Flatten(*in_tensor_y); auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); @@ -717,7 +717,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_ddy); auto dout = EigenVector::Flatten(*in_tensor_dout); auto x = EigenVector::Flatten(*in_tensor_x); auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); @@ -732,7 +732,7 @@ struct DotTripleGradFunctiondata(); if (out_tensor_d_x) { - auto* data_d_x = out_tensor_d_x->mutable_data(ctx.GetPlace()); + auto* data_d_x = ctx.template Alloc(out_tensor_d_x); const auto* data_ddy = in_tensor_ddy->data(); const DDim& dim = out_tensor_d_x->dims(); @@ -747,7 +747,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_y = ctx.template Alloc(out_tensor_d_y); const auto* data_ddx = in_tensor_ddx->data(); const DDim& dim = out_tensor_d_y->dims(); @@ -762,7 +762,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_dout = ctx.template Alloc(out_tensor_d_dout); auto* data_ddx = in_tensor_ddx->data(); auto* data_ddy = in_tensor_ddy->data(); auto* data_d_dx = in_tensor_d_dx->data(); @@ -790,7 +790,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_ddx = ctx.template Alloc(out_tensor_d_ddx); auto* data_dout = in_tensor_dout->data(); auto* data_d_dy = in_tensor_d_dy->data(); auto* data_y = in_tensor_y->data(); @@ -809,7 +809,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_ddy = ctx.template Alloc(out_tensor_d_ddy); auto* data_dout = in_tensor_dout->data(); auto* data_d_dx = in_tensor_d_dx->data(); auto* data_x = in_tensor_x->data(); @@ -838,10 +838,10 @@ void DotGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* dy) { if (dx) { - dx->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(dx); } if (dy) { - dy->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(dy); } DotGradFunction()(dev_ctx, &x, &y, &dout, dx, dy); } @@ -857,13 +857,13 @@ void DotDoubleGradKernel(const Context& dev_ctx, DenseTensor* dy, DenseTensor* ddout) { if (dx) { - dx->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(dx); } if (dy) { - dy->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(dy); } if (ddout) { - ddout->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(ddout); } DotDoubleGradFunction()( dev_ctx, &x, &y, &dout, ddx, ddy, dx, dy, ddout); @@ -885,19 +885,19 @@ void DotTripleGradKernel(const Context& dev_ctx, DenseTensor* d_ddy, DenseTensor* d_dout) { if (d_x) { - d_x->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_x); } if (d_y) { - d_y->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_y); } if (d_ddx) { - d_ddx->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_ddx); } if (d_ddy) { - d_ddy->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_ddy); } if (d_dout) { - d_dout->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_dout); } DotTripleGradFunction()(dev_ctx, diff --git a/paddle/pten/kernels/impl/full_kernel_impl.h b/paddle/pten/kernels/impl/full_kernel_impl.h index 4fee23e175..4fbe9f34e5 100644 --- a/paddle/pten/kernels/impl/full_kernel_impl.h +++ b/paddle/pten/kernels/impl/full_kernel_impl.h @@ -26,7 +26,7 @@ namespace pten { template void FullValue(const Context& dev_ctx, DenseTensor* tensor, VType val) { - tensor->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(tensor); auto t = pten::EigenVector::Flatten(*tensor); t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(val)); } diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h index fbcb073150..87785a2b47 100644 --- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h @@ -105,7 +105,7 @@ void MatMul(const Context& dev_ctx, bool trans_b, DenseTensor* out, bool flag = false) { - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); auto blas = paddle::operators::math::GetBlas(dev_ctx); auto mat_dim_a = paddle::operators::math::CreateMatrixDescriptor(a.dims(), 0, trans_a); @@ -123,7 +123,7 @@ void MatMul(const Context& dev_ctx, b.data(), mat_dim_b, static_cast(1), - out->data(), + dev_ctx.template Alloc(out), static_cast(flag)); } @@ -242,8 +242,8 @@ void MatmulGradKernel(const Context& dev_ctx, // Case1 : x's or y's dim = 1 if (x_ndim == 1 && y_ndim == 1) { - if (dx) dx->mutable_data(dev_ctx.GetPlace()); - if (dy) dy->mutable_data(dev_ctx.GetPlace()); + if (dx) dev_ctx.template Alloc(dx); + if (dy) dev_ctx.template Alloc(dy); if (out_grad.numel() == 1) { DotGradFunction()(dev_ctx, &x, &y, &out_grad, dx, dy); return; diff --git a/paddle/pten/kernels/impl/matmul_kernel_impl.h b/paddle/pten/kernels/impl/matmul_kernel_impl.h index e59a54c703..858807a1d4 100644 --- a/paddle/pten/kernels/impl/matmul_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_kernel_impl.h @@ -118,7 +118,7 @@ void MatMulFunction(const Context& dev_ctx, N)); VLOG(3) << "MatMul's case 1"; Out->Resize({1}); - Out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(Out); blas.GEMM(CblasNoTrans, CblasTrans, 1, @@ -128,7 +128,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); return; } @@ -165,7 +165,7 @@ void MatMulFunction(const Context& dev_ctx, out_dims.back() = y_dims.back(); } Out->ResizeAndAllocate(pten::framework::make_ddim(out_dims)); - Out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(Out); if (trans_y) { const int M = Y.numel() / N; VLOG(3) << "MatMul's case 2"; @@ -176,7 +176,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { const int M = y_dims[y_ndim - 1]; const int batch_size = Y.numel() / (M * N); @@ -189,7 +189,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 4"; blas.BatchedGEMM(CblasTrans, @@ -201,7 +201,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), batch_size, M * N, 0); @@ -243,7 +243,7 @@ void MatMulFunction(const Context& dev_ctx, std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); } Out->ResizeAndAllocate(pten::framework::make_ddim(out_dims)); - Out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(Out); if (trans_x) { const int M = x_dims[x_ndim - 1]; @@ -257,7 +257,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 6"; blas.BatchedGEMM(CblasTrans, @@ -269,7 +269,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), batch_size, M * N, 0); @@ -284,7 +284,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } return; } @@ -331,7 +331,7 @@ void MatMulFunction(const Context& dev_ctx, out_broadcast_dims[ndim - 1] = N; Out->ResizeAndAllocate(pten::framework::make_ddim(out_broadcast_dims)); - Out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(Out); const int batch_dim = ndim - 2; // broadcast message @@ -367,7 +367,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else if (x_batch_size == 1) { if (M == 1 && trans_y) { VLOG(3) << "MatMul's case 9"; @@ -378,7 +378,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 10"; blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, @@ -390,7 +390,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), out_batch_size, 0, K * N); @@ -407,7 +407,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 12"; blas.BatchedGEMM(CblasTrans, @@ -419,7 +419,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), out_batch_size, M * K, 0); @@ -435,7 +435,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), out_batch_size, M * K, K * N); @@ -454,7 +454,7 @@ void MatMulFunction(const Context& dev_ctx, x_ptr[i] = x_data + x_index * M * K; y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = Out->data() + i * M * N; + out_ptr[i] = dev_ctx.template Alloc(Out) + i * M * N; IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); } VLOG(3) << "MatMul's case 14"; diff --git a/paddle/pten/kernels/impl/sign_kernel_impl.h b/paddle/pten/kernels/impl/sign_kernel_impl.h index 54c1464c9e..87efacccc9 100644 --- a/paddle/pten/kernels/impl/sign_kernel_impl.h +++ b/paddle/pten/kernels/impl/sign_kernel_impl.h @@ -26,7 +26,7 @@ template void SignKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); auto eigen_out = pten::EigenVector::Flatten(*out); auto eigen_x = pten::EigenVector::Flatten(x); diff --git a/paddle/pten/kernels/reshape_kernel.cc b/paddle/pten/kernels/reshape_kernel.cc index 4b706e9e68..a76dfb09a0 100644 --- a/paddle/pten/kernels/reshape_kernel.cc +++ b/paddle/pten/kernels/reshape_kernel.cc @@ -32,7 +32,7 @@ void ReshapeKernel(const Context& dev_ctx, return; } out->set_meta(out_meta); - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.Alloc(out); pten::Copy(dev_ctx, x, false, out); out->Resize(out_meta.dims); out->ResetLoD(x.lod()); diff --git a/paddle/pten/kernels/xpu/copy_kernel.cc b/paddle/pten/kernels/xpu/copy_kernel.cc index 56b79061f7..56ad19f0cc 100644 --- a/paddle/pten/kernels/xpu/copy_kernel.cc +++ b/paddle/pten/kernels/xpu/copy_kernel.cc @@ -30,7 +30,7 @@ void Copy(const Context& dev_ctx, bool blocking, DenseTensor* dst) { auto* src_ptr = src.data(); - auto* dst_ptr = dst->mutable_data(dev_ctx.GetPlace()); + auto* dst_ptr = dev_ctx.Alloc(dst); const auto& src_place = src.place(); const auto& dst_place = dst->place(); diff --git a/paddle/pten/tests/kernels/test_cast_dev_api.cc b/paddle/pten/tests/kernels/test_cast_dev_api.cc index c9d376b81a..33d27ca5b1 100644 --- a/paddle/pten/tests/kernels/test_cast_dev_api.cc +++ b/paddle/pten/tests/kernels/test_cast_dev_api.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/kernels/cast_kernel.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/dense_tensor.h" @@ -48,6 +49,11 @@ TEST(DEV_API, cast) { } pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + pten::DataType out_dtype = pten::DataType::FLOAT64; // 2. test API auto out = pten::Cast(dev_ctx, dense_x, out_dtype); diff --git a/paddle/pten/tests/kernels/test_concat_dev_api.cc b/paddle/pten/tests/kernels/test_concat_dev_api.cc index 6f9ea1b0d9..eb546e992e 100644 --- a/paddle/pten/tests/kernels/test_concat_dev_api.cc +++ b/paddle/pten/tests/kernels/test_concat_dev_api.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/pten/kernels/concat_kernel.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" @@ -56,6 +57,10 @@ TEST(DEV_API, concat) { // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); auto out = pten::Concat(dev_ctx, inputs, 0); // 3. check result diff --git a/paddle/pten/tests/kernels/test_conj_dev_api.cc b/paddle/pten/tests/kernels/test_conj_dev_api.cc index 6714b57105..e43769dfb2 100644 --- a/paddle/pten/tests/kernels/test_conj_dev_api.cc +++ b/paddle/pten/tests/kernels/test_conj_dev_api.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/kernels/complex_kernel.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" @@ -44,6 +45,10 @@ TEST(DEV_API, conj) { } pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); // 2. test API auto out = pten::Conj(dev_ctx, dense_x); diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc index 01dfa925d6..29f68513fa 100644 --- a/paddle/pten/tests/kernels/test_copy_dev_api.cc +++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/copy_kernel.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" @@ -57,6 +58,10 @@ TEST(DEV_API, copy) { std::cout << typeid(a).name() << std::endl; // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); pten::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get()); // 3. check result diff --git a/paddle/pten/tests/kernels/test_creation_dev_api.cc b/paddle/pten/tests/kernels/test_creation_dev_api.cc index 17416d3347..8b37c41d0b 100644 --- a/paddle/pten/tests/kernels/test_creation_dev_api.cc +++ b/paddle/pten/tests/kernels/test_creation_dev_api.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/pten/kernels/empty_kernel.h" #include "paddle/pten/kernels/full_kernel.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" @@ -32,6 +33,10 @@ using DDim = pten::framework::DDim; TEST(DEV_API, empty) { // 1. create input pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); // 2. test API auto out = pten::Empty(dev_ctx, {3, 2}, pten::DataType::INT32); @@ -58,6 +63,10 @@ TEST(DEV_API, empty_like) { // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); auto out = pten::EmptyLike(dev_ctx, dense_x); // 3. check result @@ -74,6 +83,10 @@ TEST(DEV_API, full) { // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); auto out = pten::Full(dev_ctx, {3, 2}, val, pten::DataType::FLOAT32); // 3. check result @@ -103,6 +116,10 @@ TEST(DEV_API, full_like) { float val = 1.0; pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); // 2. test API auto out = pten::FullLike(dev_ctx, dense_x, val); diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc index 27fecd3fcd..c1f7d6aaba 100644 --- a/paddle/pten/tests/kernels/test_dot_dev_api.cc +++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/kernels/dot_kernel.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" @@ -57,6 +58,10 @@ TEST(DEV_API, dot) { // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); auto out = pten::Dot(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc index b3948843ee..9d4c86f026 100644 --- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/kernels/math_kernel.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" @@ -59,6 +60,10 @@ TEST(DEV_API, add) { // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); auto dense_out = pten::Add(dev_ctx, dense_x, dense_y); // 3. check result @@ -107,6 +112,10 @@ TEST(DEV_API, subtract) { // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); auto dense_out = pten::Subtract(dev_ctx, dense_x, dense_y); // 3. check result @@ -155,6 +164,10 @@ TEST(DEV_API, divide) { // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); auto dense_out = pten::Divide(dev_ctx, dense_x, dense_y); // 3. check result @@ -203,6 +216,10 @@ TEST(DEV_API, multiply) { // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); auto dense_out = pten::Multiply(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc index fc463d1ff1..2ebf10916b 100644 --- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/kernels/flatten_kernel.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" @@ -55,6 +56,10 @@ TEST(DEV_API, flatten) { } int start_axis = 1, stop_axis = 2; pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); // 2. test API auto out = pten::Flatten(dev_ctx, dense_x, start_axis, stop_axis); diff --git a/paddle/pten/tests/kernels/test_matmul_dev_api.cc b/paddle/pten/tests/kernels/test_matmul_dev_api.cc index 40419ecb3a..87c91b1008 100644 --- a/paddle/pten/tests/kernels/test_matmul_dev_api.cc +++ b/paddle/pten/tests/kernels/test_matmul_dev_api.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/pten/kernels/matmul_kernel.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" @@ -54,6 +55,10 @@ TEST(DEV_API, dot) { // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); auto out = Matmul(dev_ctx, dense_x, dense_y, false, false); // 3. check result diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc index 786492d3a1..3abf54d26a 100644 --- a/paddle/pten/tests/kernels/test_mean_dev_api.cc +++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/pten/kernels/math_kernel.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" @@ -47,6 +48,10 @@ TEST(DEV_API, mean) { // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); auto out = pten::Mean(dev_ctx, dense_x, dims, false); // 3. check result diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc index ac2bb60cf9..fe9b09c255 100644 --- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/pten/kernels/reshape_kernel.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" @@ -47,6 +48,10 @@ TEST(DEV_API, reshape) { // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); auto out = pten::Reshape(dev_ctx, dense_x, shape); // 3. check result std::vector expect_shape = {12, 3}; diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc index abb592cde3..80f1295009 100644 --- a/paddle/pten/tests/kernels/test_scale_dev_api.cc +++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/pten/kernels/scale_kernel.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" @@ -47,6 +48,10 @@ TEST(DEV_API, scale) { // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); auto out = pten::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); @@ -85,6 +90,10 @@ TEST(DEV_API, scale_host) { // 2. test API pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); auto out = pten::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); diff --git a/paddle/pten/tests/kernels/test_sum_dev_api.cc b/paddle/pten/tests/kernels/test_sum_dev_api.cc index 595f0b9692..9b48d8908f 100644 --- a/paddle/pten/tests/kernels/test_sum_dev_api.cc +++ b/paddle/pten/tests/kernels/test_sum_dev_api.cc @@ -17,10 +17,10 @@ limitations under the License. */ #include "paddle/pten/kernels/math_kernel.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" - namespace pten { namespace tests { @@ -46,6 +46,10 @@ TEST(DEV_API, sum) { std::vector axis = {0, 1}; pten::CPUContext dev_ctx; + dev_ctx.SetDeviceAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); // 2. test API auto out = pten::Sum(dev_ctx, dense_x, axis, pten::DataType::FLOAT32, false); diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index 48f95472c7..1e856a0fe9 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -13,6 +13,7 @@ # limitations under the License. import os +os.environ['FLAGS_use_stream_safe_cuda_allocator'] = "true" import sys import unittest import paddle -- GitLab