diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 7cdac0de6138f13325500759c0ca2a392e2000f9..0f725a454c8448ff354fd119099f09a9eaeda9dc 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -840,6 +840,28 @@ void* AllocatorFacade::GetBasePtr( return m_->GetBasePtr(allocation); } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +const std::shared_ptr& AllocatorFacade::GetAllocator( + const platform::Place& place, const gpuStream_t& stream) { + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && + FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_->GetAllocator(place, + /* A non-zero num to choose allocator_ */ 1); + } +#endif + return m_->GetAllocator(place, stream, /*create_if_not_found=*/true); + } + return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); +} +#endif + +const std::shared_ptr& AllocatorFacade::GetZeroAllocator( + const platform::Place& place) { + return m_->GetAllocator(place, /* zero size */ 0); +} + std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size) { return std::shared_ptr(Alloc(place, size)); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index a9b92e1801e4a3c74941388f864172f078d7128a..f4aea98003abe077d34415604980a2f165a3fd83 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -53,6 +53,14 @@ class AllocatorFacade { void* GetBasePtr(const std::shared_ptr& allocation); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + const std::shared_ptr& GetAllocator(const platform::Place& place, + const gpuStream_t& stream); +#endif + + const std::shared_ptr& GetZeroAllocator( + const platform::Place& place); + // Allocate a shared allocation. std::shared_ptr AllocShared(const platform::Place& place, size_t size); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 142e30d161ccadf3c3cb55eee430597e60d50624..fdd9883c2c9244c8b7a4dc9d623974b36d43dd0d 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -23,6 +23,7 @@ limitations under the License. */ #endif #include "glog/logging.h" #include "paddle/fluid/framework/expect.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -136,11 +137,39 @@ inline void EmplaceDeviceContext( map_ptr, platform::Place p) { using PtrType = std::unique_ptr; - map_ptr->emplace(p, std::async(std::launch::deferred, [=] { - // lazy evaluation. i.e., only create device context at - // first `Get` - return PtrType(new DevCtx(p)); - })); + map_ptr->emplace( + p, std::async(std::launch::deferred, [=] { + // lazy evaluation. 
i.e., only create device context at + // first `Get` + auto* dev_ctx = new DevCtx(p); + if (is_gpu_place(p)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + auto* cuda_ctx = dynamic_cast(dev_ctx); + PADDLE_ENFORCE_NOT_NULL( + cuda_ctx, + platform::errors::InvalidArgument( + "Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); + dev_ctx->SetDeviceAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p, cuda_ctx->context()->RawStream()) + .get()); +#endif + } else { + dev_ctx->SetDeviceAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p) + .get()); + } + dev_ctx->SetHostAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CPUPlace()) + .get()); + dev_ctx->SetZeroAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(p) + .get()); + return PtrType(dev_ctx); + })); } DeviceContextPool::DeviceContextPool( diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index 15f9f0bda3c25e2b8a4125d1025d8b0a673f2dc5..7373ba79c0a5cfeb537ecda7ec318e533e23b555 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -68,6 +68,45 @@ bool DenseTensor::IsSharedWith(const DenseTensor& b) const { return holder_ && holder_ == b.Holder(); } +void* DenseTensor::AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size) { + PADDLE_ENFORCE_NOT_NULL( + allocator, + paddle::platform::errors::InvalidArgument( + "Required allocator shall not be nullptr, but received nullptr.")); + if (this->dtype() != dtype) { + VLOG(10) << "change data type in mutbale_data, target dtype - " << dtype; + meta_.dtype = dtype; + } + PADDLE_ENFORCE( + valid(), + paddle::platform::errors::PreconditionNotMet( + "The meta data must be valid when call the mutable data function.")); + size_t bytes = numel() * SizeOf(this->dtype()); + if (requested_size) { + PADDLE_ENFORCE_GE(requested_size, + bytes, + paddle::platform::errors::InvalidArgument( + "The reserved size %d should be enough to meet the " + "volume required by metadata %d.", + requested_size, + bytes)); + bytes = requested_size; + } + // TODO(paddle-dev): In case of the allocator of storage_ is different with + // the incoming allocator, we should re-alloc data using the incoming + // allocator. + if (!holder_ || holder_->size() < bytes + meta_.offset) { + meta_.offset = 0; + VLOG(10) << "Allocate data with bytes: " << bytes; + ResetHolder(allocator->Allocate(bytes)); + } + + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + meta_.offset); +} + template const T* DenseTensor::data() const { check_memory_size(); diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 2823441f97da2a784d6fb175429a0496e50d6aaa..fbecbcf0a1f2682abc87c7dbf605ead291fdf8fb 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -124,6 +124,12 @@ class DenseTensor : public TensorBase, /// return Whether the storage is allocated. bool initialized() const override { return holder_ && holder_->ptr(); } + /// \brief Allocate memory with requested size from allocator. + /// \return The mutable data pointer value of type T. + void* AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size = 0) override; + /// \brief Check if storage is shared with other objects. /// \return Whether the storage is shared with other objects. 
bool IsSharedWith(const DenseTensor& b) const; diff --git a/paddle/pten/core/device_context.cc b/paddle/pten/core/device_context.cc index 7566b351bf63401acba3bad247b10bd7bb3c9cf1..d6e01c5c6e66494b71d5d7fad763bb5dd7b5b138 100644 --- a/paddle/pten/core/device_context.cc +++ b/paddle/pten/core/device_context.cc @@ -13,45 +13,119 @@ // limitations under the License. #include "paddle/pten/core/device_context.h" -#include "paddle/pten/api/ext/exception.h" +#include "paddle/pten/core/enforce.h" +#include "paddle/pten/core/tensor_base.h" namespace pten { +using DataType = paddle::experimental::DataType; struct DeviceContext::Impl { Impl() = default; ~Impl() = default; - void SetDeviceAllocator(Allocator* allocator) { + void SetDeviceAllocator(const Allocator* allocator) { + PADDLE_ENFORCE_NOT_NULL( + allocator, + pten::errors::InvalidArgument( + "Required allocator shall not be nullptr, but received nullptr.")); device_allocator_ = allocator; } - void SetHostAllocator(Allocator* allocator) { host_allocator_ = allocator; } + void SetHostAllocator(const Allocator* allocator) { + PADDLE_ENFORCE_NOT_NULL( + allocator, + pten::errors::InvalidArgument( + "Required allocator shall not be nullptr, but received nullptr.")); + host_allocator_ = allocator; + } + + void SetZeroAllocator(const Allocator* allocator) { + PADDLE_ENFORCE_NOT_NULL( + allocator, + pten::errors::InvalidArgument( + "Required allocator shall not be nullptr, but received nullptr.")); + zero_allocator_ = allocator; + } const Allocator& GetDeviceAllocator() const { - PD_CHECK(device_allocator_ != nullptr, "the device_allocator is nullptr."); + PADDLE_ENFORCE_NOT_NULL( + device_allocator_, + pten::errors::InvalidArgument("Required device_allocator_ shall not be " + "nullptr, but received nullptr.")); return *device_allocator_; } const Allocator& GetHostAllocator() const { - PD_CHECK(host_allocator_ != nullptr, "the host_allocator is nullptr."); + PADDLE_ENFORCE_NOT_NULL( + host_allocator_, + pten::errors::InvalidArgument("Required host_allocator_ shall not be " + "nullptr, but received nullptr.")); return *host_allocator_; } - // TODO(Wilber): Add impl. It seems that tensorbase not have interface to - // communicate with allocator. - void HostAlloc(TensorBase* tensor) {} - void DeviceAlloc(TensorBase* tensor) {} + const Allocator& GetZeroAllocator() const { + PADDLE_ENFORCE_NOT_NULL( + zero_allocator_, + pten::errors::InvalidArgument("Required host_allocator_ shall not be " + "nullptr, but received nullptr.")); + return *zero_allocator_; + } + + void* Alloc(TensorBase* tensor, + DataType dtype = DataType::UNDEFINED, + size_t requested_size = 0) const { + PADDLE_ENFORCE_NOT_NULL( + tensor, + pten::errors::InvalidArgument( + "Required tensor shall not be nullptr, but received nullptr.")); + if (dtype == DataType::UNDEFINED) { + dtype = tensor->dtype(); + } + auto* allocator = + tensor->numel() == 0 ? 
zero_allocator_ : device_allocator_; + return tensor->AllocateFrom( + const_cast(allocator), dtype, requested_size); + } + + template + T* Alloc(TensorBase* tensor, size_t requested_size = 0) const { + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); + return static_cast(Alloc(tensor, dtype, requested_size)); + } - Allocator* device_allocator_{nullptr}; - Allocator* host_allocator_{nullptr}; + void* HostAlloc(TensorBase* tensor, + DataType dtype = DataType::UNDEFINED, + size_t requested_size = 0) const { + PADDLE_ENFORCE_NOT_NULL( + tensor, + pten::errors::InvalidArgument( + "Required tensor shall not be nullptr, but received nullptr.")); + if (dtype == DataType::UNDEFINED) { + dtype = tensor->dtype(); + } + auto* allocator = tensor->numel() == 0 ? zero_allocator_ : host_allocator_; + return tensor->AllocateFrom( + const_cast(allocator), dtype, requested_size); + } + + template + T* HostAlloc(pten::TensorBase* tensor, size_t requested_size = 0) const { + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); + return static_cast(HostAlloc(tensor, dtype, requested_size)); + } + + private: + const Allocator* device_allocator_{nullptr}; + const Allocator* host_allocator_{nullptr}; + const Allocator* zero_allocator_{nullptr}; }; DeviceContext::DeviceContext() { impl_ = std::make_unique(); } DeviceContext::DeviceContext(const DeviceContext& other) { - impl_->SetDeviceAllocator( - const_cast(&other.GetDeviceAllocator())); - impl_->SetHostAllocator(const_cast(&other.GetHostAllocator())); + impl_->SetHostAllocator(&other.GetHostAllocator()); + impl_->SetDeviceAllocator(&other.GetDeviceAllocator()); + impl_->SetZeroAllocator(&other.GetZeroAllocator()); } DeviceContext::DeviceContext(DeviceContext&& other) { @@ -60,26 +134,71 @@ DeviceContext::DeviceContext(DeviceContext&& other) { DeviceContext::~DeviceContext() = default; -void DeviceContext::SetHostAllocator(Allocator* allocator) { - impl_->SetHostAllocator(allocator); +void DeviceContext::SetDeviceAllocator(const Allocator* allocator) { + impl_->SetDeviceAllocator(allocator); } -void DeviceContext::SetDeviceAllocator(Allocator* allocator) { - impl_->SetDeviceAllocator(allocator); +const Allocator& DeviceContext::GetDeviceAllocator() const { + return impl_->GetDeviceAllocator(); +} + +void DeviceContext::SetHostAllocator(const Allocator* allocator) { + impl_->SetHostAllocator(allocator); } const Allocator& DeviceContext::GetHostAllocator() const { return impl_->GetHostAllocator(); } -const Allocator& DeviceContext::GetDeviceAllocator() const { - return impl_->GetDeviceAllocator(); +void DeviceContext::SetZeroAllocator(const Allocator* allocator) { + impl_->SetZeroAllocator(allocator); } -void DeviceContext::HostAlloc(TensorBase* tensor) { impl_->HostAlloc(tensor); } +const Allocator& DeviceContext::GetZeroAllocator() const { + return impl_->GetZeroAllocator(); +} -void DeviceContext::DeviceAlloc(TensorBase* tensor) { - impl_->DeviceAlloc(tensor); +void* DeviceContext::Alloc(TensorBase* tensor, + DataType dtype, + size_t requested_size) const { + return impl_->Alloc(tensor, dtype, requested_size); } +template +T* DeviceContext::Alloc(TensorBase* tensor, size_t requested_size) const { + return impl_->Alloc(tensor, requested_size); +} + +void* DeviceContext::HostAlloc(TensorBase* tensor, + DataType dtype, + size_t requested_size) const { + return impl_->HostAlloc(tensor, dtype, requested_size); +} + +template +T* DeviceContext::HostAlloc(TensorBase* tensor, size_t requested_size) const { + return impl_->HostAlloc(tensor, 
requested_size); +} + +#define DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(dtype) \ + template dtype* DeviceContext::Alloc(TensorBase* tensor, \ + size_t requested_size) const; \ + template dtype* DeviceContext::HostAlloc(TensorBase* tensor, \ + size_t requested_size) const; + +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(bool) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(int8_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(uint8_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(int16_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(int32_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(int64_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(float) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(double) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::bfloat16) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::float16) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128) + +#undef DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION + } // namespace pten diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h index c658a24c3527d50efacc9b2b768ac8f07c07b338..30be5cd22dd4e07980d939beca34fad4d9aa00e3 100644 --- a/paddle/pten/core/device_context.h +++ b/paddle/pten/core/device_context.h @@ -19,6 +19,7 @@ limitations under the License. */ // TODO(wilber): Do we need to use place in pten kernel? #include "paddle/pten/common/place.h" +#include "paddle/pten/common/data_type.h" #include "paddle/pten/core/allocator.h" namespace pten { @@ -31,6 +32,8 @@ class TensorBase; * DeviceContext. */ class DeviceContext { + using DataType = paddle::experimental::DataType; + public: /** * @brief Default construct. @@ -53,42 +56,61 @@ class DeviceContext { virtual ~DeviceContext(); /** - * @brief Set the deveice-releated Allocator object. + * @brief Set the device-related Allocator object. * * @param allocator */ - void SetDeviceAllocator(Allocator*); + void SetDeviceAllocator(const Allocator*); /** - * @brief Get the const deveice-releated Allocator object. + * @brief Set the host Allocator object. * - * @return Allocator + * @param allocator */ - const Allocator& GetDeviceAllocator() const; + void SetHostAllocator(const Allocator*); /** - * @brief Allocate device memory for tensor. - */ - void DeviceAlloc(pten::TensorBase*); + * @brief Set the zero-size Allocator object. + * + * @param allocator + */ + void SetZeroAllocator(const Allocator*); /** - * @brief Set the host Allocator object. + * @brief Get the const Allocator object. * - * @param allocator + * @return Allocator */ - void SetHostAllocator(Allocator*); + const Allocator& GetDeviceAllocator() const; /** - * @brief Get the const host Allocator object. + * @brief Get the const device-related Allocator object. * * @return Allocator */ const Allocator& GetHostAllocator() const; + const Allocator& GetZeroAllocator() const; + + /** + * @brief Allocate device memory for tensor. + */ + void* Alloc(TensorBase*, + DataType dtype = DataType::UNDEFINED, + size_t requested_size = 0) const; + + template + T* Alloc(TensorBase* tensor, size_t requested_size = 0) const; + /** * @brief Allocate host memory for tensor. */ - void HostAlloc(pten::TensorBase*); + void* HostAlloc(TensorBase* tensor, + DataType dtype = DataType::UNDEFINED, + size_t requested_size = 0) const; + + template + T* HostAlloc(TensorBase* tensor, size_t requested_size = 0) const; // TODO(wilber): Just for the convenience of migrating the code, it will be // modified or removed later. 
diff --git a/paddle/pten/core/selected_rows.cc b/paddle/pten/core/selected_rows.cc index 1dfcfa49347b50d305c2b37ccc4379eedb08a107..7578faf6143daa5cf4914505f413ff27aa6860e1 100644 --- a/paddle/pten/core/selected_rows.cc +++ b/paddle/pten/core/selected_rows.cc @@ -91,6 +91,12 @@ struct TensorFillVisitor { int64_t size_; }; +void* SelectedRows::AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size) { + return value_->AllocateFrom(allocator, dtype, requested_size); +} + bool SelectedRows::HasKey(int64_t key) const { return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? false : true; diff --git a/paddle/pten/core/selected_rows.h b/paddle/pten/core/selected_rows.h index e12f59d02f2ba21054700248404640730614b277..2f224e42ea070e8888510c5e34813d3974327f7b 100644 --- a/paddle/pten/core/selected_rows.h +++ b/paddle/pten/core/selected_rows.h @@ -113,6 +113,10 @@ class SelectedRows : public TensorBase, bool auto_grown = false, bool is_test = false); + void* AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size = 0) override; + /* * @brief Get the index of the key from id_to_index_ map. If the key not * exist, diff --git a/paddle/pten/core/tensor_base.h b/paddle/pten/core/tensor_base.h index 662553cbcb5986daae13c11cb43b2ecf36bc12c2..7a5e42da4908b2509ab3ce205650d7ec89d5d1f6 100644 --- a/paddle/pten/core/tensor_base.h +++ b/paddle/pten/core/tensor_base.h @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/pten/common/backend.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/common/layout.h" +#include "paddle/pten/core/allocator.h" #include "paddle/pten/core/ddim.h" #include "paddle/pten/core/storage.h" #include "paddle/pten/core/utils/type_registry.h" @@ -61,6 +62,16 @@ class TensorBase { /// return Whether the storage is allocated. virtual bool initialized() const = 0; + // TODO(Aurelius84): This interface is under intermediate state now. + // We will remove DataType argument in the future. Please DO NOT + // rely on Datatype to much when design and implement other feature. + + /// \brief Allocate memory with requested size from allocator. + /// \return The mutable data pointer value of type T. + virtual void* AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size = 0) = 0; + /// \brief Return the type information of the derived class to support /// safely downcast in non-rtti environment. /// return The type information of the derived class. 
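For context, `TensorBase::AllocateFrom` introduced above is the hook that the context-level `Alloc`/`HostAlloc` calls bottom out in. A minimal sketch of driving it directly on a `DenseTensor`; the helper below is illustrative only, and in kernels the allocator pointer comes from the `DeviceContext`, as in `Impl::Alloc` above:

```cpp
#include "paddle/pten/core/allocator.h"
#include "paddle/pten/core/dense_tensor.h"

// Illustrative helper, not part of this change. Assumes out->dims() has
// already been set so numel() is meaningful.
float* AllocateAsFloat(pten::DenseTensor* out,
                       pten::Allocator* alloc,
                       size_t requested_size = 0) {
  // Reserves numel() * sizeof(float) bytes (or requested_size, which must be
  // at least that large); the existing holder is reused unless it is missing
  // or too small for the request.
  return static_cast<float*>(
      out->AllocateFrom(alloc, pten::DataType::FLOAT32, requested_size));
}
```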
diff --git a/paddle/pten/kernels/cpu/cast_kernel.cc b/paddle/pten/kernels/cpu/cast_kernel.cc index edb8f59e2677199dde7ca1a0ae7fed76e655e81f..24371ca7690de6ff45020499a9ca667e42934bae 100644 --- a/paddle/pten/kernels/cpu/cast_kernel.cc +++ b/paddle/pten/kernels/cpu/cast_kernel.cc @@ -36,7 +36,7 @@ void CastKernelImpl(const CPUContext& dev_ctx, auto numel = x.numel(); auto* in_end = in_begin + numel; - auto* out_begin = out->mutable_data(dev_ctx.GetPlace()); + auto* out_begin = dev_ctx.Alloc(out); paddle::platform::Transform trans; trans(dev_ctx, diff --git a/paddle/pten/kernels/cpu/copy_kernel.cc b/paddle/pten/kernels/cpu/copy_kernel.cc index be5170f4d05aab459df45fc6c36e0f34511c22b0..0892e3974febd1bc4c8ac890cd123066b960e382 100644 --- a/paddle/pten/kernels/cpu/copy_kernel.cc +++ b/paddle/pten/kernels/cpu/copy_kernel.cc @@ -37,7 +37,7 @@ void Copy(const Context& dev_ctx, << src_place; dst->Resize(src.dims()); - auto* dst_ptr = dst->mutable_data(src_place); + auto* dst_ptr = dev_ctx.Alloc(dst); if (src_ptr == dst_ptr) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc index e6ffd3b5000b3f8152d6d2f9840b5379408022e7..5cef8d0bdd56d08731d617f0bd9c732fe1688af5 100644 --- a/paddle/pten/kernels/cpu/dot_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_kernel.cc @@ -29,7 +29,7 @@ void DotKernel(const Context& dev_ctx, DenseTensor* out) { auto const *x_ptr = x.data(), *x_ptr_ = &x_ptr[0]; auto const *y_ptr = y.data(), *y_ptr_ = &y_ptr[0]; - auto* z = out->mutable_data(dev_ctx.GetPlace()); + T* z = dev_ctx.template Alloc(out); // Loop over the total N elements of both operands while sum-reducing every // B pairs along the way where B is the dimension of the least ordered axis diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h index 179a1881189222e18f2dde14c35c14caadc831f4..2d717414d70f5463fc12d6ec64774351d36bcc7e 100644 --- a/paddle/pten/kernels/cpu/elementwise.h +++ b/paddle/pten/kernels/cpu/elementwise.h @@ -45,10 +45,8 @@ struct SameDimsAddFunctor< const DenseTensor& y, DenseTensor* z) { auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VADD(x.numel(), - x.data(), - y.data(), - z->mutable_data(dev_ctx.GetPlace())); + blas.VADD( + x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); } }; @@ -61,7 +59,7 @@ struct SameDimsAddFunctor< const DenseTensor& x, const DenseTensor& y, DenseTensor* z) { - z->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(z); auto eigen_x = pten::EigenVector::Flatten(x); auto eigen_y = pten::EigenVector::Flatten(y); auto eigen_z = pten::EigenVector::Flatten(*z); @@ -89,10 +87,8 @@ struct SameDimsSubtractFunctor< const DenseTensor& y, DenseTensor* z) { auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VSUB(x.numel(), - x.data(), - y.data(), - z->mutable_data(dev_ctx.GetPlace())); + blas.VSUB( + x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); } }; @@ -147,10 +143,8 @@ struct SameDimsDivideFunctor< const DenseTensor& y, DenseTensor* z) { auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VDIV(x.numel(), - x.data(), - y.data(), - z->mutable_data(dev_ctx.GetPlace())); + blas.VDIV( + x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); } }; @@ -173,10 +167,8 @@ struct SameDimsMultiplyFunctor< const DenseTensor& y, DenseTensor* z) { auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VMUL(x.numel(), - x.data(), - y.data(), - z->mutable_data(dev_ctx.GetPlace())); 
+ blas.VMUL( + x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); } }; @@ -241,8 +233,8 @@ void CommonGradBroadcastCPU(const DenseTensor& x, const T* y_data = y.data(); const Tout* out_data = out.data(); const Tout* dout_data = dout.data(); - T* dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); - T* dy_data = dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()); + T* dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); + T* dy_data = dy == nullptr ? nullptr : ctx.Alloc(dy); if (dx_data != nullptr) { memset(dx_data, 0, dx->numel() * sizeof(T)); } @@ -292,7 +284,7 @@ void CommonForwardBroadcastCPU(const DenseTensor& x, PADDLE_ENFORCE_NOT_NULL(y_data, paddle::platform::errors::InvalidArgument( "The input Y should not be empty.")); - OutType* out_data = z->mutable_data(ctx.GetPlace()); + OutType* out_data = ctx.Alloc(z); const int out_size = std::accumulate( out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); @@ -373,7 +365,7 @@ void ElementwiseCompute(const CPUContext& dev_ctx, int axis, Functor func, DenseTensor* z) { - z->mutable_data(dev_ctx.GetPlace()); + dev_ctx.Alloc(z); auto x_dims = x.dims(); auto y_dims = y.dims(); bool is_xsize_larger = true; @@ -677,32 +669,30 @@ void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, return; } if (post == 1) { - ElemwiseGradBroadcast1CPU( - x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast1CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); } else { - ElemwiseGradBroadcast2CPU( - x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - post, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast2CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? 
nullptr : ctx.Alloc(dy)); } } diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index 6d76626605c5c7bd3ea39470e824c57cb2a6484d..d4987e7a3606987ab64449e1346c788431895788 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -37,7 +37,7 @@ namespace pten { const DenseTensor& y, \ int axis, \ DenseTensor* out) { \ - out->mutable_data(dev_ctx.GetPlace()); \ + dev_ctx.template Alloc(out); \ if (x.dims() == y.dims()) { \ SameDimsElementwiseCompute>()( \ dev_ctx, x, y, out); \ @@ -85,7 +85,7 @@ void DivideRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { // allocate memory for out - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); if (x.dims() == y.dims() && std::is_floating_point::value) { SameDimsElementwiseCompute>()( dev_ctx, x, y, out); diff --git a/paddle/pten/kernels/cpu/reduce.h b/paddle/pten/kernels/cpu/reduce.h index 8f84bd0515b516e25821f8fa84d6935aa6260032..2b0659ac2e35746634fe01e6ee5263aabf4806a6 100644 --- a/paddle/pten/kernels/cpu/reduce.h +++ b/paddle/pten/kernels/cpu/reduce.h @@ -119,7 +119,7 @@ void GetShuffledInput(const DeviceContext& dev_ctx, GetShuffledDim(input.dims(), &shuffled_dims, dims, &perm_axis); shuffled_input->ResizeAndAllocate(shuffled_dims); - shuffled_input->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(shuffled_input); pten::math::TransposeNormal trans; trans(dev_ctx, input, shuffled_input, perm_axis); @@ -158,7 +158,7 @@ void ReduceKernelImpl(const DeviceContext& dev_ctx, const std::vector& dims, bool keep_dim, bool reduce_all) { - output->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(output); if (reduce_all) { // Flatten and reduce 1-D tensor diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc index 774d3891b03726a940b6f31a4058e37f3c79277d..4f999ac4d17ecac4d9a9959b82a4681f1643c386 100644 --- a/paddle/pten/kernels/cpu/scale_kernel.cc +++ b/paddle/pten/kernels/cpu/scale_kernel.cc @@ -33,7 +33,7 @@ void ScaleKernel(const Context& dev_ctx, bool bias_after_scale, DenseTensor* out) { // calc - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); auto eigen_out = pten::EigenVector::Flatten(*out); auto eigen_x = pten::EigenVector::Flatten(x); auto& dev = *dev_ctx.eigen_device(); diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index 6ce4998287956c29140c8c3690661b2e92a6f450..ecb058d35b909bc9455b019e55ab8f2277fd587b 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -29,7 +29,7 @@ void EmptyKernel(const Context& dev_ctx, template void EmptyLikeKernel(const Context& dev_ctx, DenseTensor* out) { - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); } } // namespace pten diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 9ea27fd9c5b8d5f9b9a4d6fb0d6cb608d13f5984..3f1651eeb276f0828daed7bee213030bd80de72b 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -229,7 +229,7 @@ class TransformFunctor { const bool is_xsize_larger = true) : x_(x.data()), y_(y.data()), - z_(z->mutable_data(ctx.GetPlace())), + z_(ctx.template Alloc(z)), nx_(x.numel()), ctx_(ctx), func_(func), @@ -425,8 +425,8 @@ void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, dout.data(), dx_op, dy_op, - dx == nullptr ? 
nullptr : dx->mutable_data(dev_ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(dev_ctx.GetPlace())}); + dx == nullptr ? nullptr : dev_ctx.template Alloc(dx), + dy == nullptr ? nullptr : dev_ctx.template Alloc(dy)}); } inline void ElementwiseGradPreProcess(const DenseTensor &dout, @@ -631,7 +631,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx, ins_data[i] = ins[i]->data(); } for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = (*outs)[i]->mutable_data(ctx.GetPlace()); + outs_data[i] = ctx.Alloc((*outs)[i]); } #ifdef PADDLE_WITH_XPU2 int block_size = 64; diff --git a/paddle/pten/kernels/funcs/transpose.cc b/paddle/pten/kernels/funcs/transpose.cc index 13cfaedb33d38ee2bb6052ea622fc59b659f581a..7d4dc3c7ce8f00fece82e5a27af5347b5d5cfabf 100644 --- a/paddle/pten/kernels/funcs/transpose.cc +++ b/paddle/pten/kernels/funcs/transpose.cc @@ -36,7 +36,7 @@ struct TransposeNormal { auto in_stride = pten::framework::stride(in.dims()); auto out_stride = pten::framework::stride(out->dims()); const T* in_ptr = in.data(); - T* out_ptr = out->mutable_data(dev_ctx.GetPlace()); + T* out_ptr = dev_ctx.template Alloc(out); auto transpose_helper = [&](int64_t beg, int64_t end) { for (int64_t out_idx = beg; out_idx < end; ++out_idx) { diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu index 24d72ca3d81ce455bcaee2d9d82261707674fb2c..a7b7184487c962af14a9a92510bca14103bcf2f5 100644 --- a/paddle/pten/kernels/funcs/transpose.cu +++ b/paddle/pten/kernels/funcs/transpose.cu @@ -61,7 +61,7 @@ struct TransposeNormal { auto in_stride = pten::framework::stride(in.dims()); auto out_stride = pten::framework::stride(out->dims()); auto* in_ptr = in.data(); - auto* out_ptr = out->mutable_data(dev_ctx.GetPlace()); + T* out_ptr = dev_ctx.template Alloc(out); // copy in_stride, out_stride, axis to gpu device const paddle::platform::CUDAPlace& cuda_place = dev_ctx.GetPlace(); diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index 12f246c3238d067a49032f077d472609c570cb93..81d09ef164652f50cc004998990c3670e1eaed66 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -43,7 +43,7 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx, std::vector outputs; inputs.emplace_back(&x); outputs.emplace_back(out); - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.Alloc(out); pten::funcs::LaunchSameDimsElementwiseCudaKernel( diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu index 75aacc8d3d1179861526a68bc9a30cb27340adf8..24bd034fb15a0df3e19c60bfebb56f90c72da75a 100644 --- a/paddle/pten/kernels/gpu/dot_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_kernel.cu @@ -29,7 +29,7 @@ void DotKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, DenseTensor* out) { - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); if (1 == out->dims().size()) { auto eigen_out = pten::EigenScalar::From(*out); auto eigen_x = pten::EigenVector::Flatten(x); diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index 9a3ae7f12dfcd62a1a18154971fa99ab72c5561d..6f744212cd5b6ed8aafd9e48313517e82b27c27c 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -352,7 +352,7 @@ void LaunchKernel(const KPDevice &ctx, pten::framework::Array<_ptr_ OutT *, NumOuts> outs_data; for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = (*outs)[i]->mutable_data(ctx.GetPlace()); + outs_data[i] = 
ctx.Alloc((*outs)[i]); } for (int i = 0; i < Arity; i++) { @@ -1264,8 +1264,8 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, const T *y_data = y.data(); const Tout *out_data = out.data(); const Tout *dout_data = dout.data(); - T *dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); - T *dy_data = dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()); + T *dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); + T *dy_data = dy == nullptr ? nullptr : ctx.Alloc(dy); std::vector x_one_indexs; std::vector y_one_indexs; @@ -1923,34 +1923,32 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, return; } if (post == 1) { - ElemwiseGradBroadcast1CUDA( - ctx.stream(), - x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast1CUDA(ctx.stream(), + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); } else { - ElemwiseGradBroadcast2CUDA( - ctx.stream(), - x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - post, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast2CUDA(ctx.stream(), + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); } } diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 996d85d3f42a7996e481a4887a0d0f4fa2587893..37104c46a49427828f75e09c5b8aa544051df530 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -47,7 +47,7 @@ namespace pten { inputs.emplace_back(&x); \ inputs.emplace_back(&y); \ outputs.emplace_back(out); \ - out->mutable_data(dev_ctx.GetPlace()); \ + dev_ctx.template Alloc(out); \ LaunchElementwiseCudaKernel( \ dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ } diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index dd7c2f242ea4dbb473702c63716393566cd912c5..5aba001267a0a26bf587ff201a1c71462d094865 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -54,7 +54,7 @@ void ScaleKernel(const Context& dev_ctx, std::vector outputs; inputs.emplace_back(&x); outputs.emplace_back(out); - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); pten::funcs::LaunchSameDimsElementwiseCudaKernel( diff --git a/paddle/pten/kernels/impl/complex_kernel_impl.h b/paddle/pten/kernels/impl/complex_kernel_impl.h index aa878f7e9eb7f157ab193ff5129e5752aded67e6..7e4c4f0d66d4fc89634eb7bde9eb24e2743d4a7c 100644 --- a/paddle/pten/kernels/impl/complex_kernel_impl.h +++ b/paddle/pten/kernels/impl/complex_kernel_impl.h @@ -26,7 +26,7 @@ void ConjKernel(const Context& dev_ctx, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - auto* out_data = out->mutable_data(dev_ctx.GetPlace()); + auto* out_data = dev_ctx.template Alloc(out); paddle::platform::ForRange for_range(dev_ctx, numel); paddle::operators::math::ConjFunctor functor(x_data, numel, out_data); diff --git a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h 
b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h index d0c6cf6793e6d37e3aad4a3a601280a9f02d0013..d4ea9fc944527145269fdfd1a854aca1299a6018 100644 --- a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h @@ -73,7 +73,7 @@ struct DotGradFunction::From(*tensor_dout); if (tensor_dx) { - tensor_dx->mutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dx); auto y = EigenMatrix::From(*tensor_y); auto& dev = *ctx.eigen_device(); Eigen::DSizes size(1, tensor_dx->dims()[1]); @@ -85,7 +85,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dy); auto x = EigenMatrix::From(*tensor_x); auto& dev = *ctx.eigen_device(); Eigen::DSizes size(1, tensor_dy->dims()[1]); @@ -100,7 +100,7 @@ struct DotGradFunctiondata(); if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); + auto* data_dx = ctx.template Alloc(tensor_dx); const auto* data_y = tensor_y->data(); const DDim& dim = tensor_x->dims(); size_t N = static_cast(pten::framework::product(dim)); @@ -115,7 +115,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + auto* data_dy = ctx.template Alloc(tensor_dy); const auto* data_x = tensor_x->data(); const DDim& dim = tensor_y->dims(); size_t N = static_cast(pten::framework::product(dim)); @@ -164,7 +164,7 @@ struct DotGradFunction::From(*tensor_dout); if (tensor_dx) { - tensor_dx->mutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dx); auto y = EigenMatrix::From(*tensor_y); auto dx = EigenMatrix::From(*tensor_dx); auto& dev = *ctx.eigen_device(); @@ -173,7 +173,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dy); auto x = EigenMatrix::From(*tensor_x); auto dy = EigenMatrix::From(*tensor_dy); auto& dev = *ctx.eigen_device(); @@ -189,7 +189,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + auto* dx = ctx.template Alloc(tensor_dx); for (auto j = 0; j < N / B; ++j) { auto const ss = dz[j]; for (auto i = 0; i < B; ++i) *dx++ = *y++ * ss; @@ -197,7 +197,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + auto* dy = ctx.template Alloc(tensor_dy); for (auto j = 0; j < N / B; ++j) { auto const ss = dz[j]; for (auto i = 0; i < B; i++) *dy++ = *x++ * ss; @@ -272,7 +272,7 @@ struct DotDoubleGradFunctiondata(); if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); + auto* data_dx = ctx.template Alloc(tensor_dx); const auto* data_ddy = tensor_ddy->data(); const DDim& dim = tensor_dx->dims(); size_t N = static_cast(product(dim)); @@ -287,7 +287,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_dy = ctx.template Alloc(tensor_dy); const auto* data_ddx = tensor_ddx->data(); const DDim& dim = tensor_dy->dims(); size_t N = static_cast(product(dim)); @@ -302,7 +302,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_ddout = ctx.template Alloc(tensor_ddout); auto* data_x = tensor_x->data(); auto* data_y = tensor_y->data(); auto* data_ddx = tensor_ddx->data(); @@ -351,7 +351,7 @@ struct DotDoubleGradFunction::Flatten(*tensor_dout); if (tensor_dx) { - tensor_dx->mutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dx); auto ddy = EigenVector::Flatten(*tensor_ddy); Eigen::DSizes size(tensor_ddy->numel()); auto dx = EigenVector::Flatten(*tensor_dx); @@ -359,7 +359,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dy); auto ddx = EigenVector::Flatten(*tensor_ddx); Eigen::DSizes size(tensor_ddx->numel()); @@ -368,7 +368,7 @@ struct 
DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_ddout); auto x = EigenVector::Flatten(*tensor_x); auto y = EigenVector::Flatten(*tensor_y); auto ddx = EigenVector::Flatten(*tensor_ddx); @@ -381,7 +381,7 @@ struct DotDoubleGradFunctiondata(); if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); + auto* data_dx = ctx.template Alloc(tensor_dx); const auto* data_ddy = tensor_ddy->data(); const DDim& dim = tensor_dx->dims(); size_t N = static_cast(product(dim)); @@ -396,7 +396,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_dy = ctx.template Alloc(tensor_dy); const auto* data_ddx = tensor_ddx->data(); const DDim& dim = tensor_dy->dims(); size_t N = static_cast(product(dim)); @@ -411,7 +411,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_ddout = ctx.template Alloc(tensor_ddout); auto* data_x = tensor_x->data(); auto* data_y = tensor_y->data(); auto* data_ddx = tensor_ddx->data(); @@ -552,7 +552,7 @@ struct DotTripleGradFunctiondata(); if (out_tensor_d_x) { - auto* data_d_x = out_tensor_d_x->mutable_data(ctx.GetPlace()); + auto* data_d_x = ctx.template Alloc(out_tensor_d_x); const auto* data_ddy = in_tensor_ddy->data(); const DDim& dim = out_tensor_d_x->dims(); @@ -567,7 +567,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_y = ctx.template Alloc(out_tensor_d_y); const auto* data_ddx = in_tensor_ddx->data(); const DDim& dim = out_tensor_d_y->dims(); @@ -582,7 +582,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_dout = ctx.template Alloc(out_tensor_d_dout); auto* data_ddx = in_tensor_ddx->data(); auto* data_ddy = in_tensor_ddy->data(); auto* data_d_dx = in_tensor_d_dx->data(); @@ -613,7 +613,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_ddx = ctx.template Alloc(out_tensor_d_ddx); auto* data_dout = in_tensor_dout->data(); auto* data_d_dy = in_tensor_d_dy->data(); auto* data_y = in_tensor_y->data(); @@ -633,7 +633,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_ddy = ctx.template Alloc(out_tensor_d_ddy); auto* data_dout = in_tensor_dout->data(); auto* data_d_dx = in_tensor_d_dx->data(); auto* data_x = in_tensor_x->data(); @@ -678,7 +678,7 @@ struct DotTripleGradFunction::Flatten(*in_tensor_d_ddout); if (out_tensor_d_x) { - out_tensor_d_x->mutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_x); auto ddy = EigenVector::Flatten(*in_tensor_ddy); Eigen::DSizes size(in_tensor_ddy->numel()); auto d_x = EigenVector::Flatten(*out_tensor_d_x); @@ -686,7 +686,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_y); auto ddx = EigenVector::Flatten(*in_tensor_ddx); Eigen::DSizes size(in_tensor_ddx->numel()); @@ -695,7 +695,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_dout); auto ddx = EigenVector::Flatten(*in_tensor_ddx); auto ddy = EigenVector::Flatten(*in_tensor_ddy); auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); @@ -705,7 +705,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_ddx); auto dout = EigenVector::Flatten(*in_tensor_dout); auto y = EigenVector::Flatten(*in_tensor_y); auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); @@ -717,7 +717,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_ddy); auto dout = EigenVector::Flatten(*in_tensor_dout); auto x = 
EigenVector::Flatten(*in_tensor_x); auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); @@ -732,7 +732,7 @@ struct DotTripleGradFunctiondata(); if (out_tensor_d_x) { - auto* data_d_x = out_tensor_d_x->mutable_data(ctx.GetPlace()); + auto* data_d_x = ctx.template Alloc(out_tensor_d_x); const auto* data_ddy = in_tensor_ddy->data(); const DDim& dim = out_tensor_d_x->dims(); @@ -747,7 +747,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_y = ctx.template Alloc(out_tensor_d_y); const auto* data_ddx = in_tensor_ddx->data(); const DDim& dim = out_tensor_d_y->dims(); @@ -762,7 +762,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_dout = ctx.template Alloc(out_tensor_d_dout); auto* data_ddx = in_tensor_ddx->data(); auto* data_ddy = in_tensor_ddy->data(); auto* data_d_dx = in_tensor_d_dx->data(); @@ -790,7 +790,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_ddx = ctx.template Alloc(out_tensor_d_ddx); auto* data_dout = in_tensor_dout->data(); auto* data_d_dy = in_tensor_d_dy->data(); auto* data_y = in_tensor_y->data(); @@ -809,7 +809,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_ddy = ctx.template Alloc(out_tensor_d_ddy); auto* data_dout = in_tensor_dout->data(); auto* data_d_dx = in_tensor_d_dx->data(); auto* data_x = in_tensor_x->data(); @@ -838,10 +838,10 @@ void DotGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* dy) { if (dx) { - dx->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(dx); } if (dy) { - dy->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(dy); } DotGradFunction()(dev_ctx, &x, &y, &dout, dx, dy); } @@ -857,13 +857,13 @@ void DotDoubleGradKernel(const Context& dev_ctx, DenseTensor* dy, DenseTensor* ddout) { if (dx) { - dx->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(dx); } if (dy) { - dy->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(dy); } if (ddout) { - ddout->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(ddout); } DotDoubleGradFunction()( dev_ctx, &x, &y, &dout, ddx, ddy, dx, dy, ddout); @@ -885,19 +885,19 @@ void DotTripleGradKernel(const Context& dev_ctx, DenseTensor* d_ddy, DenseTensor* d_dout) { if (d_x) { - d_x->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_x); } if (d_y) { - d_y->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_y); } if (d_ddx) { - d_ddx->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_ddx); } if (d_ddy) { - d_ddy->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_ddy); } if (d_dout) { - d_dout->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_dout); } DotTripleGradFunction()(dev_ctx, diff --git a/paddle/pten/kernels/impl/full_kernel_impl.h b/paddle/pten/kernels/impl/full_kernel_impl.h index 4fee23e175c9ec28a492a3ff2cf37ba5c1234b92..4fbe9f34e5b4d9e683db4f623fe6195f21469f8d 100644 --- a/paddle/pten/kernels/impl/full_kernel_impl.h +++ b/paddle/pten/kernels/impl/full_kernel_impl.h @@ -26,7 +26,7 @@ namespace pten { template void FullValue(const Context& dev_ctx, DenseTensor* tensor, VType val) { - tensor->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(tensor); auto t = pten::EigenVector::Flatten(*tensor); t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(val)); } diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h index fbcb073150cc5ecea4252d755c6f85e677bdf120..87785a2b4778a8cbb3d54dd308ca812d158eb042 100644 --- 
a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h @@ -105,7 +105,7 @@ void MatMul(const Context& dev_ctx, bool trans_b, DenseTensor* out, bool flag = false) { - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); auto blas = paddle::operators::math::GetBlas(dev_ctx); auto mat_dim_a = paddle::operators::math::CreateMatrixDescriptor(a.dims(), 0, trans_a); @@ -123,7 +123,7 @@ void MatMul(const Context& dev_ctx, b.data(), mat_dim_b, static_cast(1), - out->data(), + dev_ctx.template Alloc(out), static_cast(flag)); } @@ -242,8 +242,8 @@ void MatmulGradKernel(const Context& dev_ctx, // Case1 : x's or y's dim = 1 if (x_ndim == 1 && y_ndim == 1) { - if (dx) dx->mutable_data(dev_ctx.GetPlace()); - if (dy) dy->mutable_data(dev_ctx.GetPlace()); + if (dx) dev_ctx.template Alloc(dx); + if (dy) dev_ctx.template Alloc(dy); if (out_grad.numel() == 1) { DotGradFunction()(dev_ctx, &x, &y, &out_grad, dx, dy); return; diff --git a/paddle/pten/kernels/impl/matmul_kernel_impl.h b/paddle/pten/kernels/impl/matmul_kernel_impl.h index e59a54c703ab543c5d27db5291b0b2cb9c6ee79b..858807a1d4d6496d5e3091aa71f5b2dada03b92e 100644 --- a/paddle/pten/kernels/impl/matmul_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_kernel_impl.h @@ -118,7 +118,7 @@ void MatMulFunction(const Context& dev_ctx, N)); VLOG(3) << "MatMul's case 1"; Out->Resize({1}); - Out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(Out); blas.GEMM(CblasNoTrans, CblasTrans, 1, @@ -128,7 +128,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); return; } @@ -165,7 +165,7 @@ void MatMulFunction(const Context& dev_ctx, out_dims.back() = y_dims.back(); } Out->ResizeAndAllocate(pten::framework::make_ddim(out_dims)); - Out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(Out); if (trans_y) { const int M = Y.numel() / N; VLOG(3) << "MatMul's case 2"; @@ -176,7 +176,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { const int M = y_dims[y_ndim - 1]; const int batch_size = Y.numel() / (M * N); @@ -189,7 +189,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 4"; blas.BatchedGEMM(CblasTrans, @@ -201,7 +201,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), batch_size, M * N, 0); @@ -243,7 +243,7 @@ void MatMulFunction(const Context& dev_ctx, std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); } Out->ResizeAndAllocate(pten::framework::make_ddim(out_dims)); - Out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(Out); if (trans_x) { const int M = x_dims[x_ndim - 1]; @@ -257,7 +257,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 6"; blas.BatchedGEMM(CblasTrans, @@ -269,7 +269,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), batch_size, M * N, 0); @@ -284,7 +284,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } return; } @@ -331,7 +331,7 @@ void MatMulFunction(const Context& dev_ctx, out_broadcast_dims[ndim - 1] = N; 
Out->ResizeAndAllocate(pten::framework::make_ddim(out_broadcast_dims)); - Out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(Out); const int batch_dim = ndim - 2; // broadcast message @@ -367,7 +367,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else if (x_batch_size == 1) { if (M == 1 && trans_y) { VLOG(3) << "MatMul's case 9"; @@ -378,7 +378,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 10"; blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, @@ -390,7 +390,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), out_batch_size, 0, K * N); @@ -407,7 +407,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 12"; blas.BatchedGEMM(CblasTrans, @@ -419,7 +419,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), out_batch_size, M * K, 0); @@ -435,7 +435,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), out_batch_size, M * K, K * N); @@ -454,7 +454,7 @@ void MatMulFunction(const Context& dev_ctx, x_ptr[i] = x_data + x_index * M * K; y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = Out->data() + i * M * N; + out_ptr[i] = dev_ctx.template Alloc(Out) + i * M * N; IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); } VLOG(3) << "MatMul's case 14"; diff --git a/paddle/pten/kernels/impl/sign_kernel_impl.h b/paddle/pten/kernels/impl/sign_kernel_impl.h index 54c1464c9e0221d5cc17c0db29fd7c2ce5ebf0f1..87efacccc97c5449ae54f44925b283f0a4a20ba6 100644 --- a/paddle/pten/kernels/impl/sign_kernel_impl.h +++ b/paddle/pten/kernels/impl/sign_kernel_impl.h @@ -26,7 +26,7 @@ template void SignKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); auto eigen_out = pten::EigenVector::Flatten(*out); auto eigen_x = pten::EigenVector::Flatten(x); diff --git a/paddle/pten/kernels/reshape_kernel.cc b/paddle/pten/kernels/reshape_kernel.cc index 4b706e9e685b47af20dd23ab0855db6116623c46..a76dfb09a0ea4ad4f531c323e015c8cdccdd165d 100644 --- a/paddle/pten/kernels/reshape_kernel.cc +++ b/paddle/pten/kernels/reshape_kernel.cc @@ -32,7 +32,7 @@ void ReshapeKernel(const Context& dev_ctx, return; } out->set_meta(out_meta); - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.Alloc(out); pten::Copy(dev_ctx, x, false, out); out->Resize(out_meta.dims); out->ResetLoD(x.lod()); diff --git a/paddle/pten/kernels/xpu/copy_kernel.cc b/paddle/pten/kernels/xpu/copy_kernel.cc index 56b79061f75f680cfc82b54d18733769b50b07b3..56ad19f0cc3dde250d10614165b52c447fa2b744 100644 --- a/paddle/pten/kernels/xpu/copy_kernel.cc +++ b/paddle/pten/kernels/xpu/copy_kernel.cc @@ -30,7 +30,7 @@ void Copy(const Context& dev_ctx, bool blocking, DenseTensor* dst) { auto* src_ptr = src.data(); - auto* dst_ptr = dst->mutable_data(dev_ctx.GetPlace()); + auto* dst_ptr = dev_ctx.Alloc(dst); const auto& src_place = src.place(); const auto& dst_place = dst->place(); diff --git a/paddle/pten/tests/kernels/test_cast_dev_api.cc b/paddle/pten/tests/kernels/test_cast_dev_api.cc index 
c9d376b81a630c86a976f991d4edf693312f72ba..33d27ca5b1c9a91c577252e7f1596f4c163692d3 100644
--- a/paddle/pten/tests/kernels/test_cast_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_cast_dev_api.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/kernels/cast_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/core/dense_tensor.h"
@@ -48,6 +49,11 @@ TEST(DEV_API, cast) {
   }
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
+
   pten::DataType out_dtype = pten::DataType::FLOAT64;
   // 2. test API
   auto out = pten::Cast(dev_ctx, dense_x, out_dtype);
diff --git a/paddle/pten/tests/kernels/test_concat_dev_api.cc b/paddle/pten/tests/kernels/test_concat_dev_api.cc
index 6f9ea1b0d990ae9e4d789bc4c37fb104c730fe82..eb546e992e953840fc43daf0e40178f06abe108b 100644
--- a/paddle/pten/tests/kernels/test_concat_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_concat_dev_api.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/pten/kernels/concat_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -56,6 +57,10 @@ TEST(DEV_API, concat) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Concat(dev_ctx, inputs, 0);
   // 3. check result
diff --git a/paddle/pten/tests/kernels/test_conj_dev_api.cc b/paddle/pten/tests/kernels/test_conj_dev_api.cc
index 6714b57105bd24ca292e184d1ff90cf7d82e1b92..e43769dfb2b01116796e3d289f9d0d2dd9f772b0 100644
--- a/paddle/pten/tests/kernels/test_conj_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_conj_dev_api.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/kernels/complex_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -44,6 +45,10 @@ TEST(DEV_API, conj) {
   }
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   // 2. test API
   auto out = pten::Conj(dev_ctx, dense_x);
diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc
index 01dfa925d6c5a1a59f0ae28cd1b28127221ec950..29f68513fa77e71993ba37d5ca94ccda67b1dcd5 100644
--- a/paddle/pten/tests/kernels/test_copy_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/pten/core/kernel_registry.h"
 #include "paddle/pten/kernels/copy_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
@@ -57,6 +58,10 @@ TEST(DEV_API, copy) {
   std::cout << typeid(a).name() << std::endl;
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   pten::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get());
   // 3. check result
diff --git a/paddle/pten/tests/kernels/test_creation_dev_api.cc b/paddle/pten/tests/kernels/test_creation_dev_api.cc
index 17416d33473d07b227cad38f74bce9c47dd8d520..8b37c41d0b55b2b6bd6375301563fcb46790f783 100644
--- a/paddle/pten/tests/kernels/test_creation_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_creation_dev_api.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/pten/kernels/empty_kernel.h"
 #include "paddle/pten/kernels/full_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -32,6 +33,10 @@ using DDim = pten::framework::DDim;
 TEST(DEV_API, empty) {
   // 1. create input
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   // 2. test API
   auto out = pten::Empty(dev_ctx, {3, 2}, pten::DataType::INT32);
@@ -58,6 +63,10 @@ TEST(DEV_API, empty_like) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::EmptyLike(dev_ctx, dense_x);
   // 3. check result
@@ -74,6 +83,10 @@ TEST(DEV_API, full) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Full(dev_ctx, {3, 2}, val, pten::DataType::FLOAT32);
   // 3. check result
@@ -103,6 +116,10 @@ TEST(DEV_API, full_like) {
   float val = 1.0;
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   // 2. test API
   auto out = pten::FullLike(dev_ctx, dense_x, val);
diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc
index 27fecd3fcd9e38cdf48d96cc83f5d26705adc906..c1f7d6aaba39b91e0820c88850a223c6221f60d4 100644
--- a/paddle/pten/tests/kernels/test_dot_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/kernels/dot_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -57,6 +58,10 @@ TEST(DEV_API, dot) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Dot(dev_ctx, dense_x, dense_y);
   // 3. check result
diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
index b3948843ee86c56987233bd5238edc3611a0fe9e..9d4c86f02679db03da14e26e52d65e37fdad21ab 100644
--- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/kernels/math_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -59,6 +60,10 @@ TEST(DEV_API, add) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto dense_out = pten::Add(dev_ctx, dense_x, dense_y);
   // 3. check result
@@ -107,6 +112,10 @@ TEST(DEV_API, subtract) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto dense_out = pten::Subtract(dev_ctx, dense_x, dense_y);
   // 3. check result
@@ -155,6 +164,10 @@ TEST(DEV_API, divide) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto dense_out = pten::Divide(dev_ctx, dense_x, dense_y);
   // 3. check result
@@ -203,6 +216,10 @@ TEST(DEV_API, multiply) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto dense_out = pten::Multiply(dev_ctx, dense_x, dense_y);
   // 3. check result
diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
index fc463d1ff1e1cdaeee1641a2c88f621b0a12c4de..2ebf10916becc257a4352c454248a1e659a26309 100644
--- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/kernels/flatten_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -55,6 +56,10 @@ TEST(DEV_API, flatten) {
   }
   int start_axis = 1, stop_axis = 2;
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   // 2. test API
   auto out = pten::Flatten(dev_ctx, dense_x, start_axis, stop_axis);
diff --git a/paddle/pten/tests/kernels/test_matmul_dev_api.cc b/paddle/pten/tests/kernels/test_matmul_dev_api.cc
index 40419ecb3ad936d78eef9bfd7b0c6d0aff93d64c..87c91b10081b985d85d079f144c257185329715a 100644
--- a/paddle/pten/tests/kernels/test_matmul_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_matmul_dev_api.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/pten/kernels/matmul_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -54,6 +55,10 @@ TEST(DEV_API, dot) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = Matmul(dev_ctx, dense_x, dense_y, false, false);
   // 3. check result
diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc
index 786492d3a1b1bdf462fa82f76d919cbc4d47a623..3abf54d26af31ccd315eb63b231202fba83ea1fa 100644
--- a/paddle/pten/tests/kernels/test_mean_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/pten/kernels/math_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -47,6 +48,10 @@ TEST(DEV_API, mean) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Mean(dev_ctx, dense_x, dims, false);
   // 3. check result
diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc
index ac2bb60cf9fe6b97e7d8dbb8e9204aa2c08335f9..fe9b09c25557c30b8995f3d8ffd8be03e0fe2cdb 100644
--- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/pten/kernels/reshape_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -47,6 +48,10 @@ TEST(DEV_API, reshape) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Reshape(dev_ctx, dense_x, shape);
   // 3. check result
   std::vector expect_shape = {12, 3};
diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc
index abb592cde3ff4276f9b0dbce3afb9d912a2e0f9f..80f12950094b794485e577e1e6920e00b136b55a 100644
--- a/paddle/pten/tests/kernels/test_scale_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/pten/kernels/scale_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -47,6 +48,10 @@ TEST(DEV_API, scale) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale);
@@ -85,6 +90,10 @@ TEST(DEV_API, scale_host) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale);
diff --git a/paddle/pten/tests/kernels/test_sum_dev_api.cc b/paddle/pten/tests/kernels/test_sum_dev_api.cc
index 595f0b96920ae24b2daadeca8e749d0232627720..9b48d8908ff2306fa9466874d0edcde8864e9379 100644
--- a/paddle/pten/tests/kernels/test_sum_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_sum_dev_api.cc
@@ -17,10 +17,10 @@ limitations under the License. */
 #include "paddle/pten/kernels/math_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
-
 namespace pten {
 namespace tests {
@@ -46,6 +46,10 @@ TEST(DEV_API, sum) {
   std::vector axis = {0, 1};
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   // 2. test API
   auto out = pten::Sum(dev_ctx, dense_x, axis, pten::DataType::FLOAT32, false);
diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
index 48f95472c7ec71c4ed71bd80f2d5430b04636813..1e856a0fe900fca423333f2d859af40db49e8f24 100644
--- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
+++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import os
+os.environ['FLAGS_use_stream_safe_cuda_allocator'] = "true"
 import sys
 import unittest
 import paddle