Commit 15076c32 authored by Yu Yang

Add comments and polish code style

Parent b4f54d33
@@ -15,6 +15,7 @@
 #include <algorithm>
 #include <limits>
 #include <vector>
+#include "../memory/allocation/allocator.h"
 #include "paddle/fluid/framework/data_type.h"
 namespace paddle {
@@ -111,8 +112,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   dst->set_layout(src.layout());
   auto src_place = src.place();
   auto src_ptr = src.data<void>();
-  auto dst_ptr = dst->mutable_data(dst_place, src.type(),
-                                   memory::Allocator::kCommunication);
+  auto dst_ptr =
+      dst->mutable_data(dst_place, src.type(), memory::Allocator::kCrossDevice);
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
...
@@ -21,6 +21,11 @@ namespace allocation {
 ThinAlignedAllocator::ThinAlignedAllocator(
     std::shared_ptr<ManagedAllocator> underlyning_allocator)
     : underlying_allocator_(std::move(underlyning_allocator)) {}
+
+std::shared_ptr<Allocation> ThinAlignedAllocator::AllocateShared(
+    size_t size, Allocator::Attr attr) {
+  return std::shared_ptr<Allocation>(Allocate(size, attr).release());
+}
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
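
The `release()`-and-rewrap in `AllocateShared` above transfers ownership of the raw pointer into the `shared_ptr`. A minimal equivalent, relying only on the standard `std::shared_ptr` constructor that takes ownership from an rvalue `std::unique_ptr` (a sketch, not necessarily how the author would write it):

    // Equivalent conversion: std::shared_ptr can take ownership directly
    // from an rvalue std::unique_ptr, so no explicit release() is needed.
    std::shared_ptr<Allocation> ThinAlignedAllocator::AllocateShared(
        size_t size, Allocator::Attr attr) {
      return std::shared_ptr<Allocation>(Allocate(size, attr));
    }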
@@ -20,34 +20,66 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// The aligned allocation and allocator wrap a managed allocator and return
+// an aligned pointer.
+//
+// NOTE(yy): For speed, I just use a template parameter for the alignment;
+// it could be a private member if necessary.
+//
+// NOTE(yy): kAlignment must be 2^N; a `static_assert` should be added.
 template <size_t kAlignment>
 class AlignedAllocation : public Allocation {
  public:
   AlignedAllocation(std::unique_ptr<Allocation>&& underlying_allocation,
                     size_t size)
-      : Allocation(AlignedPtr(underlying_allocation->ptr()), size,
+      : Allocation(AlignedPtr(underlying_allocation->ptr()),
+                   size + kAlignment - Offset(underlying_allocation->ptr()),
                    underlying_allocation->place()),
         underlying_allocation_(std::move(underlying_allocation)) {}

  private:
   static void* AlignedPtr(void* ptr) {
-    auto ptr_addr = reinterpret_cast<uintptr_t>(ptr);
-    ptr_addr = (ptr_addr & ~(kAlignment - 1)) + kAlignment;
-    return reinterpret_cast<void*>(ptr_addr);
+    return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(ptr) +
+                                   Offset(ptr));
   }

+  // Offset from `ptr` to the aligned pointer; returns 0 if `ptr` is already
+  // aligned.
+  static size_t Offset(void* ptr) {
+    auto ptr_addr = reinterpret_cast<intptr_t>(ptr);
+    intptr_t aligned_addr = (ptr_addr & ~(kAlignment - 1));
+    intptr_t diff = aligned_addr - ptr_addr;
+    if (diff == 0) {
+      return 0;
+    } else {
+      return kAlignment + diff;
+    }
+  }
+
   std::unique_ptr<Allocation> underlying_allocation_;
 };

+// ThinAlignedAllocator is trivial and is used to keep the binary size small.
+//
+// NOTE(yy): This is a trick for the template class: the common code is
+// extracted into a `thin` base class, so multiple specializations of the
+// template do not grow the binary size too much.
+//
+// NOTE(yy): This could be over-design. If it harms readability of the code,
+// it can be removed later.
 class ThinAlignedAllocator : public ManagedAllocator {
  public:
   explicit ThinAlignedAllocator(
       std::shared_ptr<ManagedAllocator> underlyning_allocator);
+  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override;

  protected:
   std::shared_ptr<ManagedAllocator> underlying_allocator_;
 };

+// An AlignedAllocator allocates `size + kAlignment` bytes and adjusts the
+// returned pointer by the needed offset.
 template <size_t kAlignment>
 class AlignedAllocator : public ThinAlignedAllocator {
  public:
@@ -58,9 +90,6 @@ class AlignedAllocator : public ThinAlignedAllocator {
     return std::unique_ptr<Allocation>(
         new AlignedAllocation<kAlignment>(std::move(raw_allocation), size));
   }
-  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    return std::shared_ptr<Allocation>(Allocate(size, attr).release());
-  }
 };
 }  // namespace allocation
...
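
To make the `Offset` arithmetic above concrete, here is a small standalone C++ sketch. The `kAlignment = 64` constant and the demo buffer are assumptions for illustration; it uses the equivalent `kAlignment - remainder` form, which the masked round-down in `AlignedAllocation::Offset` also computes:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    constexpr size_t kAlignment = 64;  // must be a power of two

    // Offset from ptr to the next kAlignment boundary; 0 if already aligned.
    size_t Offset(void* ptr) {
      auto addr = reinterpret_cast<uintptr_t>(ptr);
      uintptr_t remainder = addr & (kAlignment - 1);
      return remainder == 0 ? 0 : kAlignment - remainder;
    }

    int main() {
      alignas(kAlignment) char buffer[2 * kAlignment];
      void* raw = buffer + 3;  // deliberately misaligned by 3 bytes
      void* aligned = static_cast<char*>(raw) + Offset(raw);
      // The offset is at most kAlignment - 1, so allocating
      // size + kAlignment bytes always leaves `size` usable bytes after
      // alignment; this is why the allocation reports its size as
      // size + kAlignment - offset.
      assert(reinterpret_cast<uintptr_t>(aligned) % kAlignment == 0);
      printf("offset = %zu\n", Offset(raw));  // prints 61 for this buffer
      return 0;
    }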
@@ -18,6 +18,9 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "unsupported/Eigen/CXX11/Tensor"
+
+// NOTE(yy): this unittest is not important. It is just used for debugging
+// and can be removed later.
 struct FillZero {
  public:
   float* ptr_;
...
@@ -12,6 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <utility>
+
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 #pragma once
 #include <memory>
 #include <string>
@@ -21,15 +37,22 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// Exception thrown when `Alloc`/`AllocShared` fails.
 class BadAlloc : public std::exception {
  public:
-  explicit BadAlloc(const std::string& msg) : msg_(msg) {}
+  explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
   const char* what() const noexcept override;

  private:
   std::string msg_;
 };

+// Allocation is the object holding the actual pointer. Use
+// `Allocation::ptr()` to get the allocated pointer.
+//
+// NOTE: this is the base class of all allocations. Each allocator can use
+// its own allocation object.
+// NOTE: `Allocation::ptr()` could be nullptr if the allocation size is 0.
 class Allocation {
  public:
   Allocation(void* ptr, size_t size, platform::Place place)
@@ -38,8 +61,22 @@ class Allocation {
   Allocation(const Allocation& o) = delete;
   Allocation& operator=(const Allocation& o) = delete;

+  // Returns the held pointer.
+  // NOTE: For performance, it is better not to make this method virtual. If
+  // we want to implement `defragmentation` later, we might need to make the
+  // `ptr_` field protected and add a virtual method such as
+  // `defragmentation` to change `ptr_`.
   void* ptr() const { return ptr_; }

+  // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
+  // last valid element.
+  //
+  // NOTE: Some allocators might allocate more memory than requested, so the
+  // size could be larger than the request. For example, the
+  // AlignedAllocator always allocates size + kAlignment bytes. The raw
+  // pointer might not be aligned, so an offset might be added to it. The
+  // size of such an allocation will be `size + kAlignment - offset`.
   size_t size() const { return size_; }

   const platform::Place& place() const { return place_; }
@@ -52,22 +89,51 @@ class Allocation {
   platform::Place place_;
 };

+// Base interface class of memory Allocator.
+// To allocate memory, an allocator needs two parameters:
+//   1. the size in bytes;
+//   2. the attribute of the memory.
+// NOTE: the attribute of the memory might be ignored if the allocator does
+// not care about it.
 class Allocator {
  public:
   enum Attr {
-    kDefault = 0,
-    kTiny = 1,
-    kFixedHuge = 2,
-    kFluxHuge = 3,
-    kTmp = 4,
-    kCommunication = 5,
-    NumOfAttrs = 6
+    kDefault = 0,  // Default attribute. Uses the fastest or most stable
+                   // allocation algorithm.

+    kFixedHuge = 1,  // The allocation may not be freed until the program
+                     // ends, e.g., `Parameters` and `Momentum`.

+    kFluxHuge = 2,  // The allocation may be created and freed frequently
+                    // and is considerably huge, like `activations` and
+                    // gradients.

+    kScratchpad = 3,  // `Scratchpad` memory is allocated and freed very
+                      // soon, usually within an operator or as aux memory,
+                      // like a CUDNN workspace or the AUX memory in batch
+                      // norm.
+                      //
+                      // https://en.wikipedia.org/wiki/Scratchpad_memory

+    kCrossDevice = 4,  // Memory used for cross-device copy/communication.
+                       // For example:
+                       //   1. it can use `pinned` memory for CPU-GPU
+                       //      communication;
+                       //   2. it can use `registered` memory for RDMA
+                       //      communication.

+    NumOfAttrs = 5  // The number of all attributes. Used internally.
   };

   virtual ~Allocator();

+  // Allocate an allocation. Note the returned allocation might need to be
+  // freed manually if the Allocator is an `UnmanagedAllocator`.
   virtual std::unique_ptr<Allocation> Allocate(
       size_t size, Allocator::Attr attr = kDefault) = 0;

+  // True if `Allocate` is thread safe.
   virtual bool IsAllocThreadSafe() const;
 };
@@ -82,7 +148,8 @@ class UnmanagedAllocator : public Allocator {
   }
 };

-// The allocation will be managed by smart pointers
+// The allocation will be managed by smart pointers, i.e., users do not need
+// to free the allocation manually.
 class ManagedAllocator : public Allocator {
  public:
   virtual std::shared_ptr<Allocation> AllocateShared(
...
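
A hedged sketch of how these attributes are meant to be chosen at call sites, modeled on the call sites touched in this commit; `place` and the `*_bytes` sizes are illustrative assumptions:

    // Short-lived operator workspace: allocated and freed within one call.
    auto workspace =
        memory::Alloc(place, workspace_bytes, memory::Allocator::kScratchpad);
    // Long-lived weights: may not be freed until the program ends.
    auto weights =
        memory::Alloc(place, weight_bytes, memory::Allocator::kFixedHuge);
    // Staging buffer for CPU-GPU copies: may be served by pinned memory.
    auto staging =
        memory::Alloc(place, buffer_bytes, memory::Allocator::kCrossDevice);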
@@ -46,7 +46,7 @@ class CPUManagedAllocator : public ManagedAllocator {
             std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}

   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
-    if (attr == kCommunication) {
+    if (attr == kCrossDevice) {
       return communication_allocator_->Allocate(size, attr);
     } else {
       return normal_allocator_->Allocate(size, attr);
@@ -54,7 +54,7 @@ class CPUManagedAllocator : public ManagedAllocator {
   }

   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    if (attr == kCommunication) {
+    if (attr == kCrossDevice) {
       return communication_allocator_->AllocateShared(size, attr);
     } else {
       return normal_allocator_->AllocateShared(size, attr);
...
@@ -24,6 +24,10 @@ namespace allocation {
 // Allocator Facade is the interface exposed to other modules.
 // All the configuration or dirty code under development should
 // be hidden behind this facade.
+//
+// NOTE(yy): This class is a singleton.
+// NOTE(yy): To create a stable ABI and speed up compilation, we use the
+// Pimpl idiom here.
 class AllocatorFacadePrivate;
 class AllocatorFacade {
  public:
@@ -33,13 +37,16 @@ class AllocatorFacade {
   static AllocatorFacade& Instance();

+  // Allocate a shared allocation.
   std::shared_ptr<Allocation> AllocShared(
       const platform::Place& place, size_t size,
       Allocator::Attr attr = Allocator::kDefault);

+  // Allocate a unique allocation.
   std::unique_ptr<Allocation> Alloc(const platform::Place& place, size_t size,
                                     Allocator::Attr attr = Allocator::kDefault);

+  // TODO(yy): Allocate a Copy-On-Write allocation?
  private:
   AllocatorFacade();
   AllocatorFacadePrivate* m_;
...
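
Given the declarations above, client code would look roughly like this (a sketch assuming the usual `paddle::memory::allocation` namespace is opened; the place and sizes are illustrative):

    // The facade is a process-wide singleton.
    auto& facade = AllocatorFacade::Instance();
    // Unique ownership: freed when `buf` goes out of scope.
    std::unique_ptr<Allocation> buf = facade.Alloc(platform::CPUPlace(), 1024);
    // Shared ownership: freed when the last holder releases it.
    std::shared_ptr<Allocation> shared =
        facade.AllocShared(platform::CPUPlace(), 1024, Allocator::kCrossDevice);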
@@ -24,12 +24,27 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// The AutoIncrementAllocator manages many underlying allocators. If none of
+// them can allocate the requested memory, a new allocator will be created
+// and its `Allocate` method invoked.
+//
+// NOTE(yy): The AutoIncrementAllocator prefers to allocate memory from the
+// most recently successful allocator.
+//
+// NOTE(yy): We may need to release an underlying allocator if it allocates
+// nothing. However, that is generally not useful, since it would make
+// performance unpredictable.
+//
+// NOTE(yy): This allocator only locks while creating a new underlying
+// allocator. Allocation requests from many threads may be dispatched to the
+// same underlying allocator, so the underlying allocator must be thread
+// safe.
 class AutoIncrementAllocator : public ManagedAllocator {
  public:
+  // Creator is the method to create a ManagedAllocator.
   using AllocatorCreator = std::function<std::shared_ptr<ManagedAllocator>()>;

-  template <typename Creator>
-  explicit AutoIncrementAllocator(Creator&& creator)
+  explicit AutoIncrementAllocator(AllocatorCreator&& creator)
       : creator_(std::move(creator)), prev_success_allocator_{0} {}

   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override;
   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override;
@@ -65,6 +80,11 @@ class AutoIncrementAllocator : public ManagedAllocator {
     std::lock_guard<std::mutex> guard(mtx_);
     underlying_allocators_.emplace_back(creator_());
     prev_success_allocator_ = underlying_allocators_.size() - 1;
+    PADDLE_ENFORCE(
+        underlying_allocators_[prev_success_allocator_]->IsAllocThreadSafe(),
+        "the underlying allocator must be thread safe. This is a program "
+        "bug.");
     return callback(*underlying_allocators_[prev_success_allocator_]);
   }
 }
...
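
A sketch of constructing this allocator; `MakeChunkAllocator` is a hypothetical factory standing in for whatever builds the next thread-safe underlying allocator:

    AutoIncrementAllocator allocator([]() -> std::shared_ptr<ManagedAllocator> {
      // Hypothetical factory: each call builds one more underlying
      // allocator. It must be thread safe, or the PADDLE_ENFORCE above fires.
      return MakeChunkAllocator(/*capacity=*/size_t(1) << 20);
    });
    // Requests first go to the previously successful allocator; only when
    // every existing allocator fails is the creator invoked under the lock.
    auto allocation = allocator.Allocate(4096, Allocator::kDefault);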
@@ -22,6 +22,22 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// A composite allocator that dispatches allocation requests according to
+// registered conditions.
+//
+// For example:
+//
+//   auto* cond_allocator = new ConditionalAllocator();
+//   cond_allocator->AddAllocator([](size_t size, Attr attr) {
+//     // if size > 10
+//     return size > 10;
+//   }, allocator_a).AddAllocator([](size_t size, Attr attr) {
+//     // elif attr is kDefault
+//     return attr == kDefault;
+//   }, allocator_b).AddAllocator([](size_t size, Attr attr) {
+//     // else
+//     return true;
+//   }, allocator_c);
 class ConditionalAllocator : public ManagedAllocator {
  public:
   ConditionalAllocator() = default;
...
@@ -18,7 +18,13 @@
 namespace paddle {
 namespace memory {
 namespace allocation {

+// CPU system allocator and allocation.
+//
+// NOTE(yy): Should we just use `malloc` here, since there is an
+// aligned_allocator?
+//
+// NOTE(yy): There is no need to use `BestFitAllocator` on CPU. We can
+// import an open-source allocator into Paddle.
 class CPUAllocation : public Allocation {
  public:
   CPUAllocation(void* ptr, size_t size)
...
@@ -20,6 +20,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// CUDA system allocator and allocation.
 // Just a flag type.
 class CUDAAllocation : public Allocation {
  public:
...
@@ -20,6 +20,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// An allocator that makes an underlying allocator thread safe.
 class LockedAllocator : public UnmanagedAllocator {
  public:
   explicit LockedAllocator(std::unique_ptr<Allocator>&& underlying_allocator);
...
@@ -20,6 +20,11 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// An allocator that wraps an UnmanagedAllocator and makes its allocations
+// managed by C++ smart pointers.
+//
+// NOTE: if the NaiveManagedAllocator is destroyed before its
+// NaiveManagedAllocations, those allocations will never be released.
 class NaiveManagedAllocator;
 class NaiveManagedAllocation : public Allocation {
  public:
...
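
The NOTE above warns that the allocator must outlive its allocations. A standard mitigation, sketched here purely as a hypothetical variant (the diff does not show whether NaiveManagedAllocation does this), is for each allocation to hold a `shared_ptr` back to its allocator:

    // Hypothetical keep-alive variant: the allocation shares ownership of
    // its allocator, so the allocator cannot be destroyed while any
    // allocation is still outstanding.
    class KeepAliveAllocation : public Allocation {
     public:
      KeepAliveAllocation(void* ptr, size_t size, platform::Place place,
                          std::shared_ptr<Allocator> allocator)
          : Allocation(ptr, size, place), allocator_(std::move(allocator)) {}

     private:
      std::shared_ptr<Allocator> allocator_;  // released after the allocation
    };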
@@ -23,7 +23,7 @@ namespace allocation {
 std::unique_ptr<Allocation> CPUPinnedAllocator::Allocate(size_t size,
                                                          Allocator::Attr attr) {
   PADDLE_ENFORCE_EQ(
-      attr, kCommunication,
+      attr, kCrossDevice,
       "CPUPinnedAllocator should be used for Cross-Device Communication");
   void* ptr;
...
@@ -19,6 +19,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// An allocator that uses `cudaMallocHost`.
 class CPUPinnedAllocation : public Allocation {
  public:
   CPUPinnedAllocation(void* ptr, size_t size)
...
@@ -22,6 +22,9 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// This allocator handles requests whose size is zero. It always returns an
+// allocation even when the requested size is zero; however, the
+// allocation's ptr() is nullptr.
 class ZeroSizeAllocation : public Allocation {
  public:
   explicit ZeroSizeAllocation(const platform::Place& p)
...
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <paddle/fluid/memory/allocation/allocator.h>
 #include <stdio.h>
 #include <string>
 #include <vector>
@@ -70,7 +71,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
   // Allocate temporary storage
   auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
   auto d_temp_storage =
-      memory::Alloc(place, temp_storage_bytes, memory::Allocator::kTmp);
+      memory::Alloc(place, temp_storage_bytes, memory::Allocator::kScratchpad);

   // Run sorting operation
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
...
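
For context, the `kScratchpad` allocation above is the middle step of CUB's usual two-phase pattern: a first call with a null temp-storage pointer only reports the required bytes, then the scratch buffer is allocated and the sort runs for real. A simplified sketch (the key/index pointers and `num` are assumptions; stream plumbing is omitted):

    size_t temp_storage_bytes = 0;
    // Phase 1: null temp storage => CUB only computes the needed bytes.
    cub::DeviceRadixSort::SortPairsDescending<T, int>(
        nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
    // Phase 2: grab a short-lived scratchpad and run the actual sort.
    auto d_temp_storage =
        memory::Alloc(place, temp_storage_bytes, memory::Allocator::kScratchpad);
    cub::DeviceRadixSort::SortPairsDescending<T, int>(
        d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
        idx_out, num);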
@@ -112,8 +112,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   }

   void* allocate(size_t num_bytes) const override {
-    auto buf =
-        paddle::memory::Alloc(place_, num_bytes, memory::Allocator::kTiny);
+    auto buf = paddle::memory::Alloc(place_, num_bytes,
+                                     memory::Allocator::kScratchpad);
     void* retv = buf->ptr();
     allocations_[buf->ptr()] = std::move(buf);
     return retv;
...
@@ -64,7 +64,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
       auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
       auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
           tensor.dims(), platform::CPUPlace(),
-          memory::Allocator::kCommunication));
+          memory::Allocator::kCrossDevice));
       paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
                                       sizeof(CUR_TYPE) * tensor.numel(),
...