Commit 31270e58 authored by Yu Yang

Add communication attr

Parent 8e3fdc6e
......@@ -32,6 +32,7 @@ size_t Tensor::memory_size() const {
}
void* Tensor::mutable_data(platform::Place place, std::type_index type,
memory::Allocator::Attr attr,
size_t requested_size) {
type_ = type;
PADDLE_ENFORCE_GE(numel(), 0,
......@@ -46,17 +47,18 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
/* some versions of boost::variant don't have operator!= */
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) {
holder_ = memory::AllocShared(place, size);
holder_ = memory::AllocShared(place, size, attr);
offset_ = 0;
}
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
void* Tensor::mutable_data(platform::Place place, memory::Allocator::Attr attr,
size_t requested_size) {
PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing.");
return mutable_data(place, type_, requested_size);
return mutable_data(place, type_, attr, requested_size);
}
Tensor& Tensor::ShareDataWith(const Tensor& src) {
......
......@@ -84,12 +84,17 @@ class Tensor {
* @note If not exist, then allocation.
*/
template <typename T>
T* mutable_data(platform::Place place, size_t requested_size = 0);
T* mutable_data(platform::Place place,
memory::Allocator::Attr attr = memory::Allocator::kDefault,
size_t requested_size = 0);
void* mutable_data(platform::Place place, std::type_index type,
memory::Allocator::Attr attr = memory::Allocator::kDefault,
size_t requested_size = 0);
void* mutable_data(platform::Place place, size_t requested_size = 0);
void* mutable_data(platform::Place place,
memory::Allocator::Attr attr = memory::Allocator::kDefault,
size_t requested_size = 0);
/**
* @brief Return a pointer to mutable memory block.
......@@ -101,7 +106,9 @@ class Tensor {
* @note If not exist, then allocation.
*/
template <typename T>
T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0);
T* mutable_data(DDim dims, platform::Place place,
memory::Allocator::Attr attr = memory::Allocator::kDefault,
size_t requested_size = 0);
/*! Return the dimensions of the memory block. */
const DDim& dims() const;
......
......@@ -47,16 +47,20 @@ inline T* Tensor::data() {
template <typename T>
inline T* Tensor::mutable_data(DDim dims, platform::Place place,
memory::Allocator::Attr attr,
size_t requested_size) {
static_assert(std::is_pod<T>::value, "T must be POD");
Resize(dims);
return mutable_data<T>(place, requested_size);
return mutable_data<T>(place, attr, requested_size);
}
template <typename T>
inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) {
inline T* Tensor::mutable_data(platform::Place place,
memory::Allocator::Attr attr,
size_t requested_size) {
static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T*>(mutable_data(place, typeid(T), requested_size));
return reinterpret_cast<T*>(
mutable_data(place, typeid(T), attr, requested_size));
}
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
......
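For reference, a minimal sketch of how a caller can request communication (pinned) memory through the new parameter; the function name and the 2x3 shape are illustrative only, and existing call sites compile unchanged because attr defaults to kDefault:

    #include "paddle/fluid/framework/ddim.h"
    #include "paddle/fluid/framework/tensor.h"

    void PinnedStagingExample() {
      paddle::framework::Tensor staging;
      staging.Resize(paddle::framework::make_ddim({2, 3}));
      // New attr argument; omitting it (kDefault) keeps the pre-commit behaviour.
      float* pinned = staging.mutable_data<float>(
          paddle::platform::CPUPlace(),
          paddle::memory::Allocator::kCommunication);
      (void)pinned;
    }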
......@@ -25,9 +25,9 @@ endif()
cc_library(naive_managed_allocator SRCS naive_managed_allocator.cc DEPS allocator)
cc_test(naive_managed_allocator_test SRCS naive_managed_allocator_test.cc DEPS naive_managed_allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
if (WITH_GPU)
set(AllocatorFacadeDeps gpu_info cuda_allocator)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator)
else ()
set(AllocatorFacadeDeps)
endif()
......
......@@ -60,7 +60,8 @@ class Allocator {
kFixedHuge = 2,
kFluxHuge = 3,
kTmp = 4,
NumOfAttrs = 5
kCommunication = 5,
NumOfAttrs = 6
};
virtual ~Allocator();
......
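kCommunication is appended just before NumOfAttrs so the attribute count stays in sync. Below is a minimal sketch of passing the new attribute through memory::AllocShared, whose three-argument call shape appears in tensor.cc above; the include path and the 1 MB size are assumptions for illustration:

    #include "paddle/fluid/memory/malloc.h"  // assumed header; tensor.cc reaches
                                             // AllocShared through its own includes
    #include "paddle/fluid/platform/place.h"

    void AllocPinnedBlockExample() {
      // kCommunication asks for memory suitable for cross-device transfer
      // (pinned host memory when the place is CPUPlace).
      auto holder = paddle::memory::AllocShared(
          paddle::platform::CPUPlace(), /*size=*/1 << 20,
          paddle::memory::Allocator::kCommunication);
      (void)holder;
    }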
......@@ -21,6 +21,7 @@
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
......@@ -32,6 +33,35 @@ namespace paddle {
namespace memory {
namespace allocation {
class CPUManagedAllocator : public ManagedAllocator {
public:
CPUManagedAllocator()
: normal_allocator_(NaiveManagedAllocator::Create(
std::unique_ptr<Allocator>(new CPUAllocator()))),
communication_allocator_(NaiveManagedAllocator::Create(
std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}
std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
if (attr == kCommunication) {
return communication_allocator_->Allocate(size, attr);
} else {
return normal_allocator_->Allocate(size, attr);
}
}
std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
if (attr == kCommunication) {
return communication_allocator_->AllocateShared(size, attr);
} else {
return normal_allocator_->AllocateShared(size, attr);
}
}
private:
std::shared_ptr<ManagedAllocator> normal_allocator_;
std::shared_ptr<ManagedAllocator> communication_allocator_;
};
class AllocatorFacadePrivate {
public:
std::map<platform::Place, std::shared_ptr<ManagedAllocator>> allocators_;
......@@ -52,10 +82,7 @@ class AllocatorFacadePrivate {
private:
void InitCPUAllocator() {
auto all = NaiveManagedAllocator::Create(
std::unique_ptr<Allocator>(new CPUAllocator()));
allocators_[platform::CPUPlace()] = all;
allocators_[platform::CPUPlace()] = std::make_shared<CPUManagedAllocator>();
}
void InitCUDAAllocator() {
......
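With this change the facade hands every CPUPlace request to CPUManagedAllocator, which routes on the attribute: kCommunication goes to the pinned allocator, everything else to the plain CPU allocator. A hedged sketch of exercising that routing, assuming the AllocatorFacade singleton declared in allocator_facade.h exposes AllocShared with the signature used above:

    #include "paddle/fluid/memory/allocation/allocator_facade.h"
    #include "paddle/fluid/platform/place.h"

    void RoutingExample() {
      using paddle::memory::allocation::Allocator;
      using paddle::memory::allocation::AllocatorFacade;
      // Routed to the pinned (communication) allocator.
      auto pinned = AllocatorFacade::Instance().AllocShared(
          paddle::platform::CPUPlace(), 4096, Allocator::kCommunication);
      // Routed to the normal CPU allocator.
      auto normal = AllocatorFacade::Instance().AllocShared(
          paddle::platform::CPUPlace(), 4096, Allocator::kDefault);
      // Both blocks still report CPUPlace as their place.
    }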
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include <cuda.h>
#include <cuda_runtime.h>
namespace paddle {
namespace memory {
namespace allocation {
std::unique_ptr<Allocation> CPUPinnedAllocator::Allocate(size_t size,
Allocator::Attr attr) {
PADDLE_ENFORCE_EQ(
attr, kCommunication,
"CPUPinnedAllocator should be used for Cross-Device Communication");
void* ptr;
PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
return std::unique_ptr<CPUPinnedAllocation>(
new CPUPinnedAllocation(ptr, size));
}
void CPUPinnedAllocator::Free(Allocation* allocation) {
PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUPinnedAllocation*>(allocation));
PADDLE_ENFORCE(cudaFreeHost(allocation->ptr()));
}
bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
class CPUPinnedAllocation : public Allocation {
public:
CPUPinnedAllocation(void* ptr, size_t size)
: Allocation(ptr, size, platform::CPUPlace()) {}
};
class CPUPinnedAllocator : public UnmanagedAllocator {
public:
std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override;
void Free(Allocation* allocation) override;
bool IsAllocThreadSafe() const override;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
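For completeness, a minimal direct-use sketch of the new allocator (in practice it is owned by CPUManagedAllocator above); Allocate() enforces that the attribute is kCommunication, and the 1024-byte size is illustrative:

    #include "paddle/fluid/memory/allocation/pinned_allocator.h"

    void DirectPinnedExample() {
      using paddle::memory::allocation::Allocator;
      paddle::memory::allocation::CPUPinnedAllocator allocator;
      // Any attr other than kCommunication trips the PADDLE_ENFORCE_EQ above.
      auto block = allocator.Allocate(/*size=*/1024, Allocator::kCommunication);
      // block->ptr() is page-locked host memory from cudaMallocHost, suitable
      // as a staging buffer for asynchronous host<->device copies.
      allocator.Free(block.get());  // releases the memory via cudaFreeHost
    }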
......@@ -303,7 +303,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bool fuse_eltwise = ctx.Attr<bool>("fuse_eltwise");
int groups = ctx.Attr<int>("groups");
// TODO: add support for dilation
// TODO: add support for dilation // NOLINT
PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
......@@ -386,8 +386,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto user_weights_memory_p = handler.AcquireWeightsMemory(
user_weights_md, to_void_cast<T>(filter_data));
T* output_data =
output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
T* output_data = output->mutable_data<T>(
ctx.GetPlace(), paddle::memory::Allocator::kDefault,
handler.GetDstMemorySize());
// create reorder primitive if the input format is not the preferred one
auto src_memory_p =
handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
......@@ -626,7 +627,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
user_diff_dst_memory_p, pipeline);
const size_t size = handler.GetDiffWeightsMemorySize();
filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace(), size);
filter_grad_data = filter_grad->mutable_data<T>(
ctx.GetPlace(), paddle::memory::Allocator::kDefault, size);
auto diff_weights_memory_p =
handler.AcquireDiffWeightsMemoryFromWeightsPrimitive(
......@@ -651,7 +653,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
pipeline);
const size_t size = handler.GetDiffSourceMemorySize();
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace(), size);
input_grad_data = input_grad->mutable_data<T>(
ctx.GetPlace(), paddle::memory::Allocator::kDefault, size);
auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive(
reinterpret_cast<void*>(input_grad_data));
......
......@@ -112,17 +112,16 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
}
}
// TODO(dzhwinter) : fix the redundent Tensor allocate and free
// TODO(dzhwinter) : fix the redundant Tensor allocate and free
template <typename T>
void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
if (platform::is_gpu_place(self->place())) {
std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
framework::TensorCopySync(*self, platform::CPUPlace(), dst.get());
dst->data<T>()[offset] = elem;
framework::TensorCopySync(*dst.get(), self->place(), self);
framework::Tensor dst;
framework::TensorCopySync(*self, platform::CPUPlace(), &dst);
dst.mutable_data<T>(platform::CPUPlace())[offset] = elem;
framework::TensorCopySync(dst, self->place(), self);
} else if (platform::is_cpu_place(self->place())) {
self->data<T>()[offset] = elem;
self->mutable_data<T>(self->place())[offset] = elem;
}
}
......
......@@ -113,7 +113,7 @@ class TestConv2dOp(OpTest):
return
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
self.check_grad_with_place(
place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
place, {'Input', 'Filter'}, 'Output', max_relative_error=0.02)
def test_check_grad_no_filter(self):
if self.dtype == np.float16:
......