Unverified commit 4aa931dd, authored by Zeng Jinle, committed by GitHub

Code clean of Allocator (#17602)

* Revert "Revert "Fix allocator bug""

This reverts commit 174d0d0b.

* Revert "fix travis ci"

This reverts commit 5656fa9f.

test=develop

* add inlined_vector.h, test=develop

* add inlined_vector_test,test=develop

* clean code of allocator,test=develop

* delete zero_size_allocator.h,test=develop

* fix failed unittest,test=develop
Parent 430e2565
......@@ -28,12 +28,14 @@ namespace framework {
TEST(LoD, PrintLoDTensor) {
LoDTensor tensor1;
tensor1.Resize({2});
tensor1.mutable_data<float>(platform::CPUPlace());
tensor1.data<float>()[0] = 0.2;
tensor1.data<float>()[1] = 0.5;
LOG(INFO) << tensor1;
LoDTensor tensor2;
tensor2.Resize({2});
tensor2.mutable_data<int64_t>(platform::CPUPlace());
tensor2.data<int64_t>()[0] = 1;
tensor2.data<int64_t>()[1] = 2;
......
......@@ -384,7 +384,7 @@ TEST_F(MkldnnQuantizerTest, histogram_empty) {
// zero tensor
framework::LoDTensor var_tensor;
var_tensor.Resize({0});
ASSERT_TRUE(var_tensor.mutable_data<double>(platform::CPUPlace()));
var_tensor.mutable_data<double>(platform::CPUPlace());
ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet);
}
......
......@@ -4,7 +4,6 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
if (WITH_GPU)
......@@ -38,7 +37,7 @@ else ()
set(AllocatorFacadeDeps)
endif()
list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator)
list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator)
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
class AllocationWithUnderlying : public Allocation {
public:
explicit AllocationWithUnderlying(AllocationPtr allocation)
: Allocation(allocation->ptr(), allocation->size(), allocation->place()),
allocation_(std::move(allocation)) {}
AllocationPtr allocation_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -52,7 +52,7 @@ class Allocator;
* decorate a RetryAllocator to any allocator to perform allocation retry when
* first allocation request fails.
*
* Explanations of Allocator design is as follows:
* Explanations of Allocator design are as follows:
*
* Suppose we have an allocator which is decorated by several allocators:
*
......@@ -127,8 +127,15 @@ class Allocation {
size_t size_;
platform::Place place_;
// NOTE(zjl): Since decorated_allocators_ is usually a small vector
// We reserve a small buffer to it to prevent frequent heap allocation
/**
* NOTE(zjl): Since decorated_allocators_ is usually a small vector,
* we reserve a small inline buffer for it to prevent frequent heap
* allocations.
*
* Alternatively, we could use a std::vector<Allocator *> here and
* reserve kReserveAllocatorNum in the constructor of Allocation.
* But using std::vector<Allocator *> made the OCR recognition model
* fail in CE: the training duration was 8% slower than the KPI.
*/
static constexpr size_t kReserveAllocatorNum = 8;
using DecoratedAllocatorStack =
framework::InlinedVector<Allocator*, kReserveAllocatorNum>;
......
......@@ -29,7 +29,6 @@
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
......@@ -192,10 +191,6 @@ class CUDAPinnedChunkedAllocator : public ChunkedAllocator {
class AllocatorFacadePrivate {
public:
std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
~AllocatorFacadePrivate() = default;
AllocatorFacadePrivate() {
auto strategy = GetAllocatorStrategy();
switch (strategy) {
......@@ -207,7 +202,6 @@ class AllocatorFacadePrivate {
InitCPUAllocator();
InitCUDAAllocator();
InitCUDAPinnedAllocator();
WrapZeroSizeAllocator();
break;
}
default: {
......@@ -215,6 +209,18 @@ class AllocatorFacadePrivate {
static_cast<int>(strategy));
}
}
InitZeroSizeAllocators();
}
inline const std::shared_ptr<Allocator>& GetAllocator(
const platform::Place& place, size_t size) {
const auto& allocators = (size > 0 ? allocators_ : zero_size_allocators_);
auto iter = allocators.find(place);
if (iter == allocators.end()) {
throw BadAlloc(
string::Sprintf("No such allocator for the place, %s", place));
}
return iter->second;
}
private:
......@@ -252,12 +258,40 @@ class AllocatorFacadePrivate {
#endif
}
void WrapZeroSizeAllocator() {
for (auto& pair : allocators_) {
pair.second =
std::make_shared<ZeroSizeAllocator>(pair.second, pair.first);
class ZeroSizeAllocator : public Allocator {
public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
protected:
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
return new Allocation(nullptr, 0, place_);
}
void FreeImpl(Allocation* allocation) override { delete allocation; }
private:
platform::Place place_;
};
void InitZeroSizeAllocators() {
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
int device_count = platform::GetCUDADeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::CUDAPlace(dev_id));
}
places.emplace_back(platform::CUDAPinnedPlace());
#endif
for (auto& p : places) {
zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
}
}
private:
std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
std::map<platform::Place, std::shared_ptr<Allocator>> zero_size_allocators_;
};
// Pimpl. Make interface clean.
......@@ -276,12 +310,7 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
Allocator::Attr attr) {
auto it = m_->allocators_.find(place);
if (it == m_->allocators_.end()) {
throw BadAlloc(
string::Sprintf("No such allocator for the place, %s", place));
}
return m_->allocators_.at(place)->Allocate(size, attr);
return m_->GetAllocator(place, size)->Allocate(size, attr);
}
} // namespace allocation
......
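For orientation, a hedged usage sketch of the new zero-size path: a request of size zero should now be routed by GetAllocator() to the per-place ZeroSizeAllocator registered in InitZeroSizeAllocators() and yield an allocation whose ptr() is nullptr. The singleton accessor and the default Allocator::Attr below are assumptions based on the surrounding code; adjust if the actual API differs.
```cpp
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"

// Hedged sketch, not part of the diff: exercise the zero-size allocation path.
void ZeroSizeAllocSketch() {
  namespace alloc = paddle::memory::allocation;
  // Assumes AllocatorFacade::Instance() and a default Allocator::Attr.
  auto allocation = alloc::AllocatorFacade::Instance().Alloc(
      paddle::platform::CPUPlace(), /*size=*/0);
  // Expected after this commit: allocation->size() == 0 and
  // allocation->ptr() == nullptr, served by the internal ZeroSizeAllocator.
}
```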
......@@ -16,7 +16,6 @@
#include <algorithm>
#include <limits>
#include <utility>
#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
namespace paddle {
namespace memory {
......
......@@ -15,7 +15,6 @@
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include <mutex> // NOLINT
#include <utility>
#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
namespace paddle {
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
namespace paddle {
namespace memory {
namespace allocation {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
bool ZeroSizeAllocator::IsAllocThreadSafe() const {
return underlying_allocator_->IsAllocThreadSafe();
}
Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
if (size == 0) {
return new Allocation(nullptr, 0, place_);
} else {
return underlying_allocator_->Allocate(size, attr).release();
}
}
void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
if (allocation->size() == 0) {
delete allocation;
} else {
underlying_allocator_->Free(allocation);
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
// The allocator that handles zero-size requests. It always returns an
// allocation even when the requested size is zero; however, the
// allocation's ptr() is nullptr.
class ZeroSizeAllocator : public Allocator {
public:
ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator,
const platform::Place& p)
: underlying_allocator_(std::move(underlying_allocator)), place_(p) {}
bool IsAllocThreadSafe() const override;
protected:
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
void FreeImpl(Allocation* allocation) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
const platform::Place& place_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include <cstring> // for memcpy
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
......@@ -24,6 +25,7 @@ template <>
void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
platform::CPUPlace,
const void* src, size_t num) {
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
......@@ -40,6 +42,7 @@ template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(src_place.device);
if (stream) {
......@@ -59,6 +62,8 @@ template <>
void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(dst_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
......@@ -77,6 +82,8 @@ template <>
void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
if (UNLIKELY(num == 0)) return;
if (dst_place == src_place) {
platform::SetDeviceId(src_place.device);
if (stream) {
......@@ -103,6 +110,7 @@ template <>
void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
platform::CPUPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
......@@ -110,6 +118,7 @@ template <>
void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CPUPlace src_place, const void* src, size_t num) {
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
......@@ -117,6 +126,7 @@ template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
......@@ -125,6 +135,7 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPlace src_place, const void* src, size_t num,
cudaStream_t stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned");
......@@ -140,6 +151,8 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
platform::CUDAPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num,
cudaStream_t stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(dst_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU");
......
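The repeated `if (UNLIKELY(num == 0)) return;` guards above follow from zero-size allocations now carrying a null pointer: passing a null pointer to std::memcpy is undefined behavior even when the byte count is zero. A minimal standalone illustration of the guard, assuming nothing beyond the C++ standard library:
```cpp
#include <cstddef>
#include <cstring>

// Minimal illustration of the zero-byte guard: return before touching
// possibly-null pointers, since memcpy with a null argument is UB even
// for a zero-length copy.
void SafeCopy(void* dst, const void* src, std::size_t num) {
  if (num == 0) return;
  std::memcpy(dst, src, num);
}
```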
......@@ -454,6 +454,7 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_variance = ctx.Input<Tensor>("Variance");
mean_data = running_mean->data<T>();
inv_var_tensor.Resize({C});
T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
......
......@@ -264,8 +264,8 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
framework::Tensor need_find_max, out_size;
int* find_max = need_find_max.mutable_data<int>(gpu_place);
int* out_size_data = out_size.mutable_data<int>(gpu_place);
int* find_max = need_find_max.mutable_data<int>({1}, gpu_place);
int* out_size_data = out_size.mutable_data<int>({1}, gpu_place);
FindRangeAbsMaxAndFillArray<T><<<1, 1, 0, ctx.stream()>>>(
cur_scale.data<T>(), last_scale.data<T>(), iter.data<int64_t>(),
......