diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index 15928c18d38b8a513b00f993b57faab43978bf53..d1554113bc366f38d1cfd7603e2848f618794d9f 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -28,12 +28,14 @@ namespace framework {
 
 TEST(LoD, PrintLoDTensor) {
   LoDTensor tensor1;
+  tensor1.Resize({2});
   tensor1.mutable_data<float>(platform::CPUPlace());
   tensor1.data<float>()[0] = 0.2;
   tensor1.data<float>()[1] = 0.5;
 
   LOG(INFO) << tensor1;
 
   LoDTensor tensor2;
+  tensor2.Resize({2});
   tensor2.mutable_data<int64_t>(platform::CPUPlace());
   tensor2.data<int64_t>()[0] = 1;
   tensor2.data<int64_t>()[1] = 2;
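After this patch, a zero-size allocation hands back a nullptr buffer, so the tests above must Resize() before mutable_data(). A minimal sketch of the failure mode this avoids (illustrative, not part of the patch):

LoDTensor tensor;
// numel() is still 0 here, so mutable_data() would now return the
// zero-size allocation's nullptr instead of a usable buffer.
tensor.Resize({2});  // shape [2] -> numel() == 2
float* buf = tensor.mutable_data<float>(platform::CPUPlace());
buf[0] = 0.2f;  // safe: a real two-element allocation backs the tensor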
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 6bc892638c28ca0b5bab82936bf9700289bed6b2..6f57f8b20287d8427321aa17bf8c6d094d776802 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -384,7 +384,7 @@ TEST_F(MkldnnQuantizerTest, histogram_empty) {
   // zero tensor
   framework::LoDTensor var_tensor;
   var_tensor.Resize({0});
-  ASSERT_TRUE(var_tensor.mutable_data<float>(platform::CPUPlace()));
+  var_tensor.mutable_data<float>(platform::CPUPlace());
   ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet);
 }
 
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 4c4ae72effaaf2829267c23cf7bd26753a9fab85..c309febd49905104c259d71f5c56bf58b7294090 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -4,7 +4,6 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
 cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
-cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 
 if (WITH_GPU)
@@ -38,7 +37,7 @@ else ()
   set(AllocatorFacadeDeps)
 endif()
 
-list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator)
+list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator)
 
 cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
 cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
diff --git a/paddle/fluid/memory/allocation/allocation_with_underlying.h b/paddle/fluid/memory/allocation/allocation_with_underlying.h
deleted file mode 100644
index 69f78667d7d33c59245a9890b9a2ce469f629450..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/allocation_with_underlying.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/memory/allocation/allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-class AllocationWithUnderlying : public Allocation {
- public:
-  explicit AllocationWithUnderlying(AllocationPtr allocation)
-      : Allocation(allocation->ptr(), allocation->size(), allocation->place()),
-        allocation_(std::move(allocation)) {}
-
-  AllocationPtr allocation_;
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index 5acdd9d0fcba2d74a4b72e94c81b0b570b17acb1..1fcd9361805aa8a494684fdafc19013338092791 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -52,7 +52,7 @@ class Allocator;
  * decorate a RetryAllocator to any allocator to perform allocation retry when
  * first allocation request fails.
  *
- * Explanations of Allocator design is as follows:
+ * Explanations of Allocator design are as follows:
  *
  * Suppose we have an allocator which is decorated by several allocators:
  *
@@ -127,8 +127,15 @@ class Allocation {
   size_t size_;
   platform::Place place_;
 
-  // NOTE(zjl): Since decorated_allocators_ is usually a small vector
-  // We reserve a small buffer to it to prevent frequent heap allocation
+  /**
+   * NOTE(zjl): Since decorated_allocators_ is usually a small vector,
+   * we reserve a small buffer for it to prevent frequent heap allocation.
+   *
+   * Instead, we could use a std::vector here and reserve
+   * kReserveAllocatorNum in the constructor of Allocation.
+   * But using std::vector makes the OCR recognition model fail in CE:
+   * the training duration is 8% slower than the KPI.
+   */
   static constexpr size_t kReserveAllocatorNum = 8;
   using DecoratedAllocatorStack =
       framework::InlinedVector<const Allocator*, kReserveAllocatorNum>;
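The InlinedVector mentioned in the new comment stores its first kReserveAllocatorNum elements in an in-object buffer and spills to the heap only beyond that. A rough sketch of the idea, assuming a simplified interface (the real container is framework::InlinedVector; the class and member names below are hypothetical):

#include <cstddef>
#include <vector>

template <typename T, size_t N>
class InlinedVectorSketch {
 public:
  void push_back(const T& item) {
    if (size_ < N) {
      inline_buf_[size_] = item;  // first N pushes touch no heap memory
    } else {
      heap_buf_.push_back(item);  // overflow spills to an ordinary vector
    }
    ++size_;
  }
  T& operator[](size_t i) { return i < N ? inline_buf_[i] : heap_buf_[i - N]; }
  size_t size() const { return size_; }

 private:
  T inline_buf_[N]{};           // in-object storage for the common case
  std::vector<T> heap_buf_;     // used only past N elements
  size_t size_{0};
};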
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 09328aded58cb0cccd9de0aba399f5c49313042f..1ff719c9e726ac5bc3f39cd552fe5c53b22147a0 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -29,7 +29,6 @@
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
-#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
@@ -192,10 +191,6 @@ class CUDAPinnedChunkedAllocator : public ChunkedAllocator {
 
 class AllocatorFacadePrivate {
  public:
-  std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
-
-  ~AllocatorFacadePrivate() = default;
-
   AllocatorFacadePrivate() {
     auto strategy = GetAllocatorStrategy();
     switch (strategy) {
@@ -207,7 +202,6 @@ class AllocatorFacadePrivate {
         InitCPUAllocator();
         InitCUDAAllocator();
         InitCUDAPinnedAllocator();
-        WrapZeroSizeAllocator();
         break;
       }
       default: {
@@ -215,6 +209,18 @@ class AllocatorFacadePrivate {
                      static_cast<int>(strategy));
       }
     }
+    InitZeroSizeAllocators();
+  }
+
+  inline const std::shared_ptr<Allocator>& GetAllocator(
+      const platform::Place& place, size_t size) {
+    const auto& allocators = (size > 0 ? allocators_ : zero_size_allocators_);
+    auto iter = allocators.find(place);
+    if (iter == allocators.end()) {
+      throw BadAlloc(
+          string::Sprintf("No such allocator for the place, %s", place));
+    }
+    return iter->second;
   }
 
  private:
@@ -252,12 +258,40 @@ class AllocatorFacadePrivate {
 #endif
   }
 
-  void WrapZeroSizeAllocator() {
-    for (auto& pair : allocators_) {
-      pair.second =
-          std::make_shared<ZeroSizeAllocator>(pair.second, pair.first);
+  class ZeroSizeAllocator : public Allocator {
+   public:
+    explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
+
+   protected:
+    Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
+      return new Allocation(nullptr, 0, place_);
+    }
+
+    void FreeImpl(Allocation* allocation) override { delete allocation; }
+
+   private:
+    platform::Place place_;
+  };
+
+  void InitZeroSizeAllocators() {
+    std::vector<platform::Place> places;
+    places.emplace_back(platform::CPUPlace());
+#ifdef PADDLE_WITH_CUDA
+    int device_count = platform::GetCUDADeviceCount();
+    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
+      places.emplace_back(platform::CUDAPlace(dev_id));
+    }
+    places.emplace_back(platform::CUDAPinnedPlace());
+#endif
+
+    for (auto& p : places) {
+      zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
     }
   }
+
+ private:
+  std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
+  std::map<platform::Place, std::shared_ptr<Allocator>> zero_size_allocators_;
 };
 
 // Pimpl. Make interface clean.
@@ -276,12 +310,7 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
 
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
                                      Allocator::Attr attr) {
-  auto it = m_->allocators_.find(place);
-  if (it == m_->allocators_.end()) {
-    throw BadAlloc(
-        string::Sprintf("No such allocator for the place, %s", place));
-  }
-  return m_->allocators_.at(place)->Allocate(size, attr);
+  return m_->GetAllocator(place, size)->Allocate(size, attr);
 }
 
 }  // namespace allocation
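With GetAllocator() dispatching on size, a zero-byte request is answered by the per-place ZeroSizeAllocator and never reaches the real allocator chain. A hedged usage sketch (assuming the default Allocator::Attr; the comments describe the intended behavior and are not taken from the patch):

// Zero-byte requests yield a real Allocation object with a null pointer.
auto zero = AllocatorFacade::Instance().Alloc(platform::CPUPlace(), 0);
// zero->ptr() == nullptr && zero->size() == 0

// Non-zero requests go through allocators_ exactly as before.
auto buf = AllocatorFacade::Instance().Alloc(platform::CPUPlace(), 1024);
// buf->ptr() != nullptr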
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc
index e04c0aa34b1cd6200806cc2a012161e3478eca0b..2f3e6205c3c1713756bce254c947f9cd500e3d46 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator.cc
@@ -16,7 +16,6 @@
 #include <algorithm>
 #include <limits>
 #include <utility>
-#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc
index c43099cc88f839ad92d36774d49aafd7192f916f..e9ec39c893255fe297f38e68eedaa68f3e6496b0 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.cc
+++ b/paddle/fluid/memory/allocation/locked_allocator.cc
@@ -15,7 +15,6 @@
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include <mutex>  // NOLINT
 #include <utility>
-#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 
 namespace paddle {
diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc
index 7e888988f9602e362d73f64c1b45552e84e3349c..167dd923dbbe9d04861c015c013c1211046be76c 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
-#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 
 namespace paddle {
 namespace memory {
 namespace allocation {
diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.cc b/paddle/fluid/memory/allocation/zero_size_allocator.cc
deleted file mode 100644
index 39743bcb10c700c9a8446b9040c8a8707d57ec7d..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/zero_size_allocator.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-bool ZeroSizeAllocator::IsAllocThreadSafe() const {
-  return underlying_allocator_->IsAllocThreadSafe();
-}
-
-Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
-  if (size == 0) {
-    return new Allocation(nullptr, 0, place_);
-  } else {
-    return underlying_allocator_->Allocate(size, attr).release();
-  }
-}
-
-void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
-  if (allocation->size() == 0) {
-    delete allocation;
-  } else {
-    underlying_allocator_->Free(allocation);
-  }
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
- -#include "paddle/fluid/memory/allocation/zero_size_allocator.h" - -namespace paddle { -namespace memory { -namespace allocation { - -bool ZeroSizeAllocator::IsAllocThreadSafe() const { - return underlying_allocator_->IsAllocThreadSafe(); -} - -Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { - if (size == 0) { - return new Allocation(nullptr, 0, place_); - } else { - return underlying_allocator_->Allocate(size, attr).release(); - } -} - -void ZeroSizeAllocator::FreeImpl(Allocation *allocation) { - if (allocation->size() == 0) { - delete allocation; - } else { - underlying_allocator_->Free(allocation); - } -} - -} // namespace allocation -} // namespace memory -} // namespace paddle diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.h b/paddle/fluid/memory/allocation/zero_size_allocator.h deleted file mode 100644 index 08a7a06dbf290b55994a407fe478f792b0c0964a..0000000000000000000000000000000000000000 --- a/paddle/fluid/memory/allocation/zero_size_allocator.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "paddle/fluid/memory/allocation/allocator.h" - -namespace paddle { -namespace memory { -namespace allocation { - -// The allocator handles the request's size is zero. Allocator will always -// return an allocation even the request size is zero. However, the -// allocation.ptr() is nullptr -class ZeroSizeAllocator : public Allocator { - public: - ZeroSizeAllocator(std::shared_ptr underlying_allocator, - const platform::Place& p) - : underlying_allocator_(std::move(underlying_allocator)), place_(p) {} - - bool IsAllocThreadSafe() const override; - - protected: - Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; - void FreeImpl(Allocation* allocation) override; - - private: - std::shared_ptr underlying_allocator_; - const platform::Place& place_; -}; - -} // namespace allocation -} // namespace memory -} // namespace paddle diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 1408163e4b5278ddcd65eb4f2900109d772a589a..c08d86eb213310b4e8dbac541c254867bb44b903 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -15,6 +15,7 @@ limitations under the License. 
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index d583909a666624d86031bb207154c93cf12d5cc2..f6295337d1f1042f021f7b0de15f476225beb3a2 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -454,6 +454,7 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
       const auto *running_mean = ctx.Input<Tensor>("Mean");
       const auto *running_variance = ctx.Input<Tensor>("Variance");
       mean_data = running_mean->data<T>();
+      inv_var_tensor.Resize({C});
       T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
       EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
       ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index 3d24e8986df97d4ad5ad1dfc92a2e0a7eed38521..e9a7201bc0826414ec4adbd3bf2804db013a4571 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -264,8 +264,8 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
     T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
 
     framework::Tensor need_find_max, out_size;
-    int* find_max = need_find_max.mutable_data<int>(gpu_place);
-    int* out_size_data = out_size.mutable_data<int>(gpu_place);
+    int* find_max = need_find_max.mutable_data<int>({1}, gpu_place);
+    int* out_size_data = out_size.mutable_data<int>({1}, gpu_place);
 
     FindRangeAbsMaxAndFillArray<T><<<1, 1, 0, ctx.stream()>>>(
         cur_scale.data<T>(), last_scale.data<T>(), iter.data<int>(),
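The fake_quantize change illustrates the pattern for freshly constructed tensors: mutable_data(place) alone no longer yields a usable buffer when the shape is unset, since numel() == 0 now maps to the nullptr zero-size allocation. A minimal sketch of the dims-taking overload used in the hunk above (the tensor name is illustrative; gpu_place is as in the surrounding code):

framework::Tensor flag;
// mutable_data<int>(gpu_place) on an unsized tensor would return nullptr;
// passing {1} first resizes to a single element, then allocates it.
int* flag_data = flag.mutable_data<int>({1}, gpu_place);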