malloc.cc 7.1 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
2 3 4 5 6 7 8 9 10 11 12 13 14

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

W
Wu Yi 已提交
15
#include <string>
G
gongweibao 已提交
16 17
#include <vector>

Y
Yi Wang 已提交
18
#include "paddle/fluid/memory/malloc.h"
19

L
liaogang 已提交
20 21
#include "glog/logging.h"

Y
Yi Wang 已提交
22 23 24
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
W
Wu Yi 已提交
25
#include "paddle/fluid/string/printf.h"
L
liaogang 已提交
26

27 28 29 30 31 32
// Debug flag: when true, every freshly allocated buffer is filled with 0xEF so
// that ops which wrongly assume zero-initialized memory fail visibly in tests.
DEFINE_bool(init_allocated_mem, false,
            "It is a mistake that the values of the memory allocated by "
            "BuddyAllocator are always zeroed in some op's implementation. "
            "To find this error in time, we use init_allocated_mem to indicate "
            "that initializing the allocated memory with a small value "
            "during unit testing.");
L
liaogang 已提交
33
DECLARE_double(fraction_of_gpu_memory_to_use);
L
liaogang 已提交
34

35 36 37
namespace paddle {
namespace memory {

38 39 40
using BuddyAllocator = detail::BuddyAllocator;

// Returns the single process-wide buddy allocator for host (CPU) memory,
// constructing it lazily and thread-safely on first use.
BuddyAllocator* GetCPUBuddyAllocator() {
  // We tried thread_local for inference::RNN1 model, but that not works much
  // for multi-thread test.
  static std::once_flag init_flag;
  static detail::BuddyAllocator* a = nullptr;

  // call_once makes construction safe when several threads allocate CPU
  // memory concurrently for the first time. The allocator is intentionally
  // leaked (never deleted) so it outlives all static destructors.
  std::call_once(init_flag, []() {
    a = new detail::BuddyAllocator(
        std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
  });

  return a;
}

55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
// We compared the NaiveAllocator with BuddyAllocator in CPU memory allocation,
// seems they are almost the same overhead.
//
// Kept only as a benchmarking reference: forwards straight to libc
// malloc/free with no pooling or chunk management.
struct NaiveAllocator {
  void* Alloc(size_t size) { return malloc(size); }

  void Free(void* p) {
    // Rejects null pointers — stricter than libc free(), which accepts them.
    PADDLE_ENFORCE(p);
    free(p);
  }

  static NaiveAllocator* Instance() {
    // Meyers singleton; initialization is thread-safe since C++11.
    static NaiveAllocator x;
    return &x;
  }

 private:
  // NOTE(review): never locked anywhere in this struct — looks vestigial;
  // confirm no external friend/usage before removing.
  std::mutex lock_;
};

L
liaogang 已提交
74
template <>
C
chengduoZH 已提交
75
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
76
  VLOG(100) << "Allocate " << size << " bytes on " << platform::Place(place);
77
  void* p = GetCPUBuddyAllocator()->Alloc(size);
78 79 80
  if (FLAGS_init_allocated_mem) {
    memset(p, 0xEF, size);
  }
81
  VLOG(100) << "  pointer=" << p;
82
  return p;
L
liaogang 已提交
83 84 85
}

template <>
C
chengduoZH 已提交
86
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
87
  VLOG(100) << "Free pointer=" << p << " on " << platform::Place(place);
L
liaogang 已提交
88 89 90 91 92 93 94 95
  GetCPUBuddyAllocator()->Free(p);
}

// Reports the number of bytes currently outstanding in the shared CPU
// buddy allocator.
template <>
size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
  const size_t bytes_in_use = GetCPUBuddyAllocator()->Used();
  return bytes_in_use;
}

96
#ifdef PADDLE_WITH_CUDA
L
liaogang 已提交
97

98
// Returns the buddy allocator for CUDA device `gpu_id`, building one
// allocator per visible device lazily (and thread-safely) on first use.
// Side effect: leaves `gpu_id` as the current CUDA device for the caller.
BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
  static std::once_flag init_flag;
  static detail::BuddyAllocator** a_arr = nullptr;
  static int gpu_num = 0;

  std::call_once(init_flag, []() {
    gpu_num = platform::GetCUDADeviceCount();
    a_arr = new BuddyAllocator*[gpu_num];
    for (int i = 0; i < gpu_num; i++) {
      a_arr[i] = nullptr;
      // Each allocator must be constructed with its own device current.
      platform::SetDeviceId(i);
      a_arr[i] = new BuddyAllocator(
          std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());

      VLOG(100) << "\n\nNOTE: each GPU device use "
                << FLAGS_fraction_of_gpu_memory_to_use * 100
                << "% of GPU memory.\n"
                << "You can set GFlags environment variable '"
                << "FLAGS_fraction_of_gpu_memory_to_use"
                << "' to change the fraction of GPU usage.\n\n";
    }
  });

  // Range-check on EVERY call. The original enforce lived inside call_once,
  // so only the id that happened to trigger initialization was validated and
  // negative ids were never rejected — any later bad id read a_arr out of
  // bounds.
  PADDLE_ENFORCE(gpu_id >= 0 && gpu_id < gpu_num,
                 "gpu_id:%d should be in [0, gpu_num:%d)", gpu_id, gpu_num);
  platform::SetDeviceId(gpu_id);
  return a_arr[gpu_id];
}

L
liaogang 已提交
128
template <>
D
dzhwinter 已提交
129
size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
130
  return GetGPUBuddyAllocator(place.device)->Used();
L
liaogang 已提交
131
}
L
liaogang 已提交
132

L
liaogang 已提交
133
template <>
C
chengduoZH 已提交
134 135
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
C
chengduoZH 已提交
136
  auto* ptr = buddy_allocator->Alloc(size);
137 138 139 140
  if (ptr == nullptr) {
    int cur_dev = platform::GetCurrentDeviceId();
    platform::SetDeviceId(place.device);
    size_t avail, total;
141
    platform::GpuMemoryUsage(&avail, &total);
W
Wu Yi 已提交
142 143 144
    LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size)
                 << " in GPU " << place.device << ", available "
                 << string::HumanReadableSize(avail);
145
    LOG(WARNING) << "total " << total;
W
Wu Yi 已提交
146 147 148 149 150 151 152 153
    LOG(WARNING) << "GpuMinChunkSize "
                 << string::HumanReadableSize(
                        buddy_allocator->GetMinChunkSize());
    LOG(WARNING) << "GpuMaxChunkSize "
                 << string::HumanReadableSize(
                        buddy_allocator->GetMaxChunkSize());
    LOG(WARNING) << "GPU memory used: "
                 << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
154 155
    platform::SetDeviceId(cur_dev);
  }
156 157 158
  if (FLAGS_init_allocated_mem) {
    cudaMemset(ptr, 0xEF, size);
  }
159
  return ptr;
160 161
}

L
liaogang 已提交
162
template <>
C
chengduoZH 已提交
163 164 165 166
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
  GetGPUBuddyAllocator(place.device)->Free(p);
}

C
chengduoZH 已提交
167
// Returns the single process-wide buddy allocator for page-locked (pinned)
// host memory, constructing it lazily and thread-safely on first use.
BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
  static std::once_flag init_flag;
  static BuddyAllocator* ba = nullptr;

  std::call_once(init_flag, []() {
    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                new detail::CUDAPinnedAllocator),
                            platform::CUDAPinnedMinChunkSize(),
                            platform::CUDAPinnedMaxChunkSize());
  });

  return ba;
}

template <>
C
chengduoZH 已提交
182
size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
C
chengduoZH 已提交
183
  return GetCUDAPinnedBuddyAllocator()->Used();
C
chengduoZH 已提交
184 185 186 187 188
}

// Allocates `size` bytes of page-locked host memory. Logs a warning and
// returns nullptr when cudaMallocHost (via the pinned allocator) fails.
template <>
void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
                                       size_t size) {
  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
  void* ptr = buddy_allocator->Alloc(size);

  if (ptr == nullptr) {
    LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
                 << " bytes in CUDAPinnedPlace";
  }
  // Guard the debug poisoning: the original memset ran even after the
  // allocation failed, which is undefined behavior on a null pointer.
  if (FLAGS_init_allocated_mem && ptr != nullptr) {
    memset(ptr, 0xEF, size);
  }
  return ptr;
}

// Returns pinned host memory previously obtained from
// Alloc<CUDAPinnedPlace> to its buddy allocator.
template <>
void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, void* p) {
  GetCUDAPinnedBuddyAllocator()->Free(p);
}
L
Luo Tao 已提交
206
#endif
207

208 209 210 211 212 213 214 215 216 217 218 219
// Visitor arm for host memory: delegate to the CPU accounting function.
size_t Usage::operator()(const platform::CPUPlace& cpu) const {
  const size_t used_bytes = Used(cpu);
  return used_bytes;
}

// Visitor arm for device memory; only meaningful when built with CUDA,
// otherwise throws.
size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
#ifdef PADDLE_WITH_CUDA
  return Used(gpu);
#else
  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}

C
chengduoZH 已提交
220
// Visitor arm for pinned host memory; only meaningful when built with CUDA,
// otherwise throws.
size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
#ifdef PADDLE_WITH_CUDA
  return Used(cuda_pinned);
#else
  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
#endif
}

228 229 230 231
// Dispatches on the concrete place type via the Usage visitor and returns
// the bytes currently in use for that place.
size_t memory_usage(const platform::Place& p) {
  Usage visitor;
  return boost::apply_visitor(visitor, p);
}

232 233
}  // namespace memory
}  // namespace paddle