malloc.cc 6.8 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
2 3 4 5 6 7 8 9 10 11 12 13 14

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

G
gongweibao 已提交
15 16
#include <vector>

Y
Yi Wang 已提交
17
#include "paddle/fluid/memory/malloc.h"
18

L
liaogang 已提交
19 20
#include "glog/logging.h"

Y
Yi Wang 已提交
21 22 23
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
L
liaogang 已提交
24

25 26 27 28 29 30
// Debug switch, off by default. When it is set, the Alloc specializations in
// this file overwrite each freshly allocated buffer with the byte 0xEF so that
// code which wrongly relies on zero-initialized memory fails visibly.
DEFINE_bool(init_allocated_mem, false,
            "It is a mistake that the values of the memory allocated by "
            "BuddyAllocator are always zeroed in some op's implementation. "
            "To find this error in time, we use init_allocated_mem to indicate "
            "that initializing the allocated memory with a small value "
            "during unit testing.");
L
liaogang 已提交
31
// Flag defined in another translation unit. In this file it is only read to
// report the configured fraction in the GPU allocator's start-up log message.
DECLARE_double(fraction_of_gpu_memory_to_use);
L
liaogang 已提交
32

33 34 35
namespace paddle {
namespace memory {

36 37 38
// Shorthand so the accessors below can name the allocator type without the
// detail:: qualifier.
using BuddyAllocator = detail::BuddyAllocator;

// Returns the single BuddyAllocator used for CPU memory in this process.
//
// The allocator is created on the first call, guarded by std::call_once, and
// is never deleted. A thread_local allocator was tried for the
// inference::RNN1 model, but it did not help much in the multi-threaded test,
// so one shared instance is used.
BuddyAllocator* GetCPUBuddyAllocator() {
  static BuddyAllocator* cpu_allocator = nullptr;
  static std::once_flag created;

  std::call_once(created, []() {
    cpu_allocator = new BuddyAllocator(
        std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
  });

  return cpu_allocator;
}

53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
// Thin wrapper over malloc/free, kept for comparison: for CPU memory
// allocation the NaiveAllocator and the BuddyAllocator showed almost the same
// overhead.
struct NaiveAllocator {
  // Returns the shared instance, constructed on first use.
  static NaiveAllocator* Instance() {
    static NaiveAllocator instance;
    return &instance;
  }

  // Allocates `size` bytes with malloc and returns its result unchanged.
  void* Alloc(size_t size) {
    void* block = malloc(size);
    return block;
  }

  // Releases `p` with free. `p` is enforced to be non-null first.
  void Free(void* p) {
    PADDLE_ENFORCE(p);
    free(p);
  }

 private:
  // Not used by any member of this struct.
  std::mutex lock_;
};

L
liaogang 已提交
72
template <>
C
chengduoZH 已提交
73
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
74
  VLOG(100) << "Allocate " << size << " bytes on " << platform::Place(place);
75
  void* p = GetCPUBuddyAllocator()->Alloc(size);
76 77 78
  if (FLAGS_init_allocated_mem) {
    memset(p, 0xEF, size);
  }
79
  VLOG(100) << "  pointer=" << p;
80
  return p;
L
liaogang 已提交
81 82 83
}

template <>
C
chengduoZH 已提交
84
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
85
  VLOG(100) << "Free pointer=" << p << " on " << platform::Place(place);
L
liaogang 已提交
86 87 88 89 90 91 92 93
  GetCPUBuddyAllocator()->Free(p);
}

// Reports the value of Used() from the shared CPU buddy allocator. `place` is
// not consulted.
template <>
size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
  BuddyAllocator* cpu_allocator = GetCPUBuddyAllocator();
  return cpu_allocator->Used();
}

94
#ifdef PADDLE_WITH_CUDA
L
liaogang 已提交
95

96
// Returns the process-wide BuddyAllocator for CUDA device `gpu_id`.
//
// On the first call, one allocator is created for every device reported by
// platform::GetCUDADeviceCount(); the allocators are never deleted.
//
// `gpu_id` is validated on EVERY call and must lie in [0, device count).
// Previously the check ran only inside the one-time initializer, so a later
// call with an out-of-range or negative id indexed past the allocator table.
//
// Side effect: the current CUDA device is set to `gpu_id` before returning and
// is not restored.
BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
  static std::once_flag init_flag;
  static detail::BuddyAllocator** a_arr = nullptr;
  // Number of entries in a_arr; written once inside call_once.
  static int gpu_num = 0;

  std::call_once(init_flag, []() {
    gpu_num = platform::GetCUDADeviceCount();

    a_arr = new BuddyAllocator*[gpu_num];
    for (int i = 0; i < gpu_num; i++) {
      a_arr[i] = nullptr;
      // Select device i before constructing its allocator, as the chunk-size
      // queries are made while that device is current.
      platform::SetDeviceId(i);
      a_arr[i] = new BuddyAllocator(
          std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());

      VLOG(100) << "\n\nNOTE: each GPU device use "
                << FLAGS_fraction_of_gpu_memory_to_use * 100
                << "% of GPU memory.\n"
                << "You can set GFlags environment variable '"
                << "FLAGS_fraction_of_gpu_memory_to_use"
                << "' to change the fraction of GPU usage.\n\n";
    }
  });

  // call_once synchronizes with its initializer, so gpu_num and a_arr are
  // safe to read here from any thread.
  PADDLE_ENFORCE(gpu_id >= 0, "gpu_id:%d should >= 0", gpu_id);
  PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id,
                 gpu_num);

  platform::SetDeviceId(gpu_id);
  return a_arr[gpu_id];
}

L
liaogang 已提交
126
template <>
D
dzhwinter 已提交
127
size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
128
  return GetGPUBuddyAllocator(place.device)->Used();
L
liaogang 已提交
129
}
L
liaogang 已提交
130

L
liaogang 已提交
131
template <>
C
chengduoZH 已提交
132 133
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
C
chengduoZH 已提交
134
  auto* ptr = buddy_allocator->Alloc(size);
135 136 137 138
  if (ptr == nullptr) {
    int cur_dev = platform::GetCurrentDeviceId();
    platform::SetDeviceId(place.device);
    size_t avail, total;
139
    platform::GpuMemoryUsage(&avail, &total);
140 141 142
    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
                 << place.device << ", available " << avail << " bytes";
    LOG(WARNING) << "total " << total;
143 144
    LOG(WARNING) << "GpuMinChunkSize " << buddy_allocator->GetMinChunkSize();
    LOG(WARNING) << "GpuMaxChunkSize " << buddy_allocator->GetMaxChunkSize();
D
dzhwinter 已提交
145
    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
146 147
    platform::SetDeviceId(cur_dev);
  }
148 149 150
  if (FLAGS_init_allocated_mem) {
    cudaMemset(ptr, 0xEF, size);
  }
151
  return ptr;
152 153
}

L
liaogang 已提交
154
template <>
C
chengduoZH 已提交
155 156 157 158
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
  GetGPUBuddyAllocator(place.device)->Free(p);
}

C
chengduoZH 已提交
159
// Returns the single BuddyAllocator built on detail::CUDAPinnedAllocator.
//
// The allocator is created on the first call, guarded by std::call_once, and
// is never deleted.
BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
  static BuddyAllocator* pinned_allocator = nullptr;
  static std::once_flag created;

  std::call_once(created, []() {
    pinned_allocator = new BuddyAllocator(
        std::unique_ptr<detail::SystemAllocator>(
            new detail::CUDAPinnedAllocator),
        platform::CUDAPinnedMinChunkSize(),
        platform::CUDAPinnedMaxChunkSize());
  });

  return pinned_allocator;
}

template <>
C
chengduoZH 已提交
174
size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
C
chengduoZH 已提交
175
  return GetCUDAPinnedBuddyAllocator()->Used();
C
chengduoZH 已提交
176 177 178 179 180
}

// Allocates `size` bytes from the shared CUDA-pinned buddy allocator.
//
// Returns the allocated pointer, or nullptr (after logging a WARNING) when the
// allocator cannot satisfy the request. `place` is not consulted.
//
// When FLAGS_init_allocated_mem is set, a successfully allocated buffer is
// filled with the byte 0xEF. The fill is skipped for a failed allocation
// because passing a null pointer to memset is undefined behaviour.
template <>
void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
                                       size_t size) {
  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
  void* ptr = buddy_allocator->Alloc(size);

  if (ptr == nullptr) {
    LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
                 << " bytes in CUDAPinnedPlace";
    return ptr;
  }
  if (FLAGS_init_allocated_mem) {
    memset(ptr, 0xEF, size);
  }
  return ptr;
}

// Hands the block `p` back to the shared CUDA-pinned buddy allocator. `place`
// is not consulted.
template <>
void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, void* p) {
  BuddyAllocator* pinned_allocator = GetCUDAPinnedBuddyAllocator();
  pinned_allocator->Free(p);
}
L
Luo Tao 已提交
198
#endif
199

200 201 202 203 204 205 206 207 208 209 210 211
// Visitor overload for CPU places: forwards to Used<platform::CPUPlace>.
size_t Usage::operator()(const platform::CPUPlace& cpu) const {
  const size_t used_bytes = Used<platform::CPUPlace>(cpu);
  return used_bytes;
}

// Visitor overload for CUDA device places. Forwards to
// Used<platform::CUDAPlace> in CUDA builds; otherwise raises via PADDLE_THROW.
size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
#ifndef PADDLE_WITH_CUDA
  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#else
  const size_t used_bytes = Used<platform::CUDAPlace>(gpu);
  return used_bytes;
#endif
}

C
chengduoZH 已提交
212
// Visitor overload for CUDA-pinned places. Forwards to
// Used<platform::CUDAPinnedPlace> in CUDA builds; otherwise raises via
// PADDLE_THROW.
size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
#ifndef PADDLE_WITH_CUDA
  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
#else
  const size_t used_bytes = Used<platform::CUDAPinnedPlace>(cuda_pinned);
  return used_bytes;
#endif
}

220 221 222 223
// Returns the usage figure for `p` by dispatching a Usage visitor on the
// concrete place type held by the variant.
size_t memory_usage(const platform::Place& p) {
  Usage usage_visitor;
  return boost::apply_visitor(usage_visitor, p);
}

224 225
}  // namespace memory
}  // namespace paddle