Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
09d9794c
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
09d9794c
编写于
6月 27, 2017
作者:
L
liaogang
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'wangkuiyi/memory_cpu_allocator' into cpu_mem
上级
f3294541
e14e6873
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
242 addition
and
162 deletion
+242
-162
paddle/gserver/gradientmachines/MultiGradientMachine.cpp
paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+15
-7
paddle/gserver/gradientmachines/MultiGradientMachine.h
paddle/gserver/gradientmachines/MultiGradientMachine.h
+4
-2
paddle/memory/CMakeLists.txt
paddle/memory/CMakeLists.txt
+6
-0
paddle/memory/detail/CMakeLists.txt
paddle/memory/detail/CMakeLists.txt
+5
-2
paddle/memory/detail/cpu_allocator.h
paddle/memory/detail/cpu_allocator.h
+0
-71
paddle/memory/detail/cpu_allocator_test.cc
paddle/memory/detail/cpu_allocator_test.cc
+0
-30
paddle/memory/detail/system_allocator.h
paddle/memory/detail/system_allocator.h
+129
-0
paddle/memory/detail/system_allocator_test.cc
paddle/memory/detail/system_allocator_test.cc
+50
-0
paddle/memory/memory.cc
paddle/memory/memory.cc
+30
-37
paddle/memory/memory.h
paddle/memory/memory.h
+3
-13
未找到文件。
paddle/gserver/gradientmachines/MultiGradientMachine.cpp
浏览文件 @
09d9794c
...
...
@@ -166,11 +166,21 @@ MultiGradientMachine::MultiGradientMachine(const ModelConfig& config,
outArgStream_
=
HPPL_STREAM_1
;
start
();
}
void
MultiGradientMachine
::
start
()
{
for
(
auto
&
thread
:
threads_
)
{
thread
->
start
();
}
}
void
MultiGradientMachine
::
finish
()
{
for
(
auto
&
thread
:
threads_
)
{
thread
->
stop
();
}
}
std
::
vector
<
const
std
::
vector
<
ParameterPtr
>*>
MultiGradientMachine
::
getSlaveParameters
()
{
std
::
vector
<
const
std
::
vector
<
ParameterPtr
>*>
vec
;
...
...
@@ -326,12 +336,6 @@ void MultiGradientMachine::onPassEnd() {
}
}
void
MultiGradientMachine
::
finish
()
{
for
(
auto
&
thread
:
threads_
)
{
thread
->
stop
();
}
}
// Builds an evaluator by delegating to the gradient machine held by
// threads_[0].
Evaluator* MultiGradientMachine::makeEvaluator() const {
  const auto& firstThread = threads_[0];
  return firstThread->getGradientMachine()->makeEvaluator();
}
...
...
@@ -445,7 +449,7 @@ TrainerThread::TrainerThread(const ModelConfig& config,
gradStream_
=
HPPL_STREAM_2
;
valueStream_
=
HPPL_STREAM_3
;
stopping_
=
fals
e
;
stopping_
=
tru
e
;
updateCounter_
=
0
;
parameterUpdated_
=
false
;
}
...
...
@@ -453,6 +457,10 @@ TrainerThread::TrainerThread(const ModelConfig& config,
// Destruction goes through stop() before the members are torn down.
TrainerThread::~TrainerThread() {
  this->stop();
}
void
TrainerThread
::
start
()
{
if
(
!
stopping_
)
return
;
stopping_
=
false
;
gradientMachine_
->
start
();
computeThread_
.
reset
(
new
std
::
thread
([
this
]()
{
computeThread
();
}));
...
...
paddle/gserver/gradientmachines/MultiGradientMachine.h
浏览文件 @
09d9794c
...
...
@@ -176,6 +176,10 @@ public:
explicit
MultiGradientMachine
(
const
ModelConfig
&
config
,
bool
useGpu
);
virtual
void
start
();
virtual
void
finish
();
virtual
void
prefetch
(
const
std
::
vector
<
Argument
>&
inArgs
);
virtual
void
forward
(
const
std
::
vector
<
Argument
>&
inArgs
,
...
...
@@ -193,8 +197,6 @@ public:
virtual
void
onPassEnd
();
virtual
void
finish
();
virtual
Evaluator
*
makeEvaluator
()
const
;
virtual
void
eval
(
Evaluator
*
evaluator
)
const
;
...
...
paddle/memory/CMakeLists.txt
浏览文件 @
09d9794c
add_subdirectory
(
detail
)
# Build the `memory` library from memory.cc: through nv_library when WITH_GPU
# is set, through cc_library otherwise.
if(${WITH_GPU})
  nv_library(memory SRCS memory.cc)
else(${WITH_GPU})
  # The source file is memory.cc (it was previously misspelled "memroy.cc").
  cc_library(memory SRCS memory.cc)
endif(${WITH_GPU})
paddle/memory/detail/CMakeLists.txt
浏览文件 @
09d9794c
cc_test
(
cpu_allocator_test SRCS cpu_allocator_test.cc
)
nv_test
(
gpu_allocator_test SRCS gpu_allocator_test.cc
)
# system_allocator_test is registered with nv_test when WITH_GPU is set and
# with cc_test otherwise.
if(${WITH_GPU})
  nv_test(system_allocator_test SRCS system_allocator_test.cc)
else()
  cc_test(system_allocator_test SRCS system_allocator_test.cc)
endif()
paddle/memory/detail/cpu_allocator.h
已删除
100644 → 0
浏览文件 @
f3294541
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <cstdlib> // for malloc and free
#ifndef _WIN32
#include <sys/mman.h> // for mlock and munlock
#endif
namespace paddle {
namespace memory {
namespace detail {

// CPUAllocator<staging> serves host memory through std::malloc/std::free.
//
// The staging=true specialization additionally calls mlock on every buffer
// (on non-Windows builds), yielding pinned and locked memory intended as a
// staging area for data exchange between host and device. Allocating too
// much of it would reduce the amount of memory available to the system for
// paging, so CPUAllocator<staging=false> should be the default choice.
template <bool staging>
class CPUAllocator {
public:
  void* Alloc(size_t size);
  void Free(void* p, size_t size);
};

// Plain allocator: no page locking.
template <>
class CPUAllocator<false> {
public:
  // Returns whatever std::malloc(size) returns (nullptr on failure).
  void* Alloc(size_t size) { return std::malloc(size); }

  // Releases p with std::free; the size argument is not used.
  void Free(void* p, size_t size) { std::free(p); }
};

// Staging allocator: buffers are locked into memory where mlock exists.
template <>
class CPUAllocator<true> {
public:
  // Allocates size bytes and, when the allocation succeeded, locks them.
  // The result of mlock is not checked.
  void* Alloc(size_t size) {
    void* block = std::malloc(size);
#ifndef _WIN32
    if (block != nullptr) {
      mlock(block, size);
    }
#endif
    return block;
  }

  // Unlocks the size bytes at p (non-Windows only) and then frees them.
  void Free(void* p, size_t size) {
#ifndef _WIN32
    munlock(p, size);
#endif
    std::free(p);
  }
};

}  // namespace detail
}  // namespace memory
}  // namespace paddle
paddle/memory/detail/cpu_allocator_test.cc
已删除
100644 → 0
浏览文件 @
f3294541
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/memory/detail/cpu_allocator.h"
#include "gtest/gtest.h"
// A 4096-byte request to the non-staging allocator must yield a non-null
// pointer, which is then handed back to the allocator.
TEST(CPUAllocator, NonStaging) {
  using Allocator = paddle::memory::detail::CPUAllocator<false>;
  Allocator allocator;
  void* block = allocator.Alloc(4096);
  EXPECT_NE(block, nullptr);
  allocator.Free(block, 4096);
}
// A 4096-byte request to the staging allocator must yield a non-null
// pointer, which is then handed back to the allocator.
TEST(CPUAllocator, Staging) {
  using Allocator = paddle::memory::detail::CPUAllocator<true>;
  Allocator allocator;
  void* block = allocator.Alloc(4096);
  EXPECT_NE(block, nullptr);
  allocator.Free(block, 4096);
}
paddle/memory/detail/
gpu
_allocator.h
→
paddle/memory/detail/
system
_allocator.h
浏览文件 @
09d9794c
...
...
@@ -14,79 +14,116 @@ limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <stddef.h> // for size_t
#include <sys/mman.h> // for mlock and munlock
#include <cstdlib> // for malloc and free
#ifndef PADDLE_ONLY_CPU
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#endif // PADDLE_ONLY_CPU
#include "paddle/platform/assert.h"
namespace
paddle
{
namespace
memory
{
namespace
detail
{
// Callable record of one host allocation: the pointer, its size in bytes,
// and whether the range should be munlock()-ed before it is freed.
class CPUDeleter {
public:
  CPUDeleter(void* ptr, size_t size, bool locked)
      : ptr_(ptr), size_(size), locked_(locked) {}

  // The pointer this deleter was constructed with (may be nullptr).
  void* Ptr() { return ptr_; }

  // Frees the recorded allocation. `ptr` must be the recorded pointer.
  void operator()(void* ptr) {
    PADDLE_ASSERT(ptr == ptr_);
    const bool must_unlock = locked_ && ptr_ != nullptr;
    if (must_unlock) {
      munlock(ptr_, size_);
    }
    std::free(ptr_);
  }

private:
  void* ptr_;
  size_t size_;
  bool locked_;
};
// CPUAllocator<lock_memory=true> calls mlock, which pins and locks the
// allocated memory so that it can serve as a staging area for data
// exchange between host and device. Locking too much memory would reduce
// the amount of memory available to the system for paging. So, by default,
// we should use CPUAllocator<lock_memory=false>.
// Host allocator built on std::malloc. The result is returned as a
// CPUDeleter, which both exposes the pointer (Ptr()) and knows how to
// release it. When lock_memory is true the buffer is also passed to mlock.
template <bool lock_memory>
class CPUAllocator {
public:
  // Allocates `size` bytes.
  //
  // The returned deleter's Ptr() is nullptr when `size` is 0 or when
  // std::malloc fails. Its locked flag is set only if mlock was requested
  // and actually succeeded.
  static CPUDeleter Alloc(size_t size) {
    // malloc(0) may return either nullptr or a unique non-null pointer;
    // normalize to nullptr so a zero-byte request behaves the same everywhere.
    void* p = (size == 0) ? nullptr : std::malloc(size);

    // Do not record the buffer as locked when mlock failed (for example
    // because of RLIMIT_MEMLOCK), so the deleter will not munlock it.
    const bool locked = lock_memory && p != nullptr && mlock(p, size) == 0;

    return CPUDeleter(p, size, locked);
  }
};
#ifndef PADDLE_ONLY_CPU // The following code is for CUDA.
namespace {

// Throws thrust::system_error (cuda_category) carrying `message` when `e`
// is non-zero; does nothing otherwise.
inline void throw_on_error(cudaError_t e, const char* message) {
  if (!e) {
    return;
  }
  throw thrust::system_error(e, thrust::cuda_category(), message);
}

}  // namespace
// GPUAllocator<staging=true> calls cudaMallocHost, which returns
// pinned and locked host memory used as a staging area for data
// exchange between host and device. Allocating too much would reduce
// the amount of memory available to the system for paging. So, by
// default, we should use GPUAllocator<staging=false>.
template
<
bool
staging
>
class
GPUAllocator
{
class
GPUDeleter
{
public:
void
*
Alloc
(
size_t
size
);
void
Free
(
void
*
p
,
size_t
size
);
};
GPUDeleter
(
void
*
ptr
,
size_t
size
,
bool
staging
)
:
ptr_
(
ptr
),
size_
(
size
),
staging_
(
staging
)
{}
template
<
>
class
GPUAllocator
<
false
>
{
public:
void
*
Alloc
(
size_t
size
)
{
void
*
p
=
0
;
cudaError_t
result
=
cudaMalloc
(
&
p
,
size
);
if
(
result
==
cudaSuccess
)
{
return
p
;
}
// clear last error
cudaGetLastError
();
return
nullptr
;
}
void
*
Ptr
()
{
return
ptr_
;
}
void
Free
(
void
*
p
,
size_t
size
)
{
void
operator
()(
void
*
ptr
)
{
PADDLE_ASSERT
(
ptr
==
ptr_
);
// Purposefully allow cudaErrorCudartUnloading, because
// that is returned if you ever call cudaFree after the
// driver has already shutdown. This happens only if the
// process is terminating, in which case we don't care if
// cudaFree succeeds.
auto
err
=
cudaFree
(
p
);
cudaError_t
err
=
staging_
?
cudaFreeHost
(
ptr
)
:
cudaFree
(
ptr
);
if
(
err
!=
cudaErrorCudartUnloading
)
{
throw_on_error
(
err
,
"cudaFree failed"
);
throw_on_error
(
err
,
"cudaFree
{Host}
failed"
);
}
}
private:
void
*
ptr_
;
size_t
size_
;
bool
staging_
;
};
template
<
>
class
GPUAllocator
<
true
>
{
// GPUAllocator<staging=true> calls cudaMallocHost, which returns
// pinned and locked host memory used as a staging area for data
// exchange between host and device. Allocating too much would reduce
// the amount of memory available to the system for paging. So, by
// default, we should use GPUAllocator<staging=false>.
template
<
bool
staging
>
class
GPUAllocator
{
public:
void
*
Alloc
(
size_t
size
)
{
static
GPUDeleter
Alloc
(
size_t
size
)
{
void
*
p
=
0
;
cudaError_t
result
=
cudaMallocHost
(
&
p
,
size
);
if
(
result
==
cudaSuccess
)
{
return
p
;
cudaError_t
result
=
staging
?
cudaMallocHost
(
&
p
,
size
)
:
cudaMalloc
(
&
p
,
size
);
if
(
result
!=
cudaSuccess
)
{
cudaGetLastError
();
// clear error if there is any.
}
// clear last error
cudaGetLastError
();
return
nullptr
;
}
void
Free
(
void
*
p
,
size_t
size
)
{
throw_on_error
(
cudaFreeHost
(
p
),
"cudaFreeHost failed"
);
return
GPUDeleter
(
result
==
cudaSuccess
?
p
:
nullptr
,
size
,
staging
);
}
};
#endif // PADDLE_ONLY_CPU
}
// namespace detail
}
// namespace memory
}
// namespace paddle
paddle/memory/detail/
gpu
_allocator_test.cc
→
paddle/memory/detail/
system
_allocator_test.cc
浏览文件 @
09d9794c
...
...
@@ -12,19 +12,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/memory/detail/gpu_allocator.h"
#include "paddle/memory/detail/system_allocator.h"
#include <memory>
#include <vector>
#include "gtest/gtest.h"
TEST
(
GPUAllocator
,
NonStaging
)
{
paddle
::
memory
::
detail
::
GPUAllocator
<
false
>
a
;
void
*
p
=
a
.
Alloc
(
4096
);
EXPECT_NE
(
p
,
nullptr
);
a
.
Free
(
p
,
4096
);
// Exercises Allocator::Alloc with a non-empty and an empty request.
//
// Allocator must provide a static Alloc(size_t) whose result exposes Ptr()
// and is callable on that pointer (it is used as the unique_ptr deleter).
template <typename Allocator>
void TestAllocator() {
  {
    auto d = Allocator::Alloc(sizeof(int));
    EXPECT_NE(d.Ptr(), nullptr);
    // The deleter type has to be named explicitly: std::unique_ptr<int>
    // uses std::default_delete<int> and cannot be constructed from `d`.
    std::unique_ptr<int, decltype(d)> p(static_cast<int*>(d.Ptr()), d);
  }
  {
    // A zero-byte request is expected to produce a null pointer; unique_ptr
    // does not invoke its deleter for a null pointer.
    auto d = Allocator::Alloc(0);
    EXPECT_EQ(d.Ptr(), nullptr);
    std::unique_ptr<int, decltype(d)> p(static_cast<int*>(d.Ptr()), d);
  }
}
// Runs the shared allocator checks against CPUAllocator<false>.
TEST(CPUAllocator, NoLockMem) {
  using Allocator = paddle::memory::detail::CPUAllocator<false>;
  TestAllocator<Allocator>();
}
// Runs the shared allocator checks against CPUAllocator<true>.
TEST(CPUAllocator, LockMem) {
  using Allocator = paddle::memory::detail::CPUAllocator<true>;
  TestAllocator<Allocator>();
}
#ifndef PADDLE_ONLY_CPU
// Runs the shared allocator checks against GPUAllocator<false>.
TEST(GPUAllocator, NoStaging) {
  using Allocator = paddle::memory::detail::GPUAllocator<false>;
  TestAllocator<Allocator>();
}
TEST
(
GPUAllocator
,
Staging
)
{
paddle
::
memory
::
detail
::
GPUAllocator
<
true
>
a
;
void
*
p
=
a
.
Alloc
(
4096
);
EXPECT_NE
(
p
,
nullptr
);
a
.
Free
(
p
,
4096
);
TestAllocator
<
paddle
::
memory
::
detail
::
GPUAllocator
<
true
>>
();
}
#endif // PADDLE_ONLY_CPU
paddle/memory/memory.cc
浏览文件 @
09d9794c
...
...
@@ -14,48 +14,41 @@ limitations under the License. */
#include "paddle/memory/memory.h"
#include "paddle/memory/detail/cpu_allocator.h"
#include "paddle/memory/detail/gpu_allocator.h"
namespace
paddle
{
namespace
memory
{
template
<
>
void
*
Alloc
<
CPUPlace
>
(
CPUPlace
,
size_t
size
)
{
return
GetCPUBuddyAllocator
(
false
/*non-staging*/
)
->
Alloc
(
size
);
}
void
*
AllocStaging
(
CPUPlace
,
size_t
size
)
{
return
GetCPUBuddyAllocator
(
true
/*staging*/
)
->
Alloc
(
size
);
}
template
<
>
void
*
Alloc
<
GPUPlace
>
(
GPUPlace
pl
,
size_t
size
)
{
return
GetGPUBuddyAllocator
(
pl
.
device
)
->
Alloc
(
size
);
}
template
<
>
void
Free
<
CPUPlace
>
(
CPUPlace
,
void
*
p
)
{
return
GetCPUBuddyAllocator
(
false
/*non-staging*/
)
->
Free
(
p
);
}
void
FreeStaging
(
CPUPlace
,
void
*
p
)
{
return
GetCPUBuddyAllocator
(
false
/*non-staging*/
)
->
Free
(
p
);
}
#ifdef PADDLE_WITH_GPU
template
<
>
void
*
Alloc
<
GPUPlace
>
(
GPUPlace
pl
,
void
*
p
)
{
return
GetGPUBuddyAllocator
(
pl
.
device
)
->
Free
(
p
);
}
template
<
>
size_t
Used
<
CPUPlace
>
(
CPUPlace
)
{
void
Alloc
(
paddle
::
platform
::
Place
pl
,
size_t
size
)
{
#ifndef PADDLE_ONLY_CPU
if
(
paddle
::
platform
::
is_gpu_place
(
pl
))
{
return
GetGPUBuddyAllocator
(
pl
.
device
)
->
Alloc
(
size
);
}
#endif // PADDLE_ONLY_CPU
PADDLE_ASSERT
(
paddle
::
platform
::
is_cpu_place
(
pl
));
return
GetCPUBuddyAllocator
()
->
Alloc
(
size
);
}
// Returns `p` to the buddy allocator that belongs to place `pl`.
//
// In builds without PADDLE_ONLY_CPU a GPU place is routed to
// GetGPUBuddyAllocator(pl.device); every other place must be a CPU place
// (asserted) and is routed to GetCPUBuddyAllocator().
void Free(paddle::platform::Place pl, void* p) {
#ifndef PADDLE_ONLY_CPU
  if (paddle::platform::is_gpu_place(pl)) {
    GetGPUBuddyAllocator(pl.device)->Free(p);
    // Stop here: without this return the GPU pointer would fall through to
    // the CPU-place assertion and be freed a second time by the CPU
    // allocator.
    return;
  }
#endif  // PADDLE_ONLY_CPU
  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
  GetCPUBuddyAllocator()->Free(p);
}
// Reports Used() of the buddy allocator that belongs to place `pl`: the GPU
// allocator for pl.device when `pl` is a GPU place (only in builds without
// PADDLE_ONLY_CPU), otherwise the CPU allocator after asserting that `pl`
// is a CPU place.
size_t Used(paddle::platform::Place pl) {
#ifndef PADDLE_ONLY_CPU
  const bool on_gpu = paddle::platform::is_gpu_place(pl);
  if (on_gpu) {
    return GetGPUBuddyAllocator(pl.device)->Used();
  }
#endif  // PADDLE_ONLY_CPU
  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
  return GetCPUBuddyAllocator()->Used();
}
template
<
>
size_t
Alloc
<
GPUPlace
>
(
GPUPlace
pl
)
{
return
GetGPUBuddyAllocator
(
pl
.
device
)
->
Used
();
}
#endif // PADDLE_WITH_GPU
}
// namespace memory
}
// namespace paddle
paddle/memory/memory.h
浏览文件 @
09d9794c
...
...
@@ -19,19 +19,9 @@ limitations under the License. */
namespace
paddle
{
namespace
memory
{
template
<
typename
paddle
::
framework
::
Place
>
void
*
Alloc
(
Place
,
size_t
);
template
<
typename
paddle
::
framework
::
Place
>
void
Free
(
Place
,
void
*
);
template
<
typename
paddle
::
framework
::
Place
>
size_t
Used
(
Place
);
// Staging memory means "pinned" host memory that can be mapped into
// the CUDA memory space and accessed by the device rapidly. Don't
// allocate too much staging memory; otherwise system performance will
// degrade because pinned pages cannot be swapped out, which leaves less
// pageable memory for the rest of the system.
void
*
AllocStaging
(
CPUPlace
,
size_t
);
void
*
FreeStaging
(
CPUPlace
,
size_t
);
void
*
Alloc
(
paddle
::
framework
::
Place
,
size_t
);
void
Free
(
paddle
::
framework
::
Place
,
void
*
);
size_t
Used
(
paddle
::
framework
::
Place
);
}
// namespace memory
}
// namespace paddle
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录