Commit 573ab04c
Authored Aug 16, 2017 by Luo Tao

Merge branch 'develop' into warpctc

Parents: 79a336b7, efdb4aa2

Showing 14 changed files with 119 additions and 49 deletions (+119, −49):
.clang_format.hook                       +15   −0
.pre-commit-config.yaml                   +2   −2
Dockerfile                                +0  −14
cmake/flags.cmake                         +1   −8
doc/design/mkldnn/README.MD               +1   −0
paddle/framework/CMakeLists.txt           +1   −1
paddle/framework/backward.cc             +17   −0
paddle/memory/CMakeLists.txt              +1   −1
paddle/memory/memcpy.cc                   +0   −2
paddle/operators/recurrent_op.h           +2   −2
paddle/platform/CMakeLists.txt            +4   −1
paddle/platform/device_context.cc        +65  −14
paddle/platform/device_context.h          +9   −4
paddle/platform/device_context_test.cc    +1   −0
.clang_format.hook (new file, mode 100755)

+#!/bin/bash
+set -e
+
+readonly VERSION="3.8"
+
+version=$(clang-format -version)
+
+if ! [[ $version == *"$VERSION"* ]]; then
+    echo "clang-format version check failed."
+    echo "a version contains '$VERSION' is needed, but get '$version'"
+    echo "you can install the right version, and make an soft-link to '\$PATH' env"
+    exit -1
+fi
+
+clang-format $@
.pre-commit-config.yaml

@@ -19,10 +19,10 @@
 -   id: end-of-file-fixer
 -   repo: local
     hooks:
-    -   id: clang-format
+    -   id: clang-format-with-version-check
         name: clang-format
         description: Format files with ClangFormat.
-        entry: clang-format -i
+        entry: ./.clang_format.hook -i
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
Dockerfile

@@ -71,20 +71,6 @@ RUN pip install -r /root/requirements.txt
 RUN apt-get install -y libssl-dev libffi-dev
 RUN pip install certifi urllib3[secure]
 
-# TODO(qijun) The template library Eigen doesn't work well with GCC 5
-# coming with the default Docker image, so we switch to use GCC 4.8
-# by default. And I will check Eigen library later.
-RUN ln -sf gcc-4.8 /usr/bin/gcc && \
-    ln -sf gcc-ar-4.8 /usr/bin/gcc-ar && \
-    ln -sf gcc-nm-4.8 /usr/bin/gcc-nm && \
-    ln -sf gcc-ranlib-4.8 /usr/bin/gcc-ranlib && \
-    ln -sf gcc-4.8 /usr/bin/x86_64-linux-gnu-gcc && \
-    ln -sf gcc-ar-4.8 /usr/bin/x86_64-linux-gnu-gcc-ar && \
-    ln -sf gcc-nm-4.8 /usr/bin/x86_64-linux-gnu-gcc-nm && \
-    ln -sf gcc-ranlib-4.8 /usr/bin/x86_64-linux-gnu-gcc-ranlib && \
-    ln -sf g++-4.8 /usr/bin/g++ && \
-    ln -sf g++-4.8 /usr/bin/x86_64-linux-gnu-g++
-
 # Install woboq_codebrowser to /woboq
 RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
cmake/flags.cmake

@@ -9,13 +9,6 @@ function(CheckCompilerCXX11Flag)
     if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
       message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
     endif()
-    if(NOT ANDROID)
-      # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
-      # Use Debug mode instead for now.
-      if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9)
-        set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
-      endif()
-    endif()
   elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
     # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
     # Apple Clang is a different compiler than upstream Clang which havs different version numbers.

@@ -160,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread)
+LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
 LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
doc/design/mkldnn/README.MD

@@ -101,6 +101,7 @@ if use_mkldnn
 5. Add two `MkldnnMatrixPtr` members to **Argument**, named `mkldnnValue` and `mkldnnGrad`, to hold the memory buffers used by `MkldnnLayer`. Also add a cvt function (it will be renamed to something more suitable) that converts memory between the "CPU device" and the "MKL-DNN device".
 6. In the parent class `Layer`'s `getOutput` function, add logic that checks `deviceId` and, when the MKL-DNN and CPU devices do not match, performs an up-front conversion, i.e. calls `Argument`'s cvt function to bring the output onto the required device.
 7. Add a `use_mkldnn` flag to the existing `FLAGS` to select whether the MKL-DNN functionality is used.
+8. Saving MKL-DNN parameters: because the MKL-DNN parameter format can differ from PaddlePaddle's original format, that format information has to be saved together with the parameters. The plan is to extend the `int32_t version` field in [Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247): this value has always been 0 in both v1 and v2, so it can be reused to define an enum covering all MKL-DNN parameter formats, which lets `MKLDNNLayer` obtain the required format information from the input parameters.

 ## References
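For illustration only, a minimal C++ sketch of the format-tagging idea in item 8. The enum, its values, and the helper are hypothetical; the commit only proposes reusing the `int32_t version` field, it does not define these names:

```cpp
#include <cstdint>

// Hypothetical parameter-format tags stored in the Header's int32_t version
// field. 0 stays reserved for the layout PaddlePaddle has always written.
enum ParamFormat : int32_t {
  kParamOriginal = 0,  // existing v1/v2 layout
  kParamMkldnnOI = 1,  // example MKL-DNN weight layout (illustrative value)
};

// An MKL-DNN layer could branch on the stored tag to decide whether the
// weights need converting from the original layout before use.
inline bool NeedsMkldnnConversion(int32_t version) {
  return version != kParamOriginal;
}
```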
paddle/framework/CMakeLists.txt

@@ -38,7 +38,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 cc_library(backward SRCS backward.cc DEPS net_op)
-cc_test(backward_test SRCS backward_test.cc DEPS backward)
+cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 
 if(WITH_PYTHON)
     cc_library(paddle_pybind SHARED
paddle/framework/backward.cc

@@ -17,6 +17,7 @@
 #include <list>
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
+#include "paddle/operators/recurrent_op.h"
 
 namespace paddle {
 namespace framework {

@@ -178,6 +179,22 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
         return false;
       });
 
+  // process recurrent gradient op as a special operator.
+  if (forwardOp.Type() == "recurrent_op") {
+    // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), or
+    // this will result in infinite loop.
+    const auto& rnnop =
+        *static_cast<const operators::RecurrentOp*>(&forwardOp);
+    auto rnn_grad_op =
+        static_cast<operators::RecurrentGradientOp*>(grad_op.get());
+    const auto& stepnet_op =
+        *static_cast<const OperatorBase*>(&rnnop.stepnet());
+    // create stepnet's gradient op
+    auto grad_stepnet = BackwardRecursive(stepnet_op, no_grad_names, uniq_id);
+    rnn_grad_op->set_stepnet(
+        std::static_pointer_cast<operators::NetOp>(grad_stepnet));
+  }
+
   if (net->ops_.empty()) {  // Current no aux op is added to network
     return grad_op;
   }
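The NOTE in the hunk above warns about a cycle: if the RNN's step network ever ends up containing itself, recursing over it never terminates and the shared_ptr chain never releases. A small self-contained sketch of that failure mode, using illustrative types rather than the real Paddle operators:

```cpp
#include <memory>
#include <vector>

struct Node {
  std::vector<std::shared_ptr<Node>> children;
};

// Walks the tree; never returns if a node is its own descendant.
int CountOps(const Node& n) {
  int total = 1;
  for (const auto& child : n.children) total += CountOps(*child);
  return total;
}

int main() {
  auto stepnet = std::make_shared<Node>();
  stepnet->children.push_back(stepnet);  // the "stepnet contains itself" case
  // CountOps(*stepnet);                 // would recurse forever; the cycle also
                                         // keeps use_count above zero, so the
                                         // node is never freed
  return 0;
}
```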
paddle/memory/CMakeLists.txt

 add_subdirectory(detail)
 
 cc_library(memory SRCS memory.cc)
-cc_library(memcpy SRCS memcpy.cc DEPS device_context)
+cc_library(memcpy SRCS memcpy.cc)
 
 cc_library(paddle_memory
     DEPS
paddle/memory/memcpy.cc

@@ -16,8 +16,6 @@ limitations under the License. */
 
 #include <cstring>  // for memcpy
 
-#include "paddle/platform/device_context.h"
-
 namespace paddle {
 namespace memory {
paddle/operators/recurrent_op.h

@@ -127,7 +127,7 @@ class RecurrentOp final : public framework::OperatorBase {
   }
 
   void set_stepnet(std::shared_ptr<NetOp> net) { stepnet_ = net; }
-  const NetOp* stepnet() const { return stepnet_.get(); }
+  const NetOp& stepnet() const { return *stepnet_; }
 
   static const rnn::ArgumentName kArgName;

@@ -158,7 +158,7 @@ class RecurrentGradientOp final : public framework::OperatorBase {
   static const rnn::ArgumentName kArgName;
 
   void set_stepnet(const std::shared_ptr<NetOp>& net) { stepnet_ = net; }
-  const NetOp* stepnet() const { return stepnet_.get(); }
+  const NetOp& stepnet() const { return *stepnet_; }
 
  private:
   RecurrentGradientAlgorithm alg_;
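The accessor change above (raw pointer to const reference) moves null-handling out of the callers; a rough caller-side sketch, where Use() stands in for arbitrary caller code and is not part of this commit:

```cpp
#include "paddle/operators/recurrent_op.h"

// Illustrative caller, not code from this commit.
void Use(const paddle::operators::NetOp& net);

void Caller(const paddle::operators::RecurrentOp& op) {
  // before: const NetOp* net = op.stepnet(); if (net != nullptr) Use(*net);
  // after:  the reference form assumes set_stepnet() was already called.
  Use(op.stepnet());
}
```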
paddle/platform/CMakeLists.txt

@@ -16,5 +16,8 @@ ELSE()
     set(GPU_CTX_DEPS)
 ENDIF()
 
-cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS})
+# memcpy deoends on device_context, here add deps individually for
+# avoiding cycle dependencies
+cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
+    system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
paddle/platform/device_context.cc

@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/platform/device_context.h"
+#include "paddle/memory/memory.h"
 
 namespace paddle {
 namespace platform {

@@ -36,6 +37,59 @@ Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
 #ifndef PADDLE_ONLY_CPU
 
+class EigenCudaStreamDevice : public Eigen::StreamInterface {
+ public:
+  EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
+    Eigen::initializeDeviceProp();
+  }
+  ~EigenCudaStreamDevice() override {}
+
+  void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) {
+    stream_ = cuda_stream;
+    place_ = place;
+    device_prop_ = &Eigen::m_deviceProperties[place.device];
+  }
+
+  const cudaStream_t& stream() const override { return *stream_; }
+
+  const cudaDeviceProp& deviceProperties() const override {
+    return *device_prop_;
+  }
+
+  void* allocate(size_t num_bytes) const override {
+    return paddle::memory::Alloc(place_, num_bytes);
+  }
+
+  void deallocate(void* buffer) const override {
+    paddle::memory::Free(place_, buffer);
+  }
+
+  void* scratchpad() const override {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int));
+    }
+    return scratch_;
+  }
+
+  unsigned int* semaphore() const override {
+    if (semaphore_ == NULL) {
+      char* scratch =
+          static_cast<char*>(scratchpad()) + Eigen::kCudaScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      PADDLE_ENFORCE(
+          cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
+    }
+    return semaphore_;
+  }
+
+ private:
+  GPUPlace place_;
+  const cudaStream_t* stream_;         // not owned;
+  const cudaDeviceProp* device_prop_;  // not owned;
+  mutable void* scratch_;
+  mutable unsigned int* semaphore_;
+};
+
 template <>
 Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
   return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();

@@ -43,19 +97,9 @@ Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
 CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
   SetDeviceId(place_.device);
-  // TODO(qijun) Pass a created cuda stream to Eigen::CudaStreamDevice directly
-  // here will cause segment fault. We must implement a class derived from
-  // Eigen::StreamInterface, and reinitialize it with a cuda stream and a gpu id
-  // later. Please refer to the implementation of class EigenCudaStreamDevice
-  // in TensorFlow.
-  //
-  // We find that CUDA 7 introduces a new option, the per-thread default stream,
-  // that has two effects. Please refer to https://devblogs.nvidia.com/
-  // parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
-  //
-  // So, we decide to use default stream and add --default-stream per-thread nvcc
-  // flag. Than, two threads with two CUDADeviceContexts will run parallelly.
-  eigen_stream_.reset(new Eigen::CudaStreamDevice());
+  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+  eigen_stream_.reset(new EigenCudaStreamDevice());
+  eigen_stream_->Reinitialize(&stream_, place);
   eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
 }

@@ -75,12 +119,13 @@ CUDADeviceContext::~CUDADeviceContext() {
   }
   eigen_stream_.reset();
   eigen_device_.reset();
+  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
 }
 
 Place CUDADeviceContext::GetPlace() const { return place_; }
 
 void CUDADeviceContext::Wait() const {
-  PADDLE_ENFORCE(cudaStreamSynchronize(0));
+  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
 }
 
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {

@@ -91,6 +136,7 @@ cublasHandle_t CUDADeviceContext::cublas_handle() {
   if (!cublas_handle_) {
     SetDeviceId(place_.device);
     PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
+    PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
   }
   return cublas_handle_;
 }

@@ -99,10 +145,13 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() {
   if (!cudnn_handle_) {
     SetDeviceId(place_.device);
     PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
   }
   return cudnn_handle_;
 }
 
+cudaStream_t CUDADeviceContext::stream() { return stream_; }
+
 curandGenerator_t CUDADeviceContext::curand_generator() {
   if (!curand_generator_) {
     SetDeviceId(place_.device);

@@ -110,6 +159,8 @@ curandGenerator_t CUDADeviceContext::curand_generator() {
         CURAND_RNG_PSEUDO_DEFAULT));
     PADDLE_ENFORCE(
         dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_));
+
+    PADDLE_ENFORCE(dynload::curandSetStream(curand_generator_, stream_));
   }
   return curand_generator_;
 }
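A rough usage sketch of what this buys: Eigen expressions evaluated on the context's GpuDevice are queued on the stream created in the constructor above, and temporaries go through paddle::memory via EigenCudaStreamDevice::allocate(). The function name and the assumption that x, y, z are device pointers on the context's GPU are illustrative, not part of this commit; it also assumes a CUDA build with Eigen's unsupported Tensor module:

```cpp
#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

#include "paddle/platform/device_context.h"

// x, y, z are assumed to be device pointers of length n on ctx's GPU.
void AddOnContextStream(float* x, float* y, float* z, int n,
                        paddle::platform::CUDADeviceContext* ctx) {
  Eigen::TensorMap<Eigen::Tensor<float, 1>> a(x, n), b(y, n), c(z, n);
  // Evaluated asynchronously on ctx->stream(); scratch/semaphore memory is
  // obtained through EigenCudaStreamDevice's allocate()/scratchpad().
  c.device(*ctx->eigen_device()) = a + b;
}
```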
paddle/platform/device_context.h

@@ -52,6 +52,7 @@ class CPUDeviceContext : public DeviceContext {
 };
 
 #ifndef PADDLE_ONLY_CPU
+class EigenCudaStreamDevice;
 
 class CUDADeviceContext : public DeviceContext {
  public:

@@ -76,6 +77,9 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief Return curand handle in the device context. */
   curandGenerator_t curand_generator();
+
+  /*! \brief Return cuda stream in the device context. */
+  cudaStream_t stream();
   // clang-format on
 
  private:

@@ -83,15 +87,16 @@ class CUDADeviceContext : public DeviceContext {
  private:
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
-  std::unique_ptr<Eigen::CudaStreamDevice> eigen_stream_;
+  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
 
  private:
   uint64_t seed_;
 
   // clang-format off
-  cudnnHandle_t cudnn_handle_ = nullptr;
-  cublasHandle_t cublas_handle_ = nullptr;
-  curandGenerator_t curand_generator_ = nullptr;
+  cudaStream_t stream_{nullptr};
+  cudnnHandle_t cudnn_handle_{nullptr};
+  cublasHandle_t cublas_handle_{nullptr};
+  curandGenerator_t curand_generator_{nullptr};
   // clang-format on
 };
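With every handle in a context now bound to the same stream_, one Wait() is enough to make all work queued through that context visible to the host. A minimal sketch under that assumption (the function name is illustrative and assumes a GPU build):

```cpp
#include "paddle/platform/device_context.h"

void FlushAll(paddle::platform::CUDADeviceContext* ctx) {
  // ... enqueue work through ctx->cublas_handle(), ctx->cudnn_handle(),
  // ctx->curand_generator(), or ctx->eigen_device() ...
  ctx->Wait();  // synchronizes ctx->stream(), not the legacy default stream
}
```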
paddle/platform/device_context_test.cc

@@ -45,6 +45,7 @@ TEST(Device, CUDADeviceContext) {
   ASSERT_NE(nullptr, cublas_handle);
   curandGenerator_t curand_handle = device_context->curand_generator();
   ASSERT_NE(nullptr, curand_handle);
+  ASSERT_NE(nullptr, device_context->stream());
   delete device_context;
 }
 }