Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
48324c32
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
48324c32
编写于
12月 29, 2018
作者:
S
sneaxiy
浏览文件
操作
浏览文件
下载
差异文件
merge develop
test=develop
上级
8a83d699
10bedbde
变更
44
展开全部
隐藏空白更改
内联
并排
Showing
44 changed file
with
1416 addition
and
485 deletion
+1416
-485
cmake/cuda.cmake
cmake/cuda.cmake
+3
-0
cmake/cudnn.cmake
cmake/cudnn.cmake
+1
-0
cmake/external/cub.cmake
cmake/external/cub.cmake
+1
-1
cmake/external/dlpack.cmake
cmake/external/dlpack.cmake
+1
-1
cmake/operators.cmake
cmake/operators.cmake
+1
-1
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+4
-4
paddle/fluid/framework/async_executor.cc
paddle/fluid/framework/async_executor.cc
+7
-2
paddle/fluid/framework/details/all_reduce_op_handle.cc
paddle/fluid/framework/details/all_reduce_op_handle.cc
+1
-1
paddle/fluid/framework/details/execution_strategy.h
paddle/fluid/framework/details/execution_strategy.h
+1
-1
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
...id/framework/details/scope_buffered_ssa_graph_executor.cc
+13
-7
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
...uid/framework/details/scope_buffered_ssa_graph_executor.h
+8
-0
paddle/fluid/framework/executor_thread_worker.cc
paddle/fluid/framework/executor_thread_worker.cc
+53
-0
paddle/fluid/framework/executor_thread_worker.h
paddle/fluid/framework/executor_thread_worker.h
+2
-0
paddle/fluid/framework/rw_lock.h
paddle/fluid/framework/rw_lock.h
+35
-68
paddle/fluid/framework/scope.cc
paddle/fluid/framework/scope.cc
+31
-20
paddle/fluid/framework/scope.h
paddle/fluid/framework/scope.h
+17
-3
paddle/fluid/operators/conv_cudnn_op_cache.h
paddle/fluid/operators/conv_cudnn_op_cache.h
+34
-0
paddle/fluid/operators/conv_fusion_op.cc
paddle/fluid/operators/conv_fusion_op.cc
+61
-1
paddle/fluid/operators/conv_fusion_op.cu.cc
paddle/fluid/operators/conv_fusion_op.cu.cc
+72
-31
paddle/fluid/operators/distributed/collective_server_test.cc
paddle/fluid/operators/distributed/collective_server_test.cc
+3
-2
paddle/fluid/operators/fused/CMakeLists.txt
paddle/fluid/operators/fused/CMakeLists.txt
+3
-1
paddle/fluid/operators/fused/fusion_conv_inception_op.cc
paddle/fluid/operators/fused/fusion_conv_inception_op.cc
+110
-0
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+272
-0
paddle/fluid/platform/CMakeLists.txt
paddle/fluid/platform/CMakeLists.txt
+3
-0
paddle/fluid/platform/dynload/cudnn.cc
paddle/fluid/platform/dynload/cudnn.cc
+4
-0
paddle/fluid/platform/dynload/dynamic_loader.cc
paddle/fluid/platform/dynload/dynamic_loader.cc
+12
-0
paddle/fluid/platform/timer.cc
paddle/fluid/platform/timer.cc
+63
-0
paddle/fluid/platform/timer.h
paddle/fluid/platform/timer.h
+61
-0
paddle/fluid/platform/timer_test.cc
paddle/fluid/platform/timer_test.cc
+45
-0
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+7
-3
paddle/testing/paddle_gtest_main.cc
paddle/testing/paddle_gtest_main.cc
+43
-10
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+12
-3
python/paddle/fluid/data_feeder.py
python/paddle/fluid/data_feeder.py
+1
-2
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+10
-14
python/paddle/fluid/layers/control_flow.py
python/paddle/fluid/layers/control_flow.py
+5
-4
python/paddle/fluid/layers/detection.py
python/paddle/fluid/layers/detection.py
+62
-58
python/paddle/fluid/layers/io.py
python/paddle/fluid/layers/io.py
+5
-6
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+268
-213
python/paddle/fluid/layers/tensor.py
python/paddle/fluid/layers/tensor.py
+8
-3
python/paddle/fluid/metrics.py
python/paddle/fluid/metrics.py
+14
-8
python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
+33
-8
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+8
-0
python/paddle/fluid/tests/unittests/testsuite.py
python/paddle/fluid/tests/unittests/testsuite.py
+2
-2
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+16
-7
未找到文件。
cmake/cuda.cmake
浏览文件 @
48324c32
...
...
@@ -139,10 +139,12 @@ endfunction()
message
(
STATUS
"CUDA detected: "
${
CUDA_VERSION
}
)
if
(
${
CUDA_VERSION
}
LESS 7.0
)
set
(
paddle_known_gpu_archs
${
paddle_known_gpu_archs
}
)
add_definitions
(
"-DPADDLE_CUDA_BINVER=
\"
60
\"
"
)
elseif
(
${
CUDA_VERSION
}
LESS 8.0
)
# CUDA 7.x
set
(
paddle_known_gpu_archs
${
paddle_known_gpu_archs7
}
)
list
(
APPEND CUDA_NVCC_FLAGS
"-D_MWAITXINTRIN_H_INCLUDED"
)
list
(
APPEND CUDA_NVCC_FLAGS
"-D__STRICT_ANSI__"
)
add_definitions
(
"-DPADDLE_CUDA_BINVER=
\"
70
\"
"
)
elseif
(
${
CUDA_VERSION
}
LESS 9.0
)
# CUDA 8.x
set
(
paddle_known_gpu_archs
${
paddle_known_gpu_archs8
}
)
list
(
APPEND CUDA_NVCC_FLAGS
"-D_MWAITXINTRIN_H_INCLUDED"
)
...
...
@@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now.
list
(
APPEND CUDA_NVCC_FLAGS
"-Wno-deprecated-gpu-targets"
)
add_definitions
(
"-DPADDLE_CUDA_BINVER=
\"
80
\"
"
)
endif
()
include_directories
(
${
CUDA_INCLUDE_DIRS
}
)
...
...
cmake/cudnn.cmake
浏览文件 @
48324c32
...
...
@@ -89,6 +89,7 @@ if(CUDNN_FOUND)
if
(
NOT CUDNN_MAJOR_VERSION
)
set
(
CUDNN_VERSION
"???"
)
else
()
add_definitions
(
"-DPADDLE_CUDNN_BINVER=
\"
${
CUDNN_MAJOR_VERSION
}
\"
"
)
math
(
EXPR CUDNN_VERSION
"
${
CUDNN_MAJOR_VERSION
}
* 1000 +
${
CUDNN_MINOR_VERSION
}
* 100 +
${
CUDNN_PATCHLEVEL_VERSION
}
"
)
...
...
cmake/external/cub.cmake
浏览文件 @
48324c32
...
...
@@ -32,4 +32,4 @@ endif()
add_dependencies
(
cub extern_cub
)
LIST
(
APPEND externl_project_dependencies cub
)
LIST
(
APPEND extern
a
l_project_dependencies cub
)
cmake/external/dlpack.cmake
浏览文件 @
48324c32
...
...
@@ -28,4 +28,4 @@ endif()
add_dependencies
(
dlpack extern_dlpack
)
LIST
(
APPEND externl_project_dependencies dlpack
)
LIST
(
APPEND extern
a
l_project_dependencies dlpack
)
cmake/operators.cmake
浏览文件 @
48324c32
...
...
@@ -110,7 +110,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here.
foreach
(
manual_pybind_op
"compare_op"
"logical_op"
"nccl_op"
"tensor_array_read_write_op"
"tensorrt_engine_op"
"conv_fusion_op"
"fusion_transpose_flatten_concat_op"
)
"fusion_transpose_flatten_concat_op"
"fusion_conv_inception_op"
)
if
(
"
${
TARGET
}
"
STREQUAL
"
${
manual_pybind_op
}
"
)
set
(
pybind_flag 1
)
endif
()
...
...
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
48324c32
...
...
@@ -72,13 +72,13 @@ cc_test(reader_test SRCS reader_test.cc DEPS reader)
cc_library
(
threadpool SRCS threadpool.cc DEPS enforce
)
cc_test
(
threadpool_test SRCS threadpool_test.cc DEPS threadpool
)
cc_library
(
var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto
)
cc_library
(
var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto
)
if
(
WITH_GPU
)
target_link_libraries
(
var_type_traits dynload_cuda
)
endif
()
cc_test
(
var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits
)
cc_library
(
scope SRCS scope.cc DEPS glog threadpool var_type_traits
)
cc_library
(
scope SRCS scope.cc DEPS glog threadpool
xxhash
var_type_traits
)
cc_library
(
scope_pool SRCS scope_pool.cc DEPS scope
)
cc_test
(
scope_test SRCS scope_test.cc DEPS scope
)
cc_test
(
variable_test SRCS variable_test.cc DEPS tensor var_type_traits
)
...
...
@@ -189,9 +189,9 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
fast_threaded_ssa_graph_executor variable_helper
)
if
(
WITH_PSLIB
)
cc_library
(
async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib
)
cc_library
(
async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib
timer
)
else
()
cc_library
(
async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper
)
cc_library
(
async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper
timer
)
endif
(
WITH_PSLIB
)
...
...
paddle/fluid/framework/async_executor.cc
浏览文件 @
48324c32
...
...
@@ -304,8 +304,13 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
// start executing ops in multiple threads
for
(
int
thidx
=
0
;
thidx
<
actual_thread_num
;
++
thidx
)
{
threads
.
push_back
(
std
::
thread
(
&
ExecutorThreadWorker
::
TrainFiles
,
workers
[
thidx
].
get
()));
if
(
debug
)
{
threads
.
push_back
(
std
::
thread
(
&
ExecutorThreadWorker
::
TrainFilesWithTimer
,
workers
[
thidx
].
get
()));
}
else
{
threads
.
push_back
(
std
::
thread
(
&
ExecutorThreadWorker
::
TrainFiles
,
workers
[
thidx
].
get
()));
}
}
for
(
auto
&
th
:
threads
)
{
...
...
paddle/fluid/framework/details/all_reduce_op_handle.cc
浏览文件 @
48324c32
...
...
@@ -50,7 +50,7 @@ void AllReduceOpHandle::RunImpl() {
// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
// this is a distributed or inter-process call, find a better way.
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if
(
NoDummyInputSize
()
==
1
&&
local_scopes_
[
0
]
->
FindLocalVar
(
NCCL_ID_VARNAME
)
==
nullptr
)
{
#else
...
...
paddle/fluid/framework/details/execution_strategy.h
浏览文件 @
48324c32
...
...
@@ -25,7 +25,7 @@ struct ExecutionStrategy {
size_t
num_threads_
{
0
};
bool
use_cuda_
{
true
};
bool
allow_op_delay_
{
false
};
size_t
num_iteration_per_drop_scope_
{
1
00
};
size_t
num_iteration_per_drop_scope_
{
1
};
ExecutorType
type_
{
kDefault
};
bool
dry_run_
{
false
};
};
...
...
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
浏览文件 @
48324c32
...
...
@@ -64,20 +64,26 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
}
platform
::
RecordEvent
e
(
"ScopeBufferedSSAGraphExecutorAfterRun"
,
nullptr
);
drop_scope_counter_
+=
1
;
++
drop_scope_counter_
;
if
(
!
fetch_tensors
.
empty
()
||
drop_scope_counter_
==
strategy_
.
num_iteration_per_drop_scope_
)
{
drop_scope_counter_
=
0
;
// Wait All computational streams
for
(
auto
p
:
places_
)
{
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
)
->
Wait
();
bool
stream_end
=
false
;
if
(
!
fetch_tensors
.
empty
())
{
WaitComputationalStreams
();
stream_end
=
true
;
}
if
(
drop_scope_counter_
==
strategy_
.
num_iteration_per_drop_scope_
)
{
if
(
!
stream_end
)
{
WaitComputationalStreams
();
}
for
(
auto
&
scope
:
local_scopes_
)
{
auto
&
local_scope
=
*
scope
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
();
scope
->
DeleteScope
(
local_scope
);
}
drop_scope_counter_
=
0
;
}
if
(
eptr
)
{
std
::
rethrow_exception
(
eptr
);
...
...
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
浏览文件 @
48324c32
...
...
@@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
FeedFetchList
Run
(
const
std
::
vector
<
std
::
string
>&
fetch_tensors
)
override
;
private:
inline
void
WaitComputationalStreams
()
{
// Wait All computational streams
for
(
auto
p
:
places_
)
{
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
)
->
Wait
();
}
}
private:
size_t
drop_scope_counter_
{
0
};
...
...
paddle/fluid/framework/executor_thread_worker.cc
浏览文件 @
48324c32
...
...
@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/pybind/pybind.h"
namespace
paddle
{
namespace
framework
{
...
...
@@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() {
return
;
#else
static
unsigned
concurrency_cap
=
std
::
thread
::
hardware_concurrency
();
LOG
(
WARNING
)
<<
"concurrency capacity "
<<
concurrency_cap
;
int
thread_id
=
this
->
thread_id_
;
if
(
static_cast
<
unsigned
>
(
thread_id
)
<
concurrency_cap
)
{
...
...
@@ -238,6 +240,55 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) {
VLOG
(
1
)
<<
"print_fetch_var: unrecognized data type:"
<<
tensor
.
type
();
}
void
ExecutorThreadWorker
::
TrainFilesWithTimer
()
{
platform
::
SetNumThreads
(
1
);
SetDevice
();
thread_reader_
->
Start
();
std
::
vector
<
double
>
op_total_time
;
std
::
vector
<
std
::
string
>
op_name
;
for
(
auto
&
op
:
ops_
)
{
op_name
.
push_back
(
op
->
Type
());
}
op_total_time
.
resize
(
ops_
.
size
());
for
(
size_t
i
=
0
;
i
<
op_total_time
.
size
();
++
i
)
{
op_total_time
[
i
]
=
0.0
;
}
platform
::
Timer
timeline
;
double
total_time
=
0.0
;
double
read_time
=
0.0
;
int
cur_batch
;
int
batch_cnt
=
0
;
timeline
.
Start
();
while
((
cur_batch
=
thread_reader_
->
Next
())
>
0
)
{
timeline
.
Pause
();
read_time
+=
timeline
.
ElapsedSec
();
total_time
+=
timeline
.
ElapsedSec
();
for
(
size_t
i
=
0
;
i
<
ops_
.
size
();
++
i
)
{
timeline
.
Start
();
ops_
[
i
]
->
Run
(
*
thread_scope_
,
place_
);
timeline
.
Pause
();
op_total_time
[
i
]
+=
timeline
.
ElapsedSec
();
total_time
+=
timeline
.
ElapsedSec
();
}
++
batch_cnt
;
thread_scope_
->
DropKids
();
if
(
thread_id_
==
0
)
{
if
(
batch_cnt
>
0
&&
batch_cnt
%
1000
==
0
)
{
for
(
size_t
i
=
0
;
i
<
ops_
.
size
();
++
i
)
{
fprintf
(
stderr
,
"op_name:[%zu][%s], op_mean_time:[%fs]
\n
"
,
i
,
op_name
[
i
].
c_str
(),
op_total_time
[
i
]
/
batch_cnt
);
}
fprintf
(
stderr
,
"mean read time: %fs
\n
"
,
read_time
/
batch_cnt
);
int
fetch_var_num
=
fetch_var_names_
.
size
();
for
(
int
i
=
0
;
i
<
fetch_var_num
;
++
i
)
{
print_fetch_var
(
thread_scope_
,
fetch_var_names_
[
i
]);
}
}
}
timeline
.
Start
();
}
}
void
ExecutorThreadWorker
::
TrainFiles
()
{
platform
::
SetNumThreads
(
1
);
...
...
@@ -320,10 +371,12 @@ void AsyncExecutorThreadWorker::SetPSlibPtr(
std
::
shared_ptr
<
paddle
::
distributed
::
PSlib
>
pslib_ptr
)
{
_pslib_ptr
=
pslib_ptr
;
}
void
AsyncExecutorThreadWorker
::
SetPullDenseThread
(
std
::
shared_ptr
<
DensePullThread
>
dpt
)
{
_pull_dense_thread
=
dpt
;
}
void
AsyncExecutorThreadWorker
::
TrainOneNetwork
()
{
PrepareParams
();
...
...
paddle/fluid/framework/executor_thread_worker.h
浏览文件 @
48324c32
...
...
@@ -155,6 +155,8 @@ class ExecutorThreadWorker {
void
SetDataFeed
(
const
std
::
shared_ptr
<
DataFeed
>&
datafeed
);
// A multi-thread training function
virtual
void
TrainFiles
();
// with timer log
virtual
void
TrainFilesWithTimer
();
// set fetch variable names from python interface assigned by users
void
SetFetchVarNames
(
const
std
::
vector
<
std
::
string
>&
fetch_var_names
);
#ifdef PADDLE_WITH_PSLIB
...
...
paddle/fluid/framework/rw_lock.h
浏览文件 @
48324c32
...
...
@@ -16,7 +16,9 @@ limitations under the License. */
#if !defined(_WIN32)
#include <pthread.h>
#endif // !_WIN32
#else
#include <mutex> // NOLINT
#endif // !_WIN32
#include "paddle/fluid/platform/enforce.h"
...
...
@@ -29,17 +31,17 @@ struct RWLock {
~
RWLock
()
{
pthread_rwlock_destroy
(
&
lock_
);
}
void
RDLock
()
{
inline
void
RDLock
()
{
PADDLE_ENFORCE_EQ
(
pthread_rwlock_rdlock
(
&
lock_
),
0
,
"acquire read lock failed"
);
}
void
WRLock
()
{
inline
void
WRLock
()
{
PADDLE_ENFORCE_EQ
(
pthread_rwlock_wrlock
(
&
lock_
),
0
,
"acquire write lock failed"
);
}
void
UNLock
()
{
inline
void
UNLock
()
{
PADDLE_ENFORCE_EQ
(
pthread_rwlock_unlock
(
&
lock_
),
0
,
"unlock failed"
);
}
...
...
@@ -51,81 +53,46 @@ struct RWLock {
// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
// In windows, rw_lock seems like a hack. Use empty object and do nothing.
struct
RWLock
{
void
RDLock
()
{}
void
WRLock
()
{}
void
UNLock
()
{}
// FIXME(minqiyang): use mutex here to do fake lock
inline
void
RDLock
()
{
mutex_
.
lock
();
}
inline
void
WRLock
()
{
mutex_
.
lock
();
}
inline
void
UNLock
()
{
mutex_
.
unlock
();
}
private:
std
::
mutex
mutex_
;
};
#endif
class
RWLockGuard
{
class
AutoWRLock
{
public:
enum
Status
{
kUnLock
,
kWRLock
,
kRDLock
};
RWLockGuard
(
RWLock
*
rw_lock
,
Status
init_status
)
:
lock_
(
rw_lock
),
status_
(
Status
::
kUnLock
)
{
switch
(
init_status
)
{
case
Status
::
kRDLock
:
{
RDLock
();
break
;
}
case
Status
::
kWRLock
:
{
WRLock
();
break
;
}
case
Status
::
kUnLock
:
{
break
;
}
}
}
explicit
AutoWRLock
(
RWLock
*
rw_lock
)
:
lock_
(
rw_lock
)
{
Lock
();
}
void
WRLock
()
{
switch
(
status_
)
{
case
Status
::
kUnLock
:
{
lock_
->
WRLock
();
status_
=
Status
::
kWRLock
;
break
;
}
case
Status
::
kWRLock
:
{
break
;
}
case
Status
::
kRDLock
:
{
PADDLE_THROW
(
"Please unlock read lock first before invoking write lock."
);
break
;
}
}
}
~
AutoWRLock
()
{
UnLock
();
}
void
RDLock
()
{
switch
(
status_
)
{
case
Status
::
kUnLock
:
{
lock_
->
RDLock
();
status_
=
Status
::
kRDLock
;
break
;
}
case
Status
::
kRDLock
:
{
break
;
}
case
Status
::
kWRLock
:
{
PADDLE_THROW
(
"Please unlock write lock first before invoking read lock."
);
break
;
}
}
}
private:
inline
void
Lock
()
{
lock_
->
WRLock
();
}
void
UnLock
()
{
if
(
status_
!=
Status
::
kUnLock
)
{
lock_
->
UNLock
();
status_
=
Status
::
kUnLock
;
}
}
inline
void
UnLock
()
{
lock_
->
UNLock
();
}
private:
RWLock
*
lock_
;
};
class
AutoRDLock
{
public:
explicit
AutoRDLock
(
RWLock
*
rw_lock
)
:
lock_
(
rw_lock
)
{
Lock
();
}
~
AutoRDLock
()
{
UnLock
();
}
private:
inline
void
Lock
()
{
lock_
->
RDLock
();
}
~
RWLockGuard
()
{
Un
Lock
();
}
inline
void
UnLock
()
{
lock_
->
UN
Lock
();
}
private:
RWLock
*
lock_
;
Status
status_
;
};
}
// namespace framework
...
...
paddle/fluid/framework/scope.cc
浏览文件 @
48324c32
...
...
@@ -47,9 +47,15 @@ DEFINE_bool(fast_eager_deletion_mode, false,
// the mutex will cause serious performance issue.
// So the mutex is disabled when `ON_INFER`.
#ifdef PADDLE_ON_INFERENCE
#define SCOPE_LOCK_GUARD
#define SCOPE_KIDS_READER_LOCK
#define SCOPE_KIDS_WRITER_LOCK
#define SCOPE_VARS_READER_LOCK
#define SCOPE_VARS_WRITER_LOCK
#else
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_);
#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_);
#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_);
#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_);
#endif
namespace
paddle
{
...
...
@@ -67,64 +73,69 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
Scope
::~
Scope
()
{
DropKids
();
}
Scope
&
Scope
::
NewScope
()
const
{
SCOPE_LOCK_GUARD
kids_
.
push_back
(
new
Scope
(
this
));
return
*
kids_
.
back
();
Scope
*
child
=
new
Scope
(
this
);
{
SCOPE_KIDS_WRITER_LOCK
kids_
.
push_back
(
child
);
}
return
*
child
;
}
Variable
*
Scope
::
Var
(
const
std
::
string
&
name
)
{
SCOPE_
LOCK_GUARD
SCOPE_
VARS_WRITER_LOCK
return
VarInternal
(
name
);
}
Variable
*
Scope
::
Var
(
std
::
string
*
name
)
{
SCOPE_LOCK_GUARD
auto
new_name
=
string
::
Sprintf
(
"%p.%d"
,
this
,
vars_
.
size
());
if
(
name
!=
nullptr
)
{
*
name
=
new_name
;
}
SCOPE_VARS_WRITER_LOCK
return
VarInternal
(
new_name
);
}
Variable
*
Scope
::
FindVar
(
const
std
::
string
&
name
)
const
{
SCOPE_
LOCK_GUARD
SCOPE_
VARS_READER_LOCK
return
FindVarInternal
(
name
);
}
Variable
*
Scope
::
FindLocalVar
(
const
std
::
string
&
name
)
const
{
SCOPE_
LOCK_GUARD
SCOPE_
VARS_READER_LOCK
return
FindVarLocally
(
name
);
}
const
Scope
*
Scope
::
FindScope
(
const
Variable
*
var
)
const
{
SCOPE_
LOCK_GUARD
SCOPE_
VARS_READER_LOCK
return
FindScopeInternal
(
var
);
}
void
Scope
::
DropKids
()
{
SCOPE_
LOCK_GUARD
SCOPE_
KIDS_WRITER_LOCK
for
(
Scope
*
s
:
kids_
)
delete
s
;
kids_
.
clear
();
}
bool
Scope
::
HasKid
(
const
Scope
*
scope
)
const
{
SCOPE_
LOCK_GUARD
SCOPE_
KIDS_READER_LOCK
auto
it
=
std
::
find
(
this
->
kids_
.
begin
(),
this
->
kids_
.
end
(),
scope
);
return
it
!=
this
->
kids_
.
end
();
}
std
::
vector
<
std
::
string
>
Scope
::
LocalVarNames
()
const
{
SCOPE_LOCK_GUARD
std
::
vector
<
std
::
string
>
known_vars
;
known_vars
.
reserve
(
this
->
vars_
.
size
());
for
(
auto
&
p
:
vars_
)
{
known_vars
.
emplace_back
(
p
.
first
);
{
SCOPE_VARS_READER_LOCK
known_vars
.
reserve
(
this
->
vars_
.
size
());
for
(
auto
&
p
:
vars_
)
{
known_vars
.
emplace_back
(
p
.
first
);
}
}
return
known_vars
;
}
void
Scope
::
DeleteScope
(
Scope
*
scope
)
const
{
SCOPE_
LOCK_GUARD
SCOPE_
KIDS_WRITER_LOCK
auto
it
=
std
::
find
(
this
->
kids_
.
begin
(),
this
->
kids_
.
end
(),
scope
);
PADDLE_ENFORCE
(
it
!=
this
->
kids_
.
end
(),
"%p Cannot find %p as kid scope"
,
this
,
scope
);
...
...
@@ -138,8 +149,8 @@ void Scope::DeleteScope(Scope* scope) const {
}
void
Scope
::
EraseVars
(
const
std
::
vector
<
std
::
string
>&
var_names
)
{
SCOPE_LOCK_GUARD
std
::
set
<
std
::
string
>
var_set
(
var_names
.
begin
(),
var_names
.
end
());
SCOPE_VARS_WRITER_LOCK
for
(
auto
it
=
vars_
.
begin
();
it
!=
vars_
.
end
();)
{
if
(
var_set
.
find
(
it
->
first
)
!=
var_set
.
end
())
{
it
=
vars_
.
erase
(
it
);
...
...
@@ -151,12 +162,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void
Scope
::
Rename
(
const
std
::
string
&
origin_name
,
const
std
::
string
&
new_name
)
const
{
SCOPE_
LOCK_GUARD
SCOPE_
VARS_WRITER_LOCK
RenameInternal
(
origin_name
,
new_name
);
}
std
::
string
Scope
::
Rename
(
const
std
::
string
&
origin_name
)
const
{
SCOPE_
LOCK_GUARD
SCOPE_
VARS_WRITER_LOCK
auto
new_name
=
string
::
Sprintf
(
"%p.%d"
,
this
,
vars_
.
size
());
RenameInternal
(
origin_name
,
new_name
);
return
new_name
;
...
...
paddle/fluid/framework/scope.h
浏览文件 @
48324c32
...
...
@@ -14,12 +14,18 @@ limitations under the License. */
#pragma once
extern
"C"
{
#include <xxhash.h>
}
#include <list>
#include <m
utex> // NOLINT
#include <m
emory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/macros.h"
...
...
@@ -95,7 +101,14 @@ class Scope {
std
::
string
Rename
(
const
std
::
string
&
origin_name
)
const
;
protected:
mutable
std
::
unordered_map
<
std
::
string
,
std
::
unique_ptr
<
Variable
>>
vars_
;
struct
KeyHasher
{
std
::
size_t
operator
()(
const
std
::
string
&
key
)
const
{
return
XXH32
(
key
.
c_str
(),
key
.
size
(),
1
);
}
};
mutable
std
::
unordered_map
<
std
::
string
,
std
::
unique_ptr
<
Variable
>
,
KeyHasher
>
vars_
;
private:
// Call Scope::NewScope for a sub-scope.
...
...
@@ -124,7 +137,8 @@ class Scope {
DISABLE_COPY_AND_ASSIGN
(
Scope
);
private:
mutable
std
::
mutex
mutex_
;
mutable
RWLock
kids_lock_
;
mutable
RWLock
vars_lock_
;
};
// Generate some debug string about the inherience structure of scope, quite
...
...
paddle/fluid/operators/conv_cudnn_op_cache.h
浏览文件 @
48324c32
...
...
@@ -19,6 +19,10 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64
(
conv_workspace_size_limit
);
DECLARE_bool
(
cudnn_exhaustive_search
);
DECLARE_int64
(
cudnn_exhaustive_search_times
);
namespace
paddle
{
namespace
operators
{
...
...
@@ -45,6 +49,7 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
template
<
typename
TAlgorithm
>
class
AlgorithmsCache
{
public:
AlgorithmsCache
()
:
search_times_
(
0
)
{
hash_
.
clear
();
}
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm
GetAlgorithm
(
...
...
@@ -54,9 +59,14 @@ class AlgorithmsCache {
int
algorithmFlags
,
// can set for different data type
std
::
function
<
TAlgorithm
()
>
gen_func
);
TAlgorithm
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
);
private:
std
::
unordered_map
<
int64_t
,
TAlgorithm
>
hash_
;
std
::
mutex
mutex_
;
int
search_times_
;
};
template
<
typename
TAlgorithm
>
...
...
@@ -107,5 +117,29 @@ TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
return
hash_
[
seed
];
}
template
<
typename
TAlgorithm
>
TAlgorithm
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
if
(
hash_
.
find
(
area
)
!=
hash_
.
end
())
{
return
hash_
[
area
];
}
if
(
search_times_
<
search_times
)
{
auto
algo
=
gen_func
();
hash_
[
area
]
=
algo
;
++
search_times_
;
return
algo
;
}
TAlgorithm
algo
;
int64_t
min
=
static_cast
<
uint64_t
>
(
INT_MAX
);
for
(
const
auto
&
m
:
hash_
)
{
if
(
m
.
first
<
min
)
{
min
=
m
.
first
;
algo
=
m
.
second
;
}
}
return
algo
;
}
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/conv_fusion_op.cc
浏览文件 @
48324c32
...
...
@@ -28,6 +28,8 @@ namespace operators {
// x is Input,
// z is ResidualData,
// bias is Bias
// When `split_channels` is set, y will be splitted into multiple outputs,
// each output has split_channels[i] number of channels.
class
Conv2DFusionOpMaker
:
public
Conv2DOpMaker
{
protected:
void
Apply
()
override
{
...
...
@@ -36,8 +38,65 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker {
"The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
"'relux' , 'tanh', 'band_pass'"
)
.
SetDefault
(
"relu"
);
AddAttr
<
std
::
vector
<
int
>>
(
"split_channels"
,
"When `split_channels` are set, there will be multiple outputs, the "
"output size is equal to the number of `split_channels`."
)
.
SetDefault
({});
AddOutput
(
"Outputs"
,
"This Outputs is used when setting `split_channels`."
"Usually used to fuse conv with same input and same filter size, "
"padding, stride, dilation size."
)
.
AsDuplicable
()
.
AsDispensable
();
AddInput
(
"AlgoCache"
,
"The cache of convolution algorithm, a RAW type variable."
)
.
AsDispensable
();
AddAttr
<
int
>
(
"search_times"
,
"The number of exhaustive search times for convolution algorithm."
)
.
SetDefault
(
-
1
);
}
};
class
Conv2DFusionOpInferShape
:
public
framework
::
InferShapeBase
{
public:
void
operator
()(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Input"
),
"Input(Input) of ConvOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Filter"
),
"Input(Filter) of ConvOp should not be null."
);
auto
in_dims
=
ctx
->
GetInputDim
(
"Input"
);
auto
filter_dims
=
ctx
->
GetInputDim
(
"Filter"
);
std
::
vector
<
int
>
strides
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"paddings"
);
std
::
vector
<
int
>
dilations
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"dilations"
);
std
::
vector
<
int64_t
>
oshape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
oshape
.
push_back
(
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilations
[
i
],
paddings
[
i
],
strides
[
i
]));
}
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Output"
),
"Output(Output) of ConvOp should not be null."
);
ctx
->
SetOutputDim
(
"Output"
,
framework
::
make_ddim
(
oshape
));
std
::
vector
<
int
>
channels
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"split_channels"
);
if
(
channels
.
size
())
{
PADDLE_ENFORCE
(
ctx
->
HasOutputs
(
"Outputs"
),
"Output(Outputs) of ConvOp should not be null."
);
std
::
vector
<
framework
::
DDim
>
oshapes
;
oshapes
.
reserve
(
channels
.
size
());
for
(
size_t
i
=
0
;
i
<
channels
.
size
();
++
i
)
{
oshapes
.
push_back
({
oshape
[
0
],
channels
[
i
],
oshape
[
2
],
oshape
[
3
]});
}
ctx
->
SetOutputsDim
(
"Outputs"
,
oshapes
);
}
}
};
// TODO(qingqing): add gradient operator for conv2d_fusion
}
// namespace operators
...
...
@@ -45,4 +104,5 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker {
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
conv2d_fusion
,
ops
::
ConvOp
,
ops
::
Conv2DFusionOpMaker
,
ops
::
ConvOpInferVarType
,
paddle
::
framework
::
EmptyGradOpMaker
);
ops
::
Conv2DFusionOpInferShape
,
ops
::
ConvOpInferVarType
,
paddle
::
framework
::
EmptyGradOpMaker
);
paddle/fluid/operators/conv_fusion_op.cu.cc
浏览文件 @
48324c32
...
...
@@ -16,8 +16,9 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64
(
conv_workspace_size_limit
);
DECLARE_bool
(
cudnn_exhaustive_search
);
DEFINE_int64
(
cudnn_exhaustive_search_times
,
-
1
,
"Exhaustive search times for cuDNN convolution, "
"defalut is 1, only search once."
);
namespace
paddle
{
namespace
operators
{
...
...
@@ -117,41 +118,60 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
workspace_size_limit
,
&
algo
));
VLOG
(
3
)
<<
"cuDNN forward algo "
<<
algo
;
}
else
{
auto
search_func
=
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
kNUM_CUDNN_FWD_ALGS
>
fwd_perf_stat
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
handle
,
cudnn_input_desc
,
input_data
,
cudnn_filter_desc
,
filter_data
,
cudnn_conv_desc
,
cudnn_output_desc
,
output_data
,
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
fwd_perf_stat
.
data
(),
cudnn_workspace
,
workspace_size_limit
));
};
workspace_handle
.
RunFunc
(
cudnn_find_func
,
workspace_size_limit
);
VLOG
(
3
)
<<
"Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
const
auto
&
stat
=
fwd_perf_stat
[
i
];
VLOG
(
3
)
<<
stat
.
algo
<<
": "
<<
stat
.
status
<<
" "
<<
stat
.
time
<<
" "
<<
stat
.
memory
;
}
return
fwd_perf_stat
[
0
].
algo
;
};
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>*
algo_cache
=
nullptr
;
if
(
ctx
.
scope
().
FindVar
(
kCUDNNFwdAlgoCache
))
{
int
search_times
=
ctx
.
Attr
<
int
>
(
"search_times"
);
search_times
=
std
::
max
(
static_cast
<
int
>
(
FLAGS_cudnn_exhaustive_search_times
),
search_times
);
if
(
search_times
>
0
)
{
// The searched algo will be cached by `search_times` times for
// different input dimension. For other dimensions, select the algo
// of closest area.
auto
var_name
=
ctx
.
Inputs
(
"AlgoCache"
)[
0
];
algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNFwdAlgoCach
e
)
.
FindVar
(
var_nam
e
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
algo
=
algo_cache
->
GetAlgorithm
(
x_dims
[
2
]
*
x_dims
[
3
],
search_times
,
0
,
search_func
);
}
else
{
algo_cache
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
// Cache searched algo in Var(kCUDNNFwdAlgoCache).
// all conv ops use the same kCUDNNFwdAlgoCache variable.
if
(
ctx
.
scope
().
FindVar
(
kCUDNNFwdAlgoCache
))
{
algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
else
{
// TODO(qingqing) remove const_cast
algo_cache
=
const_cast
<
framework
::
Scope
*>
(
ctx
.
scope
().
parent
())
->
Var
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
algo
=
algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
search_func
);
}
algo
=
algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
kNUM_CUDNN_FWD_ALGS
>
fwd_perf_stat
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
handle
,
cudnn_input_desc
,
input_data
,
cudnn_filter_desc
,
filter_data
,
cudnn_conv_desc
,
cudnn_output_desc
,
output_data
,
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
fwd_perf_stat
.
data
(),
cudnn_workspace
,
workspace_size_limit
));
};
workspace_handle
.
RunFunc
(
cudnn_find_func
,
workspace_size_limit
);
VLOG
(
3
)
<<
"Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
const
auto
&
stat
=
fwd_perf_stat
[
i
];
VLOG
(
3
)
<<
stat
.
algo
<<
": "
<<
stat
.
status
<<
" "
<<
stat
.
time
<<
" "
<<
stat
.
memory
;
}
return
fwd_perf_stat
[
0
].
algo
;
});
VLOG
(
3
)
<<
"choose algo "
<<
algo
;
}
...
...
@@ -195,6 +215,27 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
};
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size_in_bytes
);
}
std
::
vector
<
int
>
channels
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"split_channels"
);
if
(
channels
.
size
())
{
auto
outs
=
ctx
.
MultiOutput
<
framework
::
Tensor
>
(
"Outputs"
);
if
(
x_dims
[
0
]
==
1
)
{
// share data with Output
framework
::
Tensor
t
;
t
.
ShareDataWith
(
*
output
);
auto
y_dims
=
output
->
dims
();
t
.
Resize
({
y_dims
[
1
],
y_dims
[
2
],
y_dims
[
3
]});
int
s
=
0
;
for
(
size_t
i
=
0
;
i
<
channels
.
size
();
++
i
)
{
int
e
=
s
+
channels
[
i
];
outs
[
i
]
->
ShareDataWith
(
t
.
Slice
(
s
,
e
));
outs
[
i
]
->
Resize
({
x_dims
[
0
],
channels
[
i
],
y_dims
[
2
],
y_dims
[
3
]});
s
=
e
;
}
}
else
{
// TODO(qingiqng): do copy when batch size large than 1
PADDLE_THROW
(
"Batch size greater than 1 is Unsupported"
);
}
}
}
};
#endif
...
...
paddle/fluid/operators/distributed/collective_server_test.cc
浏览文件 @
48324c32
...
...
@@ -52,12 +52,12 @@ std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) {
framework
::
Scope
*
scope
=
new
framework
::
Scope
();
framework
::
Variable
*
var
=
scope
->
Var
(
"var1"
);
auto
*
slr
=
var
->
GetMutable
<
framework
::
SelectedRows
>
();
slr
->
set_height
(
1
000
);
slr
->
set_height
(
20
000
);
auto
*
tensor
=
slr
->
mutable_value
();
auto
*
rows
=
slr
->
mutable_rows
();
tensor
->
Resize
(
framework
::
make_ddim
({
3
,
5
}));
tensor
->
Resize
(
framework
::
make_ddim
({
20000
,
1024
}));
tensor
->
mutable_data
<
float
>
(
place
);
paddle
::
operators
::
math
::
set_constant
(
ctx
,
tensor
,
32.7
);
...
...
@@ -83,6 +83,7 @@ void Gather(const std::vector<distributed::RemoteVar>& vars,
}
TEST
(
PREFETCH
,
GPU
)
{
setenv
(
"FLAGS_max_body_size"
,
"2147483647"
,
1
);
platform
::
CUDAPlace
place
;
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
ctx
=
*
pool
.
Get
(
place
);
...
...
paddle/fluid/operators/fused/CMakeLists.txt
浏览文件 @
48324c32
include
(
operators
)
register_operators
(
EXCLUDES fusion_transpose_flatten_concat_op
)
register_operators
(
EXCLUDES fusion_transpose_flatten_concat_op
fusion_conv_inception_op
)
if
(
WITH_GPU
)
op_library
(
fusion_transpose_flatten_concat_op
)
op_library
(
fusion_conv_inception_op
)
file
(
APPEND
${
pybind_file
}
"USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);
\n
"
)
file
(
APPEND
${
pybind_file
}
"USE_CUDA_ONLY_OP(conv2d_inception_fusion);
\n
"
)
endif
()
paddle/fluid/operators/fused/fusion_conv_inception_op.cc
0 → 100644
浏览文件 @
48324c32
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace
paddle
{
namespace
operators
{
// Operator definition for conv2d_inception_fusion: infers the output shape
// from the input tensor and the four filters, and selects the kernel type.
class ConvInceptionFusionOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Sets the dims of "Output" to {n, c, h, w}: n, h and w are taken from
  // "Input"; c is accumulated from the four "Filter" shapes (see below).
  void InferShape(framework::InferShapeContext* ctx) const override {
    // 1 x
    auto in_dims = ctx->GetInputDim("Input");
    // 4 filters
    auto w_dims = ctx->GetInputsDim("Filter");

    // NOTE: this must be an equality check. The previous
    // PADDLE_ENFORCE(in_dims.size(), 4, ...) only tested that the rank was
    // non-zero and never compared it with 4.
    PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv input should be 4-D tensor.");
    PADDLE_ENFORCE_EQ(w_dims.size(), 4, "There should be 4 filters");
    // The first two filters consume the op input directly, so their input
    // channel count must match the channel count of "Input".
    PADDLE_ENFORCE_EQ(w_dims[0][1], in_dims[1]);
    PADDLE_ENFORCE_EQ(w_dims[1][1], in_dims[1]);

    int n = in_dims[0];
    // compute output channel
    // 1st channel
    int c = w_dims[0][0];
    // add 2nd channel
    c += (w_dims[1][0] - w_dims[2][1] * 2);
    // add 3rd channel
    c += (w_dims[2][0] - w_dims[3][1]);
    // add 4-th channel
    c += w_dims[3][0];

    // Spatial size is unchanged by the op.
    int h = in_dims[2];
    int w = in_dims[3];

    ctx->SetOutputDim("Output", {n, c, h, w});
  }

 protected:
  // The kernel is chosen from the data type of "Input" and the device context
  // the op is executed with.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
  }
};
// Declares the inputs, outputs and attributes of conv2d_inception_fusion.
class ConvInceptionFusionOpMaker : public framework::OpProtoAndCheckerMaker {
 protected:
  void Make() override {
    AddInput("Input", "(Tensor) NCHW layout.");
    AddInput("Filter", "(vector<Tensor>) 4 aggregated filters").AsDuplicable();
    AddInput("Bias", "(vector<Tensor>) its length is equal to Filter")
        .AsDuplicable();
    AddOutput("Output",
              "(Tensor) The output tensor of convolution operator. "
              "The format of output tensor is also NCHW.");
    // The CUDA kernel uses TempOutput[0] for the pooled input and
    // TempOutput[1] for the output of the 3rd convolution.
    AddOutput("TempOutput",
              "(vector<Tensor>) Intermediate buffers used by the kernel: "
              "TempOutput[0] holds the pooled input and TempOutput[1] holds "
              "the output of the 3rd filter.")
        .AsDuplicable();
    AddAttr<std::string>(
        "pooling_type",
        "(string), pooling type, can be \"max\" for max-pooling "
        "and \"avg\" for average-pooling.")
        .InEnum({"max", "avg"});
    AddAttr<bool>(
        "exclusive",
        "(bool, default True) When true, will exclude the zero-padding in the "
        "averaging calculating, otherwise, include the zero-padding. Note, it "
        "is only used when pooling_type is avg. The default is True.")
        .SetDefault(true);
    AddAttr<std::string>(
        "activation",
        "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6', "
        "'relux', 'tanh', 'band_pass'")
        .SetDefault("relu");
    // Adjacent literals are concatenated, so every piece but the last one
    // must end with a separator.
    AddAttr<int>("workspace_size_MB",
                 "Only used in cudnn kernel. Need set use_cudnn to true. "
                 "workspace size for cudnn, in MB, "
                 "workspace is a section of GPU memory which will be "
                 "allocated/freed each time the operator runs, larger "
                 "workspace size can increase performance but also requires "
                 "better hardware. This size should be chosen carefully.")
        .SetDefault(4096);
    AddComment(R"DOC(
Fused operator for an Inception-style block: the input is pooled with a 3x3
window and four convolutions (each fused with bias and activation) write their
results into a single NCHW output tensor.
)DOC");
  }
};
}
// namespace operators
}
// namespace paddle
// Register the conv2d_inception_fusion operator with its op class and proto
// maker. EmptyGradOpMaker is passed as the grad-op maker; presumably this
// means no gradient operator is generated for this op -- confirm against
// framework::EmptyGradOpMaker.
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
conv2d_inception_fusion
,
ops
::
ConvInceptionFusionOp
,
ops
::
ConvInceptionFusionOpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
);
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
0 → 100644
浏览文件 @
48324c32
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64
(
conv_workspace_size_limit
);
namespace
paddle
{
namespace
operators
{
#if CUDNN_VERSION >= 7001
using
Tensor
=
framework
::
Tensor
;
using
ScopedTensorDescriptor
=
platform
::
ScopedTensorDescriptor
;
using
ScopedFilterDescriptor
=
platform
::
ScopedFilterDescriptor
;
using
ScopedConvolutionDescriptor
=
platform
::
ScopedConvolutionDescriptor
;
using
ScopedActivationDescriptor
=
platform
::
ScopedActivationDescriptor
;
using
DataLayout
=
platform
::
DataLayout
;
using
ScopedPoolingDescriptor
=
platform
::
ScopedPoolingDescriptor
;
using
PoolingMode
=
platform
::
PoolingMode
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
template
<
typename
T
>
using
CudnnDataType
=
platform
::
CudnnDataType
<
T
>
;
// cuDNN kernel of conv2d_inception_fusion.
//
// Compute() pools the input with a 3x3 window, then runs four
// cudnnConvolutionBiasActivationForward calls ("branches") whose inputs and
// outputs are slices of the input, the "Output" tensor and two "TempOutput"
// buffers, and finally copies part of the second temp buffer into "Output".
template <typename T>
class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto* input = ctx.Input<Tensor>("Input");
    auto filters = ctx.MultiInput<framework::Tensor>("Filter");
    auto bias = ctx.MultiInput<framework::Tensor>("Bias");

    auto* output = ctx.Output<Tensor>("Output");
    auto temp_outs = ctx.MultiOutput<framework::Tensor>("TempOutput");

    const std::string pool_type = ctx.Attr<std::string>("pooling_type");
    const std::string activation = ctx.Attr<std::string>("activation");
    const bool exclusive = ctx.Attr<bool>("exclusive");

    // Workspace size requested by the user, in MB.
    int64_t user_workspace_size =
        static_cast<int64_t>(ctx.Attr<int>("workspace_size_MB"));

    const T* input_data = input->data<T>();
    T* output_data = output->mutable_data<T>(ctx.GetPlace());
    // TempOutput[0] receives the pooled input; it has the input's shape.
    T* temp_data = temp_outs[0]->mutable_data<T>(input->dims(), ctx.GetPlace());

    DataLayout layout = DataLayout::kNCHW;
    std::vector<int> in_dim = framework::vectorize2int(input->dims());

    // ------------------- cudnn descriptors ---------------------
    PoolingMode pooling_mode;
    if (pool_type == "max") {
      pooling_mode = PoolingMode::kMaximum;
    } else {
      pooling_mode = exclusive ? PoolingMode::kAverageExclusive
                               : (PoolingMode::kAverageInclusive);
    }
    std::vector<int> k0x0 = {0, 0};
    std::vector<int> k1x1 = {1, 1};
    std::vector<int> k3x3 = {3, 3};
    ScopedPoolingDescriptor pool_desc;
    ScopedActivationDescriptor act_desc;
    ScopedTensorDescriptor out_pool_desc;
    ScopedTensorDescriptor input_desc;
    cudnnPoolingDescriptor_t cudnn_pool_desc =
        pool_desc.descriptor(pooling_mode, k3x3, k1x1, k1x1);

    // The pooling output has the same dims as the input.
    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
        layout, framework::vectorize2int(input->dims()));
    cudnnTensorDescriptor_t pool_out_desc = out_pool_desc.descriptor<T>(
        layout, framework::vectorize2int(input->dims()));

    cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
    // One descriptor of each kind per branch. These are plain stack arrays:
    // the handles themselves are created/destroyed through cuDNN below, and
    // nothing has to be freed for the arrays (the previous heap-allocated
    // arrays were never released).
    cudnnTensorDescriptor_t out_desc[4];
    cudnnFilterDescriptor_t filter_desc[4];
    cudnnTensorDescriptor_t bias_desc[4];
    cudnnTensorDescriptor_t in_desc[4];
    cudnnConvolutionDescriptor_t conv_desc[4];
    for (int i = 0; i < 4; ++i) {
      CUDNN_ENFORCE(
          platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i]));
    }

    std::vector<std::vector<int>> filter_dims;
    std::vector<std::vector<int>> bias_dims;
    std::vector<std::vector<int>> in_dims;
    std::vector<std::vector<int>> out_dims;
    std::vector<std::vector<int>> in_strides;
    std::vector<std::vector<int>> out_strides;
    std::vector<std::vector<int>> bias_strides;

    cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
    int n = in_dim[0];
    int h = in_dim[2];
    int w = in_dim[3];
    int oc = output->dims()[1];

    cudnnDataType_t compute_type = (cudnn_dtype == CUDNN_DATA_DOUBLE)
                                       ? CUDNN_DATA_DOUBLE
                                       : CUDNN_DATA_FLOAT;

    for (int i = 0; i < 4; ++i) {
      filter_dims.push_back(framework::vectorize2int(filters[i]->dims()));
      CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
          filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data()));
      bias_dims.push_back({1, filter_dims[i][0], 1, 1});
      bias_strides.push_back({filter_dims[i][0], 1, 1, 1});
      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
          bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(),
          bias_strides[i].data()));
      in_dims.push_back({n, filter_dims[i][1], h, w});
      out_dims.push_back({n, filter_dims[i][0], h, w});
      // Defaults: a densely packed input, and an output whose batch stride is
      // that of the whole "Output" tensor (branches write channel slices of
      // it). Branch-specific exceptions are patched after the loop.
      in_strides.push_back({filter_dims[i][1] * h * w, h * w, w, 1});
      out_strides.push_back({oc * h * w, h * w, w, 1});

      // Branches 0 and 1 use zero padding; branches 2 and 3 pad by 1.
      // Stride and dilation are 1 for all of them.
      if (i < 2) {
        CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
            conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(),
            CUDNN_CROSS_CORRELATION, compute_type));
      } else {
        CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
            conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(),
            CUDNN_CROSS_CORRELATION, compute_type));
      }
      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
          conv_desc[i], CUDNN_DEFAULT_MATH));
    }
    // Branch 2 is a 2-group convolution reading a slice of "Output", so its
    // input has twice the per-group channel count and the batch stride of
    // "Output"; it writes to TempOutput[1].
    in_dims[2][1] *= 2;
    in_strides[2][0] = oc * h * w;
    out_strides[2][0] = filter_dims[2][0] * h * w;  // this out is continuous.
    // Branch 3 reads a slice of TempOutput[1].
    in_strides[3][0] = filter_dims[2][0] * h * w;
    CUDNN_ENFORCE(
        platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2));

    cudnnConvolutionFwdAlgo_t algo[4];
    auto handle = dev_ctx.cudnn_handle();
    size_t workspace_size_in_bytes = 0;  // final workspace to allocate.

    // The larger of the global flag and the op attribute (both in MB) wins.
    size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
      int64_t max_user_size =
          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
                   user_workspace_size);
      workspace_size_limit = max_user_size * 1024 * 1024;
    }

    // Pick an algorithm per branch; the shared workspace is sized for the
    // most demanding one.
    for (int i = 0; i < 4; ++i) {
      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
          in_desc[i], cudnn_dtype, 4, in_dims[i].data(),
          in_strides[i].data()));
      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
          out_desc[i], cudnn_dtype, 4, out_dims[i].data(),
          out_strides[i].data()));
      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
          handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit,
          &algo[i]));
      size_t tmp_size = 0;
      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
          handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
          algo[i], &tmp_size));
      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
    }
    cudnnActivationDescriptor_t cudnn_act_desc =
        act_desc.descriptor<T>(activation);

    // Channel counts each branch contributes to the final output; they match
    // the terms summed in the op's InferShape.
    int oc0 = filter_dims[0][0];
    int oc1 = filter_dims[1][0] - filter_dims[2][1] * 2;
    int oc3 = filter_dims[3][0];
    int oc2 = oc - oc0 - oc1 - oc3;

    // branch1: pool + 1x1 conv
    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
    CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
        handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
        pool_out_desc, temp_data));

    // Input pointer of each branch: pooled input, raw input, a slice of
    // "Output" written by branch 1, and a slice of TempOutput[1] written by
    // branch 2.
    std::vector<const void*> in_datas;
    in_datas.push_back(static_cast<const void*>(temp_data));
    in_datas.push_back(static_cast<const void*>(input_data));
    in_datas.push_back(
        static_cast<const void*>(output_data + (oc0 + oc1) * h * w));
    T* temp2_data = temp_outs[1]->mutable_data<T>(
        framework::make_ddim(out_dims[2]), ctx.GetPlace());
    in_datas.push_back(static_cast<const void*>(temp2_data + oc2 * h * w));

    // Output pointer of each branch.
    std::vector<void*> out_datas;
    out_datas.push_back(static_cast<void*>(output_data));
    out_datas.push_back(static_cast<void*>(output_data + oc0 * h * w));
    out_datas.push_back(static_cast<void*>(temp2_data));
    out_datas.push_back(
        static_cast<void*>(output_data + (oc0 + oc1 + oc2) * h * w));

    // The branches must run in order: branch 2 consumes data produced by
    // branch 1, and branch 3 consumes data produced by branch 2.
    for (int i = 0; i < 4; ++i) {
      auto func = [&](void* cudnn_workspace) {
        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
            handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
            static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
            algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
            out_desc[i], out_datas[i], bias_desc[i],
            static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
            out_desc[i], out_datas[i]));
      };
      auto workspace_handle = dev_ctx.cudnn_workspace_handle();
      workspace_handle.RunFunc(func, workspace_size_in_bytes);
    }

    // Copy the leading channels of TempOutput[1] into "Output" at channel
    // offset oc0 + oc1 (strided copy from the temp layout to the output
    // layout).
    // NOTE(review): both descriptors use out_dims[3], i.e. filter_dims[3][0]
    // channels, while the destination slot is oc2 channels wide -- confirm
    // that the two are expected to be equal.
    cudnnTensorDescriptor_t x_desc;
    cudnnTensorDescriptor_t y_desc;
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&x_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&y_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
        x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data()));
    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
        y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data()));
    CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
        handle, CudnnDataType<T>::kOne(), x_desc,
        static_cast<const void*>(out_datas[2]), CudnnDataType<T>::kZero(),
        y_desc, static_cast<void*>(output_data + (oc0 + oc1) * h * w)));

    for (int i = 0; i < 4; ++i) {
      CUDNN_ENFORCE(
          platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i]));
      CUDNN_ENFORCE(
          platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i]));
    }
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(x_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(y_desc));
  }
};
#endif
}
// namespace operators
}
// namespace paddle
// Register the float and double instantiations of the cuDNN kernel for
// conv2d_inception_fusion. Guarded by the same CUDNN_VERSION check as the
// kernel class definition, so nothing is registered with older cuDNN.
#if CUDNN_VERSION >= 7001
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
conv2d_inception_fusion
,
ops
::
CUDNNConvInceptionFusionOpKernel
<
float
>
,
ops
::
CUDNNConvInceptionFusionOpKernel
<
double
>
);
#endif
paddle/fluid/platform/CMakeLists.txt
浏览文件 @
48324c32
...
...
@@ -84,6 +84,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test
(
cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda
)
nv_test
(
transform_test SRCS transform_test.cu DEPS memory place device_context
)
cc_library
(
timer SRCS timer.cc
)
cc_test
(
timer_test SRCS timer_test.cc DEPS timer
)
cc_library
(
device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto
${
GPU_CTX_DEPS
}
)
cc_library
(
profiler SRCS profiler.cc DEPS device_context device_tracer
)
cc_test
(
profiler_test SRCS profiler_test.cc DEPS profiler
)
...
...
paddle/fluid/platform/dynload/cudnn.cc
浏览文件 @
48324c32
...
...
@@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R5
(
DEFINE_WRAP
);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R6
CUDNN_DNN_ROUTINE_EACH_R6
(
DEFINE_WRAP
);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R7
CUDNN_DNN_ROUTINE_EACH_R7
(
DEFINE_WRAP
);
#endif
...
...
paddle/fluid/platform/dynload/dynamic_loader.cc
浏览文件 @
48324c32
...
...
@@ -53,6 +53,12 @@ namespace platform {
namespace
dynload
{
static
constexpr
char
cupti_lib_path
[]
=
CUPTI_LIB_PATH
;
#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
// File names of the CUDA DLLs looked up on Windows. The names are assembled
// at compile time from the PADDLE_CUDA_BINVER / PADDLE_CUDNN_BINVER macros.
// They point at string literals, so the pointee must be const: binding a
// literal to a plain `char*` is ill-formed in C++11.
static constexpr const char* win_cublas_lib =
    "cublas64_" PADDLE_CUDA_BINVER ".dll";
static constexpr const char* win_curand_lib =
    "curand64_" PADDLE_CUDA_BINVER ".dll";
static constexpr const char* win_cudnn_lib =
    "cudnn64_" PADDLE_CUDNN_BINVER ".dll";
#endif
static
inline
std
::
string
join
(
const
std
::
string
&
part1
,
const
std
::
string
&
part2
)
{
// directory separator
...
...
@@ -165,6 +171,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
void
*
GetCublasDsoHandle
()
{
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
FLAGS_cuda_dir
,
"libcublas.dylib"
);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return
GetDsoHandleFromSearchPath
(
FLAGS_cuda_dir
,
win_cublas_lib
);
#else
return
GetDsoHandleFromSearchPath
(
FLAGS_cuda_dir
,
"libcublas.so"
);
#endif
...
...
@@ -173,6 +181,8 @@ void* GetCublasDsoHandle() {
void
*
GetCUDNNDsoHandle
()
{
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
FLAGS_cudnn_dir
,
"libcudnn.dylib"
,
false
);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return
GetDsoHandleFromSearchPath
(
FLAGS_cudnn_dir
,
win_cudnn_lib
);
#else
return
GetDsoHandleFromSearchPath
(
FLAGS_cudnn_dir
,
"libcudnn.so"
,
false
);
#endif
...
...
@@ -193,6 +203,8 @@ void* GetCUPTIDsoHandle() {
void
*
GetCurandDsoHandle
()
{
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
FLAGS_cuda_dir
,
"libcurand.dylib"
);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return
GetDsoHandleFromSearchPath
(
FLAGS_cuda_dir
,
win_curand_lib
);
#else
return
GetDsoHandleFromSearchPath
(
FLAGS_cuda_dir
,
"libcurand.so"
);
#endif
...
...
paddle/fluid/platform/timer.cc
0 → 100644
浏览文件 @
48324c32
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace platform {

// Clears the start time, the pause counter and the accumulated time, and
// leaves the timer paused.
void Timer::Reset() {
  _start.tv_sec = 0;
  _start.tv_usec = 0;

  _count = 0;
  _elapsed = 0;
  _paused = true;
}

// Restarts the timer from scratch: discards everything accumulated so far
// and begins a new interval.
void Timer::Start() {
  Reset();
  Resume();
}

// Ends the current interval: adds its length to the accumulated time and
// bumps the counter. Does nothing if the timer is already paused.
void Timer::Pause() {
  if (_paused) {
    return;
  }
  _elapsed += Tickus();
  ++_count;
  _paused = true;
}

// Begins a new interval at the current system time.
void Timer::Resume() {
  gettimeofday(&_start, NULL);
  _paused = false;
}

// Number of intervals completed by Pause() since the last Reset().
int Timer::Count() { return _count; }

// Accumulated time in microseconds. Only intervals already closed by Pause()
// are included.
double Timer::ElapsedUS() { return static_cast<double>(_elapsed); }

// Accumulated time in milliseconds.
double Timer::ElapsedMS() { return _elapsed / 1000.0; }

// Accumulated time in seconds.
double Timer::ElapsedSec() { return _elapsed / 1000000.0; }

// Microseconds between the start of the current interval and now.
int64_t Timer::Tickus() {
  gettimeofday(&_now, NULL);
  // Widen to 64 bits before scaling: where tv_sec (and `long`) is 32 bits
  // wide, seconds * 1000 * 1000 overflows after roughly 35 minutes.
  const int64_t sec_diff =
      static_cast<int64_t>(_now.tv_sec) - static_cast<int64_t>(_start.tv_sec);
  const int64_t usec_diff = static_cast<int64_t>(_now.tv_usec) -
                            static_cast<int64_t>(_start.tv_usec);
  return sec_diff * 1000 * 1000 + usec_diff;
}

}  // namespace platform
}  // namespace paddle
paddle/fluid/platform/timer.h
0 → 100644
浏览文件 @
48324c32
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdlib.h>
#include "paddle/fluid/platform/port.h"
#ifdef _WIN32
// Windows stand-in for a sleep(seconds) call: converts the argument to
// milliseconds, passes it to Sleep(), and always returns 0.
static
unsigned
sleep
(
unsigned
seconds
)
{
Sleep
(
seconds
*
1000
);
return
0
;
}
#endif
namespace
paddle
{
namespace
platform
{
// A Standard Timer implementation for debugging
//
// The timer accumulates the length of every Resume()..Pause() interval and
// counts how many intervals were completed.
class Timer {
 public:
  // a timer class for profiling
  // Reset() will be called during initialization
  // all timing variables will be set 0 in Reset()
  Timer() { Reset(); }
  // Clear all accumulated state and leave the timer paused.
  void Reset();
  // Reset() followed by Resume().
  void Start();
  // Close the current interval and add it to the elapsed time; no-op when
  // the timer is already paused.
  void Pause();
  // Resume will get current system time
  void Resume();
  // Number of intervals closed by Pause() since the last Reset().
  int Count();
  // return elapsed time in us
  double ElapsedUS();
  // return elapsed time in ms
  double ElapsedMS();
  // return elapsed time in sec
  double ElapsedSec();

 private:
  struct timeval _start;
  struct timeval _now;
  int _count;
  // Accumulated microseconds. 64-bit to match the return type of Tickus():
  // a 32-bit int would overflow after about 35 minutes of measured time.
  int64_t _elapsed;
  bool _paused;

  // get us difference between start and now
  int64_t Tickus();
};
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/timer_test.cc
0 → 100644
浏览文件 @
48324c32
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/timer.h"
#include "gtest/gtest.h"
// Reset() must discard the pause count and the accumulated time.
TEST(Timer, Reset) {
  paddle::platform::Timer timeline;
  timeline.Start();
  // One second is enough to accumulate a measurable interval and keeps the
  // test fast.
  sleep(1);
  timeline.Pause();
  EXPECT_EQ(timeline.Count(), 1);
  EXPECT_GT(timeline.ElapsedUS(), 0.0);

  timeline.Reset();
  EXPECT_EQ(timeline.Count(), 0);
  EXPECT_DOUBLE_EQ(timeline.ElapsedUS(), 0.0);
}
// Start() begins a fresh measurement, dropping anything recorded before.
TEST(Timer, Start) {
  paddle::platform::Timer timeline;
  timeline.Start();
  sleep(1);
  timeline.Pause();
  EXPECT_EQ(timeline.Count(), 1);
  EXPECT_GT(timeline.ElapsedSec(), 0.0);

  // A second Start() resets the state before resuming.
  timeline.Start();
  EXPECT_EQ(timeline.Count(), 0);
  EXPECT_DOUBLE_EQ(timeline.ElapsedUS(), 0.0);
}
// Pause() closes the running interval once; pausing again changes nothing.
TEST(Timer, Pause) {
  paddle::platform::Timer timeline;
  timeline.Start();
  sleep(1);
  timeline.Pause();
  EXPECT_EQ(timeline.Count(), 1);
  const double elapsed_us = timeline.ElapsedUS();
  EXPECT_GT(elapsed_us, 0.0);

  // Already paused: neither the counter nor the elapsed time may move.
  timeline.Pause();
  EXPECT_EQ(timeline.Count(), 1);
  EXPECT_DOUBLE_EQ(timeline.ElapsedUS(), elapsed_us);
}
// Resume() opens a new interval whose length is added on the next Pause().
TEST(Timer, Resume) {
  paddle::platform::Timer timeline;
  timeline.Start();
  sleep(1);
  timeline.Pause();
  const double first_interval_us = timeline.ElapsedUS();
  EXPECT_EQ(timeline.Count(), 1);

  timeline.Resume();
  // Resuming alone does not touch the accumulated values.
  EXPECT_EQ(timeline.Count(), 1);
  EXPECT_DOUBLE_EQ(timeline.ElapsedUS(), first_interval_us);

  timeline.Pause();
  EXPECT_EQ(timeline.Count(), 2);
  EXPECT_GE(timeline.ElapsedUS(), first_interval_us);
}
paddle/fluid/pybind/pybind.cc
浏览文件 @
48324c32
...
...
@@ -84,11 +84,15 @@ bool IsCompiledWithCUDA() {
}
bool
IsCompiledWithBrpc
()
{
#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA)
return
true
;
#else
#ifndef PADDLE_WITH_DISTRIBUTE
return
false
;
#endif
#ifdef PADDLE_WITH_GRPC
return
false
;
#endif
return
true
;
}
bool
IsCompiledWithDIST
()
{
...
...
paddle/testing/paddle_gtest_main.cc
浏览文件 @
48324c32
...
...
@@ -28,20 +28,53 @@ int main(int argc, char** argv) {
for
(
int
i
=
0
;
i
<
argc
;
++
i
)
{
new_argv
.
push_back
(
argv
[
i
]);
}
std
::
vector
<
std
::
string
>
envs
;
std
::
vector
<
std
::
string
>
undefok
;
#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_GRPC)
envs
.
push_back
(
"max_body_size"
);
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
new_argv
.
push_back
(
strdup
(
"--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"
)
);
envs
.
push_back
(
"fraction_of_gpu_memory_to_use"
);
envs
.
push_back
(
"allocator_strategy"
);
#elif __clang__
new_argv
.
push_back
(
strdup
(
"--tryfromenv=use_mkldnn,initial_cpu_memory_in_"
"mb,allocator_strategy"
));
new_argv
.
push_back
(
strdup
(
"--undefok=use_mkldnn,initial_cpu_memory_in_mb"
));
envs
.
push_back
(
"use_mkldnn"
);
envs
.
push_back
(
"initial_cpu_memory_in_mb"
);
envs
.
push_back
(
"allocator_strategy"
);
undefok
.
push_back
(
"use_mkldnn"
);
undefok
.
push_back
(
"initial_cpu_memory_in_mb"
);
#else
new_argv
.
push_back
(
strdup
(
"--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_"
"mb,allocator_strategy"
));
new_argv
.
push_back
(
strdup
(
"--undefok=use_mkldnn,initial_cpu_memory_in_mb"
));
envs
.
push_back
(
"use_pinned_memory"
);
envs
.
push_back
(
"use_mkldnn"
);
envs
.
push_back
(
"initial_cpu_memory_in_mb"
);
envs
.
push_back
(
"allocator_strategy"
);
undefok
.
push_back
(
"use_mkldnn"
);
undefok
.
push_back
(
"initial_cpu_memory_in_mb"
);
#endif
if
(
envs
.
size
()
>
0
)
{
std
::
string
env_string
=
"--tryfromenv="
;
for
(
auto
t
:
envs
)
{
env_string
+=
t
+
","
;
}
env_string
=
env_string
.
substr
(
0
,
env_string
.
length
()
-
1
);
new_argv
.
push_back
(
strdup
(
env_string
.
c_str
()));
VLOG
(
1
)
<<
"gtest env_string:"
<<
env_string
;
}
if
(
undefok
.
size
()
>
0
)
{
std
::
string
undefok_string
=
"--undefok="
;
for
(
auto
t
:
undefok
)
{
undefok_string
+=
t
+
","
;
}
undefok_string
=
undefok_string
.
substr
(
0
,
undefok_string
.
length
()
-
1
);
new_argv
.
push_back
(
strdup
(
undefok_string
.
c_str
()));
VLOG
(
1
)
<<
"gtest undefok_string:"
<<
undefok_string
;
}
int
new_argc
=
static_cast
<
int
>
(
new_argv
.
size
());
char
**
new_argv_address
=
new_argv
.
data
();
google
::
ParseCommandLineFlags
(
&
new_argc
,
&
new_argv_address
,
false
);
...
...
python/paddle/fluid/__init__.py
浏览文件 @
48324c32
...
...
@@ -151,12 +151,21 @@ def __bootstrap__():
read_env_flags
.
append
(
'rpc_get_thread_num'
)
read_env_flags
.
append
(
'rpc_prefetch_thread_num'
)
read_env_flags
.
append
(
'rpc_disable_reuse_port'
)
if
core
.
is_compiled_with_brpc
():
read_env_flags
.
append
(
'max_body_size'
)
#set brpc max body size
os
.
environ
[
'FLAGS_max_body_size'
]
=
"2147483647"
if
core
.
is_compiled_with_cuda
():
read_env_flags
+=
[
'fraction_of_gpu_memory_to_use'
,
'cudnn_deterministic'
,
'enable_cublas_tensor_op_math'
,
'conv_workspace_size_limit'
,
'cudnn_exhaustive_search'
,
'memory_optimize_debug'
,
'selected_gpus'
'fraction_of_gpu_memory_to_use'
,
'cudnn_deterministic'
,
'enable_cublas_tensor_op_math'
,
'conv_workspace_size_limit'
,
'cudnn_exhaustive_search'
,
'memory_optimize_debug'
,
'selected_gpus'
,
'cudnn_exhaustive_search_times'
,
]
core
.
init_gflags
([
sys
.
argv
[
0
]]
+
...
...
python/paddle/fluid/data_feeder.py
浏览文件 @
48324c32
...
...
@@ -272,8 +272,7 @@ class DataFeeder(object):
dict: the result of conversion.
Raises:
ValueError: If drop_last is False and the data batch which cannot
fit for devices.
ValueError: If drop_last is False and the data batch which cannot fit for devices.
"""
def
__reader_creator__
():
...
...
python/paddle/fluid/framework.py
浏览文件 @
48324c32
...
...
@@ -647,20 +647,16 @@ class Operator(object):
self
.
desc
.
set_input
(
in_proto
.
name
,
[])
if
outputs
is
not
None
:
given
=
set
()
need
=
set
()
for
n
in
outputs
:
given
.
add
(
n
)
for
m
in
proto
.
outputs
:
need
.
add
(
m
.
name
)
if
not
given
==
need
:
raise
ValueError
((
"Incorrect setting for output(s) of "
"operator
\"
%s
\"
. Need: [%s] Given: [%s]"
)
%
(
type
,
", "
.
join
(
six
.
binary_type
(
e
)
for
e
in
need
),
", "
.
join
(
six
.
binary_type
(
e
)
for
e
in
given
)))
if
(
m
.
name
not
in
outputs
)
and
m
.
dispensable
:
continue
if
not
((
m
.
name
in
outputs
)
or
m
.
dispensable
):
raise
ValueError
(
(
"Incorrect setting for output(s) of "
"operator
\"
%s
\"
, should set: [%s]."
)
%
(
type
,
m
.
name
))
for
out_proto
in
proto
.
outputs
:
if
out_proto
.
name
not
in
outputs
:
continue
out_args
=
outputs
[
out_proto
.
name
]
if
not
isinstance
(
out_args
,
list
):
out_args
=
[
out_args
]
...
...
@@ -1638,8 +1634,8 @@ class Program(object):
parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need
to print.
Returns
(str)
: The debug string.
Returns
:
str
: The debug string.
Raises:
ValueError: If any of required fields is not set and throw_on_error is
...
...
python/paddle/fluid/layers/control_flow.py
浏览文件 @
48324c32
...
...
@@ -1452,6 +1452,7 @@ class DynamicRNN(object):
def
step_input
(
self
,
x
):
"""
Mark a sequence as a dynamic RNN input.
Args:
x(Variable): The input sequence.
...
...
@@ -1505,6 +1506,7 @@ class DynamicRNN(object):
"""
Mark a variable as a RNN input. The input will not be scattered into
time steps.
Args:
x(Variable): The input variable.
...
...
@@ -1629,13 +1631,11 @@ class DynamicRNN(object):
Args:
init(Variable|None): The initialized variable.
shape(list|tuple): The memory shape. NOTE the shape does not contain
batch_size.
shape(list|tuple): The memory shape. NOTE the shape does not contain batch_size.
value(float): the initialized value.
need_reorder(bool): True if the initialized memory depends on the
input sample.
need_reorder(bool): True if the initialized memory depends on the input sample.
dtype(str|numpy.dtype): The data type of the initialized memory.
...
...
@@ -1714,6 +1714,7 @@ class DynamicRNN(object):
"""
Update the memory from ex_mem to new_mem. NOTE that the shape and data
type of :code:`ex_mem` and :code:`new_mem` must be same.
Args:
ex_mem(Variable): the memory variable.
new_mem(Variable): the plain variable generated in RNN block.
...
...
python/paddle/fluid/layers/detection.py
浏览文件 @
48324c32
...
...
@@ -65,7 +65,7 @@ def rpn_target_assign(bbox_pred,
rpn_negative_overlap
=
0.3
,
use_random
=
True
):
"""
**
Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection.
**
**
Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection.
**
This layer can be, for given the Intersection-over-Union (IoU) overlap
between anchors and ground truth boxes, to assign classification and
...
...
@@ -135,19 +135,20 @@ def rpn_target_assign(bbox_pred,
Examples:
.. code-block:: python
bbox_pred = layers.data(name='bbox_pred', shape=[100, 4],
append_batch_size=False, dtype='float32')
cls_logits = layers.data(name='cls_logits', shape=[100, 1],
append_batch_size=False, dtype='float32')
anchor_box = layers.data(name='anchor_box', shape=[20, 4],
append_batch_size=False, dtype='float32')
gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
append_batch_size=False, dtype='float32')
loc_pred, score_pred, loc_target, score_target, bbox_inside_weight =
fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
cls_logits=cls_logits,
anchor_box=anchor_box,
gt_boxes=gt_boxes)
bbox_pred = layers.data(name='bbox_pred', shape=[100, 4],
append_batch_size=False, dtype='float32')
cls_logits = layers.data(name='cls_logits', shape=[100, 1],
append_batch_size=False, dtype='float32')
anchor_box = layers.data(name='anchor_box', shape=[20, 4],
append_batch_size=False, dtype='float32')
gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
append_batch_size=False, dtype='float32')
loc_pred, score_pred, loc_target, score_target, bbox_inside_weight =
fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
cls_logits=cls_logits,
anchor_box=anchor_box,
gt_boxes=gt_boxes)
"""
helper
=
LayerHelper
(
'rpn_target_assign'
,
**
locals
())
...
...
@@ -1519,27 +1520,30 @@ def anchor_generator(input,
Args:
input(Variable): The input feature map, the format is NCHW.
anchor_sizes(list|tuple|float): The anchor sizes of generated anchors,
given in absolute pixels e.g. [64., 128., 256., 512.].
For instance, the anchor size of 64 means the area of this anchor equals to 64**2.
given in absolute pixels e.g. [64., 128., 256., 512.].
For instance, the anchor size of 64 means the area of this anchor equals to 64**2.
aspect_ratios(list|tuple|float): The height / width ratios of generated
anchors, e.g. [0.5, 1.0, 2.0].
anchors, e.g. [0.5, 1.0, 2.0].
variance(list|tuple): The variances to be used in box regression deltas.
Default:[0.1, 0.1, 0.2, 0.2].
stride(list|tuple): The anchors stride across width and height,
e.g. [16.0, 16.0]
Default:[0.1, 0.1, 0.2, 0.2].
stride(list|tuple): The anchors stride across width and height, e.g. [16.0, 16.0]
offset(float): Prior boxes center offset. Default: 0.5
name(str): Name of the prior box op. Default: None.
Returns:
Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4].
H is the height of input, W is the width of input,
num_anchors is the box count of each position.
Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized.
Variances(Variable): The expanded variances of anchors
with a layout of [H, W, num_priors, 4].
H is the height of input, W is the width of input
num_anchors is the box count of each position.
Each variance is in (xcenter, ycenter, w, h) format.
Anchors(Variable),Variances(Variable):
two variables:
- Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4].
\
H is the height of input, W is the width of input,
\
num_anchors is the box count of each position.
\
Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized.
- Variances(Variable): The expanded variances of anchors
\
with a layout of [H, W, num_priors, 4].
\
H is the height of input, W is the width of input
\
num_anchors is the box count of each position.
\
Each variance is in (xcenter, ycenter, w, h) format.
Examples:
...
...
@@ -1748,35 +1752,35 @@ def generate_proposals(scores,
eta
=
1.0
,
name
=
None
):
"""
**
Generate proposal Faster-RCNN
**
This operation proposes RoIs according to each box with their probability to be a foreground object and
the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals
could be used to train detection net.
For generating proposals, this operation performs following steps:
1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4)
2. Calculate box locations as proposals candidates.
3. Clip boxes to image
4. Remove predicted boxes with small area.
5. Apply NMS to get final proposals as output.
Args:
scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object
.
N is batch size, A is number of anchors, H and W are height and width of the feature map.
bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the difference between predicted box location and anchor location.
im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale
between origin image size and the size of feature map.
anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map,
num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized
.
variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format.
pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default.
post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000
by default.
nms_thresh(float): Threshold in NMS, 0.5
by default.
min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default
.
eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration.
**
Generate proposal Faster-RCNN
**
This operation proposes RoIs according to each box with their probability to be a foreground object and
the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals
could be used to train detection net.
For generating proposals, this operation performs following steps:
1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4)
2. Calculate box locations as proposals candidates.
3. Clip boxes to image
4. Remove predicted boxes with small area.
5. Apply NMS to get final proposals as output.
Args:
scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
N is batch size, A is number of anchors, H and W are height and width of the feature map
.
bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the difference between predicted box location and anchor location.
im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale
between origin image size and the size of feature map.
anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map,
num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized.
variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format
.
pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default.
post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default.
nms_thresh(float): Threshold in NMS, 0.5
by default.
min_size(float): Remove predicted boxes with either height or width < min_size. 0.1
by default.
eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration
.
"""
helper
=
LayerHelper
(
'generate_proposals'
,
**
locals
())
...
...
python/paddle/fluid/layers/io.py
浏览文件 @
48324c32
...
...
@@ -949,12 +949,11 @@ def shuffle(reader, buffer_size):
is determined by argument buf_size.
Args:
param reader: the original reader whose output will be shuffled.
type reader: callable
param buf_size: shuffle buffer size.
type buf_size: int
return: the new reader whose output is shuffled.
rtype: callable
reader(callable): the original reader whose output will be shuffled.
buf_size(int): shuffle buffer size.
Returns:
callable: the new reader whose output is shuffled.
"""
return
__create_unshared_decorated_reader__
(
'create_shuffle_reader'
,
reader
,
{
'buffer_size'
:
int
(
buffer_size
)})
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
48324c32
此差异已折叠。
点击以展开。
python/paddle/fluid/layers/tensor.py
浏览文件 @
48324c32
...
...
@@ -393,9 +393,6 @@ def fill_constant_batch_size_like(input,
It also sets *stop_gradient* to True.
>>> data = fluid.layers.fill_constant_batch_size_like(
>>> input=like, shape=[1], value=0, dtype='int64')
Args:
input(${input_type}): ${input_comment}.
...
...
@@ -411,6 +408,14 @@ def fill_constant_batch_size_like(input,
Returns:
${out_comment}.
Examples:
.. code-block:: python
data = fluid.layers.fill_constant_batch_size_like(
input=like, shape=[1], value=0, dtype='int64')
"""
helper
=
LayerHelper
(
"fill_constant_batch_size_like"
,
**
locals
())
out
=
helper
.
create_variable_for_type_inference
(
dtype
=
dtype
)
...
...
python/paddle/fluid/metrics.py
浏览文件 @
48324c32
...
...
@@ -361,8 +361,8 @@ class ChunkEvaluator(MetricBase):
Accumulate counter numbers output by chunk_eval from mini-batches and
compute the precision recall and F1-score using the accumulated counter
numbers.
For some basics of chunking, please refer to
'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'
.
For some basics of chunking, please refer to
`Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_
.
ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection,
and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
...
...
@@ -391,6 +391,7 @@ class ChunkEvaluator(MetricBase):
def
update
(
self
,
num_infer_chunks
,
num_label_chunks
,
num_correct_chunks
):
"""
Update the states based on the layers.chunk_eval() outputs.
Args:
num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch.
num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch.
...
...
@@ -450,9 +451,9 @@ class EditDistance(MetricBase):
distance, instance_error = distance_evaluator.eval()
In the above example:
'distance' is the average of the edit distance in a pass.
'instance_error' is the instance error rate in a pass.
- 'distance' is the average of the edit distance in a pass.
- 'instance_error' is the instance error rate in a pass.
"""
...
...
@@ -567,12 +568,15 @@ class DetectionMAP(object):
Calculate the detection mean average precision (mAP).
The general steps are as follows:
1. calculate the true positive and false positive according to the input
of detection and labels.
of detection and labels.
2. calculate mAP value, support two versions: '11 point' and 'integral'.
Please get more information from the following articles:
https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325
Args:
...
...
@@ -613,10 +617,12 @@ class DetectionMAP(object):
for data in batches:
loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
In the above example:
In the above example:
- 'cur_map_v' is the mAP of current mini-batch.
- 'accum_map_v' is the accumulative mAP of one pass.
'cur_map_v' is the mAP of current mini-batch.
'accum_map_v' is the accumulative mAP of one pass.
"""
def
__init__
(
self
,
...
...
python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
浏览文件 @
48324c32
...
...
@@ -32,6 +32,8 @@ class TestConv2dFusionOp(OpTest):
self
.
activation
=
'relu'
self
.
add_bias
=
True
self
.
add_residual_data
=
True
self
.
channels
=
None
self
.
outputs
=
None
self
.
init_group
()
self
.
init_dilation
()
...
...
@@ -49,8 +51,8 @@ class TestConv2dFusionOp(OpTest):
input
=
np
.
random
.
random
(
self
.
input_size
).
astype
(
self
.
dtype
)
filter
=
np
.
random
.
random
(
self
.
filter_size
).
astype
(
self
.
dtype
)
output
=
conv2d_forward_naive
(
input
,
filter
,
self
.
groups
,
conv2d_param
).
astype
(
self
.
dtype
)
self
.
output
=
conv2d_forward_naive
(
input
,
filter
,
self
.
groups
,
conv2d_param
).
astype
(
self
.
dtype
)
self
.
inputs
=
{
'Input'
:
OpTest
.
np_dtype_to_fluid_dtype
(
input
),
...
...
@@ -58,19 +60,20 @@ class TestConv2dFusionOp(OpTest):
}
if
self
.
add_residual_data
:
residual_data
=
np
.
random
.
random
(
output
.
shape
).
astype
(
self
.
dtype
)
residual_data
=
np
.
random
.
random
(
self
.
output
.
shape
).
astype
(
self
.
dtype
)
self
.
inputs
[
'ResidualData'
]
=
OpTest
.
np_dtype_to_fluid_dtype
(
residual_data
)
output
+=
residual_data
self
.
output
+=
residual_data
if
self
.
add_bias
:
bias
=
np
.
random
.
random
(
self
.
filter_size
[
0
]).
astype
(
self
.
dtype
)
self
.
inputs
[
'Bias'
]
=
OpTest
.
np_dtype_to_fluid_dtype
(
bias
)
output
=
output
+
bias
.
reshape
((
1
,
bias
.
size
,
1
,
1
))
self
.
output
=
self
.
output
+
bias
.
reshape
((
1
,
bias
.
size
,
1
,
1
))
assert
self
.
activation
in
[
'relu'
,
'identity'
]
if
self
.
activation
==
'relu'
:
output
=
np
.
maximum
(
output
,
0
)
self
.
output
=
np
.
maximum
(
self
.
output
,
0
)
self
.
attrs
=
{
'strides'
:
self
.
stride
,
...
...
@@ -79,9 +82,12 @@ class TestConv2dFusionOp(OpTest):
'dilations'
:
self
.
dilations
,
'data_format'
:
self
.
data_format
,
'exhaustive_search'
:
self
.
exhaustive_search
,
'activation'
:
self
.
activation
'activation'
:
self
.
activation
,
'split_channels'
:
self
.
channels
}
self
.
outputs
=
{
'Output'
:
output
}
self
.
outputs
=
{
'Output'
:
self
.
output
}
self
.
set_outputs
()
def
testcuda
(
self
):
return
core
.
is_compiled_with_cuda
()
...
...
@@ -117,6 +123,9 @@ class TestConv2dFusionOp(OpTest):
def
set_search_method
(
self
):
self
.
exhaustive_search
=
False
def
set_outputs
(
self
):
pass
class
TestWithoutResidual
(
TestConv2dFusionOp
):
def
init_bias_residual
(
self
):
...
...
@@ -160,5 +169,21 @@ class TestCUDNNExhaustiveSearch(TestConv2dFusionOp):
self
.
exhaustive_search
=
True
class
TestMultipleOutputs
(
TestConv2dFusionOp
):
def
init_test_case
(
self
):
self
.
pad
=
[
1
,
1
]
self
.
stride
=
[
1
,
1
]
self
.
input_size
=
[
1
,
32
,
17
,
17
]
# NCHW
assert
np
.
mod
(
self
.
input_size
[
1
],
self
.
groups
)
==
0
f_c
=
self
.
input_size
[
1
]
//
self
.
groups
self
.
filter_size
=
[
126
,
f_c
,
3
,
3
]
self
.
channels
=
[
84
,
42
]
def
set_outputs
(
self
):
out1
=
self
.
output
[:,
0
:
84
,
:,
:]
out2
=
self
.
output
[:,
84
:
126
,
:,
:]
self
.
outputs
[
'Outputs'
]
=
[(
'out1'
,
out1
),
(
'out2'
,
out2
)]
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
48324c32
...
...
@@ -243,6 +243,10 @@ class TestBook(unittest.TestCase):
pool
,
mask
=
layers
.
adaptive_pool2d
(
x
,
[
3
,
3
],
require_index
=
True
)
self
.
assertIsNotNone
(
pool
)
self
.
assertIsNotNone
(
mask
)
self
.
assertIsNotNone
(
layers
.
adaptive_pool2d
(
x
,
3
,
pool_type
=
'avg'
))
pool
,
mask
=
layers
.
adaptive_pool2d
(
x
,
3
,
require_index
=
True
)
self
.
assertIsNotNone
(
pool
)
self
.
assertIsNotNone
(
mask
)
def
test_adaptive_pool3d
(
self
):
program
=
Program
()
...
...
@@ -255,6 +259,10 @@ class TestBook(unittest.TestCase):
x
,
[
3
,
3
,
3
],
require_index
=
True
)
self
.
assertIsNotNone
(
pool
)
self
.
assertIsNotNone
(
mask
)
self
.
assertIsNotNone
(
layers
.
adaptive_pool3d
(
x
,
3
,
pool_type
=
'avg'
))
pool
,
mask
=
layers
.
adaptive_pool3d
(
x
,
3
,
require_index
=
True
)
self
.
assertIsNotNone
(
pool
)
self
.
assertIsNotNone
(
mask
)
def
test_lstm_unit
(
self
):
program
=
Program
()
...
...
python/paddle/fluid/tests/unittests/testsuite.py
浏览文件 @
48324c32
...
...
@@ -137,9 +137,9 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
var_dict
=
{}
for
var_proto
in
proto_list
:
var_name
=
str
(
var_proto
.
name
)
if
(
var_name
not
in
np_list
)
and
var_proto
.
dispensable
:
continue
if
is_input
:
if
(
var_name
not
in
np_list
)
and
var_proto
.
dispensable
:
continue
assert
(
var_name
in
np_list
)
or
(
var_proto
.
dispensable
),
\
"Missing {} as input"
.
format
(
var_name
)
if
var_proto
.
duplicable
:
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
48324c32
...
...
@@ -125,14 +125,23 @@ def slice_variable(var_list, slice_count, min_block_size):
class
DistributeTranspilerConfig
(
object
):
"""
Args:
slice_var_up (bool): Do Tensor slice for pservers, default is True.
split_method (PSDispatcher): RoundRobin or HashName can be used
try to choose the best method to balance loads for pservers.
min_block_size (int): Minimum splitted element number in block.
According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
.. py:attribute:: slice_var_up (bool)
Do Tensor slice for pservers, default is True.
.. py:attribute:: split_method (PSDispatcher)
RoundRobin or HashName can be used.
Try to choose the best method to balance loads for pservers.
.. py:attribute:: min_block_size (int)
Minimum number of splitted elements in block.
According to : https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
We can use bandwidth effiently when data size is larger than 2MB.If you
want to change it, please be sure you see the slice_variable function.
want to change it, please be sure you have read the slice_variable function.
"""
slice_var_up
=
True
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录