Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
5eb87506
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5eb87506
编写于
2月 21, 2019
作者:
X
Xin Pan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add per kernel config and remove const_cast.
test=develop
上级
a7e7d952
变更
13
隐藏空白更改
内联
并排
Showing
13 changed file
with
205 addition
and
186 deletion
+205
-186
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+9
-2
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+127
-2
paddle/fluid/framework/var_type_traits.h
paddle/fluid/framework/var_type_traits.h
+0
-5
paddle/fluid/imperative/layer.cc
paddle/fluid/imperative/layer.cc
+2
-1
paddle/fluid/imperative/layer.h
paddle/fluid/imperative/layer.h
+3
-2
paddle/fluid/imperative/tracer.cc
paddle/fluid/imperative/tracer.cc
+1
-1
paddle/fluid/operators/beam_search_decode_op.cc
paddle/fluid/operators/beam_search_decode_op.cc
+1
-1
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+14
-45
paddle/fluid/operators/conv_cudnn_op_cache.h
paddle/fluid/operators/conv_cudnn_op_cache.h
+1
-95
paddle/fluid/operators/conv_fusion_op.cu.cc
paddle/fluid/operators/conv_fusion_op.cu.cc
+9
-22
paddle/fluid/operators/conv_op.cc
paddle/fluid/operators/conv_op.cc
+34
-5
paddle/fluid/platform/temporary_allocator_test.cc
paddle/fluid/platform/temporary_allocator_test.cc
+4
-4
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+0
-1
未找到文件。
paddle/fluid/framework/operator.cc
浏览文件 @
5eb87506
...
...
@@ -921,7 +921,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
OpKernelMap
&
kernels
=
kernels_iter
->
second
;
auto
expected_kernel_key
=
this
->
GetExpectedKernelType
(
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
,
ctx
));
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
,
ctx
,
nullptr
));
VLOG
(
3
)
<<
"expected_kernel_key:"
<<
expected_kernel_key
;
auto
kernel_iter
=
kernels
.
find
(
expected_kernel_key
);
...
...
@@ -940,6 +940,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
KernelTypeToString
(
expected_kernel_key
));
}
auto
config_iter
=
kernel_configs_map_
.
find
(
expected_kernel_key
);
std
::
vector
<
KernelConfig
>*
kernel_configs
=
nullptr
;
if
(
config_iter
!=
kernel_configs_map_
.
end
())
{
kernel_configs
=
&
(
config_iter
->
second
);
}
// do data transformScope &transfer_scope;
std
::
vector
<
std
::
string
>
transfered_inplace_vars
;
auto
*
transfer_scope
=
...
...
@@ -957,7 +963,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
this
->
InferShape
(
&
infer_shape_ctx
);
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs.
kernel_iter
->
second
(
ExecutionContext
(
*
this
,
exec_scope
,
*
dev_ctx
,
ctx
));
kernel_iter
->
second
(
ExecutionContext
(
*
this
,
exec_scope
,
*
dev_ctx
,
ctx
,
kernel_configs
));
if
(
!
transfered_inplace_vars
.
empty
())
{
// there is inplace variable has been transfered.
...
...
paddle/fluid/framework/operator.h
浏览文件 @
5eb87506
...
...
@@ -184,12 +184,125 @@ class OperatorBase {
const
platform
::
Place
&
place
)
const
=
0
;
};
template
<
typename
TAlgorithm
>
class
AlgorithmsCache
{
public:
AlgorithmsCache
()
:
search_times_
(
0
)
{
hash_
.
clear
();
}
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
// can set for different data type
std
::
function
<
TAlgorithm
()
>
gen_func
);
TAlgorithm
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
);
private:
std
::
unordered_map
<
int64_t
,
TAlgorithm
>
hash_
;
std
::
mutex
mutex_
;
int
search_times_
;
};
template
<
typename
TAlgorithm
>
TAlgorithm
framework
::
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
int64_t
seed
=
0
;
// Hash all of the inputs, use to try and look up a previously
// discovered algorithm, or fall back to generating a new one.
std
::
hash
<
int64_t
>
hashFn
;
// do hash like boost
// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
for
(
const
auto
num
:
dims1
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
);
}
for
(
const
auto
num
:
dims2
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
1
;
}
for
(
const
auto
num
:
strides
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
2
;
}
for
(
const
auto
num
:
paddings
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
3
;
}
for
(
const
auto
num
:
dilations
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
4
;
}
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
algorithmFlags
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
5
;
if
(
seed
==
0
)
return
gen_func
();
if
(
hash_
.
find
(
seed
)
==
hash_
.
end
())
{
TAlgorithm
value
=
gen_func
();
hash_
[
seed
]
=
value
;
}
return
hash_
[
seed
];
}
template
<
typename
TAlgorithm
>
TAlgorithm
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
if
(
hash_
.
find
(
area
)
!=
hash_
.
end
())
{
return
hash_
[
area
];
}
if
(
search_times_
<
search_times
)
{
auto
algo
=
gen_func
();
hash_
[
area
]
=
algo
;
++
search_times_
;
return
algo
;
}
TAlgorithm
algo
;
int64_t
min
=
static_cast
<
uint64_t
>
(
INT_MAX
);
for
(
const
auto
&
m
:
hash_
)
{
if
(
m
.
first
<
min
)
{
min
=
m
.
first
;
algo
=
m
.
second
;
}
}
return
algo
;
}
#ifdef PADDLE_WITH_CUDA
using
KernelConfig
=
boost
::
variant
<
std
::
shared_ptr
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
,
std
::
shared_ptr
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
,
std
::
shared_ptr
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>>
;
#else
using
KernelConfig
=
boost
::
variant
<
boost
::
blank
>
;
#endif
using
OpKernelConfigsMap
=
std
::
unordered_map
<
OpKernelType
,
std
::
vector
<
KernelConfig
>
,
OpKernelType
::
Hash
>
;
class
ExecutionContext
{
public:
ExecutionContext
(
const
OperatorBase
&
op
,
const
Scope
&
scope
,
const
platform
::
DeviceContext
&
device_context
,
const
RuntimeContext
&
ctx
)
:
op_
(
op
),
scope_
(
scope
),
device_context_
(
device_context
),
ctx_
(
ctx
)
{}
const
RuntimeContext
&
ctx
,
std
::
vector
<
KernelConfig
>*
configs
)
:
op_
(
op
),
scope_
(
scope
),
device_context_
(
device_context
),
ctx_
(
ctx
),
kernel_configs_
(
configs
)
{}
const
OperatorBase
&
op
()
const
{
return
op_
;
}
...
...
@@ -398,11 +511,20 @@ class ExecutionContext {
return
temp_tensor
;
}
template
<
typename
T
>
T
&
GetKernelConfig
(
int
idx
)
const
{
PADDLE_ENFORCE
(
kernel_configs_
&&
kernel_configs_
->
size
()
>
idx
,
"%s selected kernel doesn't have kernel config %lu <= %d"
,
op_
.
Type
().
c_str
(),
kernel_configs_
->
size
(),
idx
);
return
*
boost
::
get
<
std
::
shared_ptr
<
T
>>
(
kernel_configs_
->
at
(
idx
));
}
private:
const
OperatorBase
&
op_
;
const
Scope
&
scope_
;
const
platform
::
DeviceContext
&
device_context_
;
const
RuntimeContext
&
ctx_
;
mutable
std
::
vector
<
KernelConfig
>*
kernel_configs_
;
};
template
<>
...
...
@@ -508,6 +630,9 @@ class OperatorWithKernel : public OperatorBase {
void
TransferInplaceVarsBack
(
const
Scope
&
scope
,
const
std
::
vector
<
std
::
string
>&
inplace_vars
,
const
Scope
&
exec_scope
)
const
;
protected:
mutable
OpKernelConfigsMap
kernel_configs_map_
;
};
extern
bool
OpSupportGPU
(
const
std
::
string
&
op_type
);
...
...
paddle/fluid/framework/var_type_traits.h
浏览文件 @
5eb87506
...
...
@@ -50,8 +50,6 @@ class Scope;
}
// namespace framework
namespace
operators
{
template
<
typename
T
>
class
AlgorithmsCache
;
class
CudnnRNNCache
;
...
...
@@ -144,9 +142,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#ifndef _WIN32
ncclUniqueId
,
platform
::
Communicator
,
#endif
operators
::
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>
,
operators
::
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>
,
operators
::
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>
,
operators
::
CudnnRNNCache
,
#endif
int
,
float
>
;
...
...
paddle/fluid/imperative/layer.cc
浏览文件 @
5eb87506
...
...
@@ -249,7 +249,8 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
framework
::
Scope
scope
;
PreparedOp
p
=
PreparedOp
::
Prepare
(
ctx
,
*
op_kernel
,
place_
);
p
.
op
.
RuntimeInferShape
(
scope
,
place_
,
ctx
);
p
.
func
(
framework
::
ExecutionContext
(
p
.
op
,
scope
,
*
p
.
dev_ctx
,
p
.
ctx
));
p
.
func
(
framework
::
ExecutionContext
(
p
.
op
,
scope
,
*
p
.
dev_ctx
,
p
.
ctx
,
nullptr
));
}
}
...
...
paddle/fluid/imperative/layer.h
浏览文件 @
5eb87506
...
...
@@ -64,8 +64,9 @@ class PreparedOp {
framework
::
OperatorWithKernel
::
OpKernelMap
&
kernels
=
kernels_iter
->
second
;
auto
expected_kernel_key
=
op
.
GetExpectedKernelType
(
framework
::
ExecutionContext
(
op
,
framework
::
Scope
(),
*
dev_ctx
,
ctx
));
auto
expected_kernel_key
=
op
.
GetExpectedKernelType
(
framework
::
ExecutionContext
(
op
,
framework
::
Scope
(),
*
dev_ctx
,
ctx
,
nullptr
));
VLOG
(
3
)
<<
"expected_kernel_key:"
<<
expected_kernel_key
;
auto
kernel_iter
=
kernels
.
find
(
expected_kernel_key
);
...
...
paddle/fluid/imperative/tracer.cc
浏览文件 @
5eb87506
...
...
@@ -139,7 +139,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
PreparedOp
prepared_op
=
PreparedOp
::
Prepare
(
ctx
,
*
op_kernel
,
op
->
place_
);
prepared_op
.
op
.
RuntimeInferShape
(
scope
,
op
->
place_
,
ctx
);
prepared_op
.
func
(
framework
::
ExecutionContext
(
prepared_op
.
op
,
scope
,
*
prepared_op
.
dev_ctx
,
prepared_op
.
ctx
));
prepared_op
.
op
,
scope
,
*
prepared_op
.
dev_ctx
,
prepared_op
.
ctx
,
nullptr
));
if
(
!
stop_gradient
)
{
std
::
unique_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
string
>>
grad_to_var
(
...
...
paddle/fluid/operators/beam_search_decode_op.cc
浏览文件 @
5eb87506
...
...
@@ -123,7 +123,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
framework
::
RuntimeContext
run_ctx
(
Inputs
(),
Outputs
(),
scope
);
framework
::
ExecutionContext
ctx
(
*
this
,
scope
,
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
*
this
,
scope
,
dev_ctx
,
run_ctx
,
nullptr
);
const
LoDTensorArray
*
ids
=
ctx
.
Input
<
LoDTensorArray
>
(
"Ids"
);
const
LoDTensorArray
*
scores
=
ctx
.
Input
<
LoDTensorArray
>
(
"Scores"
);
...
...
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
5eb87506
...
...
@@ -42,6 +42,7 @@ using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using
DataLayout
=
platform
::
DataLayout
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
using
framework
::
AlgorithmsCache
;
template
<
typename
T
>
class
CUDNNConvOpKernel
:
public
framework
::
OpKernel
<
T
>
{
...
...
@@ -169,18 +170,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
workspace_size_limit
,
&
algo
));
VLOG
(
3
)
<<
"cuDNN forward algo "
<<
algo
;
}
else
if
(
exhaustive_search
&&
(
!
half_float
))
{
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>*
algo_cache
=
nullptr
;
if
(
ctx
.
scope
().
FindVar
(
kCUDNNFwdAlgoCache
))
{
algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
else
{
algo_cache
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>&
algo_cache
=
ctx
.
GetKernelConfig
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
(
0
);
cudnn_workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
(
...
...
@@ -188,7 +179,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
dev_ctx
);
cudnn_workspace_ptr
=
static_cast
<
void
*>
(
cudnn_workspace
.
data
<
int8_t
>
());
algo
=
algo_cache
->
GetAlgorithm
(
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
kNUM_CUDNN_FWD_ALGS
>
...
...
@@ -382,22 +373,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
if
(
input_grad
)
{
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
exhaustive_search
)
{
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>*
data_algo_cache
;
if
(
ctx
.
scope
().
FindVar
(
kCUDNNBwdDataAlgoCache
))
{
data_algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNBwdDataAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
();
}
else
{
data_algo_cache
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
(
kCUDNNBwdDataAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
();
}
data_algo
=
data_algo_cache
->
GetAlgorithm
(
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>&
data_algo_cache
=
ctx
.
GetKernelConfig
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
(
0
);
data_algo
=
data_algo_cache
.
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionBwdDataAlgoPerf_t
,
...
...
@@ -448,22 +428,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
if
(
filter_grad
)
{
T
*
filter_grad_data
=
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
exhaustive_search
)
{
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>*
f_algo_cache
;
if
(
ctx
.
scope
().
FindVar
(
kCUDNNBwdFilterAlgoCache
))
{
f_algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNBwdFilterAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
();
}
else
{
f_algo_cache
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
(
kCUDNNBwdFilterAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
();
}
filter_algo
=
f_algo_cache
->
GetAlgorithm
(
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>&
f_algo_cache
=
ctx
.
GetKernelConfig
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
(
1
);
filter_algo
=
f_algo_cache
.
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionBwdFilterAlgoPerf_t
,
...
...
paddle/fluid/operators/conv_cudnn_op_cache.h
浏览文件 @
5eb87506
...
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <functional>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64
(
conv_workspace_size_limit
);
...
...
@@ -46,100 +47,5 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
static
constexpr
size_t
kNUM_CUDNN_BWD_DATA_ALGS
=
5
;
#endif
template
<
typename
TAlgorithm
>
class
AlgorithmsCache
{
public:
AlgorithmsCache
()
:
search_times_
(
0
)
{
hash_
.
clear
();
}
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
// can set for different data type
std
::
function
<
TAlgorithm
()
>
gen_func
);
TAlgorithm
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
);
private:
std
::
unordered_map
<
int64_t
,
TAlgorithm
>
hash_
;
std
::
mutex
mutex_
;
int
search_times_
;
};
template
<
typename
TAlgorithm
>
TAlgorithm
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
int64_t
seed
=
0
;
// Hash all of the inputs, use to try and look up a previously
// discovered algorithm, or fall back to generating a new one.
std
::
hash
<
int64_t
>
hashFn
;
// do hash like boost
// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
for
(
const
auto
num
:
dims1
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
);
}
for
(
const
auto
num
:
dims2
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
1
;
}
for
(
const
auto
num
:
strides
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
2
;
}
for
(
const
auto
num
:
paddings
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
3
;
}
for
(
const
auto
num
:
dilations
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
4
;
}
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
algorithmFlags
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
5
;
if
(
seed
==
0
)
return
gen_func
();
if
(
hash_
.
find
(
seed
)
==
hash_
.
end
())
{
TAlgorithm
value
=
gen_func
();
hash_
[
seed
]
=
value
;
}
return
hash_
[
seed
];
}
template
<
typename
TAlgorithm
>
TAlgorithm
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
if
(
hash_
.
find
(
area
)
!=
hash_
.
end
())
{
return
hash_
[
area
];
}
if
(
search_times_
<
search_times
)
{
auto
algo
=
gen_func
();
hash_
[
area
]
=
algo
;
++
search_times_
;
return
algo
;
}
TAlgorithm
algo
;
int64_t
min
=
static_cast
<
uint64_t
>
(
INT_MAX
);
for
(
const
auto
&
m
:
hash_
)
{
if
(
m
.
first
<
min
)
{
min
=
m
.
first
;
algo
=
m
.
second
;
}
}
return
algo
;
}
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/conv_fusion_op.cu.cc
浏览文件 @
5eb87506
...
...
@@ -30,6 +30,8 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using
ScopedConvolutionDescriptor
=
platform
::
ScopedConvolutionDescriptor
;
using
ScopedActivationDescriptor
=
platform
::
ScopedActivationDescriptor
;
using
DataLayout
=
platform
::
DataLayout
;
using
framework
::
AlgorithmsCache
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
...
...
@@ -139,38 +141,23 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
}
return
fwd_perf_stat
[
0
].
algo
;
};
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>*
algo_cache
=
nullptr
;
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>&
algo_cache
=
ctx
.
GetKernelConfig
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
(
0
);
int
search_times
=
ctx
.
Attr
<
int
>
(
"search_times"
);
search_times
=
std
::
max
(
static_cast
<
int
>
(
FLAGS_cudnn_exhaustive_search_times
),
search_times
);
// TODO(dangqingqing): Unify this if-else.
if
(
search_times
>
0
)
{
// The searched algo will be cached by `search_times` times for
// different input dimension. For other dimensions, select the algo
// of closest area.
auto
var_name
=
ctx
.
Inputs
(
"AlgoCache"
)[
0
];
algo_cache
=
ctx
.
scope
()
.
FindVar
(
var_name
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
algo
=
algo_cache
->
GetAlgorithm
(
x_dims
[
2
]
*
x_dims
[
3
],
search_times
,
0
,
search_func
);
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
[
2
]
*
x_dims
[
3
],
search_times
,
0
,
search_func
);
}
else
{
// Cache searched algo in Var(kCUDNNFwdAlgoCache).
// all conv ops use the same kCUDNNFwdAlgoCache variable.
if
(
ctx
.
scope
().
FindVar
(
kCUDNNFwdAlgoCache
))
{
algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
else
{
// TODO(qingqing) remove const_cast
algo_cache
=
const_cast
<
framework
::
Scope
*>
(
ctx
.
scope
().
parent
())
->
Var
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
algo
=
algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
search_func
);
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
search_func
);
}
VLOG
(
3
)
<<
"choose algo "
<<
algo
;
}
...
...
paddle/fluid/operators/conv_op.cc
浏览文件 @
5eb87506
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
...
...
@@ -109,8 +110,20 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
"float16 can only be used when CUDNN is used"
);
}
return
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
(),
layout
,
library
,
customized_type_value
);
auto
type
=
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
(),
layout
,
library
,
customized_type_value
);
#ifdef PADDLE_WITH_CUDA
std
::
vector
<
framework
::
KernelConfig
>&
configs
=
kernel_configs_map_
[
type
];
// TODO(dangqingqing): Currently conv_fusion_op use cudnn but sets use_cudnn
// to false. It should be fixed and then here should only create if library
// is kCUDNN.
if
(
configs
.
empty
())
{
std
::
shared_ptr
<
framework
::
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
p
(
new
framework
::
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>
());
configs
.
push_back
(
p
);
}
#endif
return
type
;
}
void
Conv2DOpMaker
::
Make
()
{
...
...
@@ -410,9 +423,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
}
#endif
return
framework
::
OpKernelType
(
ctx
.
Input
<
Tensor
>
(
"Input"
)
->
type
(),
ctx
.
GetPlace
(),
layout_
,
library_
,
customized_type_value
);
auto
type
=
framework
::
OpKernelType
(
ctx
.
Input
<
Tensor
>
(
"Input"
)
->
type
(),
ctx
.
GetPlace
(),
layout_
,
library_
,
customized_type_value
);
#ifdef PADDLE_WITH_CUDA
if
(
library_
==
framework
::
LibraryType
::
kCUDNN
)
{
std
::
vector
<
framework
::
KernelConfig
>&
configs
=
kernel_configs_map_
[
type
];
if
(
configs
.
empty
())
{
std
::
shared_ptr
<
framework
::
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
p
(
new
framework
::
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>
());
configs
.
push_back
(
p
);
std
::
shared_ptr
<
framework
::
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
p2
(
new
framework
::
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>
());
configs
.
push_back
(
p2
);
}
}
#endif
return
type
;
}
class
Conv2dGradMaker
:
public
framework
::
SingleGradOpDescMaker
{
...
...
paddle/fluid/platform/temporary_allocator_test.cc
浏览文件 @
5eb87506
...
...
@@ -141,7 +141,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CPUDeviceContext
*>
(
pool
.
Get
(
cpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
,
nullptr
);
int
numel
=
memory_size
/
sizeof
(
float
);
framework
::
Tensor
tensor
=
...
...
@@ -156,7 +156,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
gpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
,
nullptr
);
int
numel
=
memory_size
/
sizeof
(
float
);
framework
::
Tensor
tensor
=
ctx
.
AllocateTmpTensor
<
float
,
platform
::
CUDADeviceContext
>
(
...
...
@@ -179,7 +179,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CPUDeviceContext
*>
(
pool
.
Get
(
cpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
,
nullptr
);
int
numel
=
memory_size
/
sizeof
(
float
);
framework
::
Tensor
out_side_tensor
;
...
...
@@ -200,7 +200,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
gpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
,
nullptr
);
size_t
memory_size
=
500
;
int
numel
=
memory_size
/
sizeof
(
float
);
...
...
python/paddle/fluid/framework.py
浏览文件 @
5eb87506
...
...
@@ -723,7 +723,6 @@ class Operator(object):
self
.
_update_desc_attr
(
attr_name
,
attr_val
)
self
.
desc
.
check_attrs
()
if
self
.
_has_kernel
(
type
):
self
.
desc
.
infer_var_type
(
self
.
block
.
desc
)
self
.
desc
.
infer_shape
(
self
.
block
.
desc
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录