机器未来 / Paddle  (forked from PaddlePaddle / Paddle)
Commit e06f2f33
Authored Oct 09, 2018 by shippingwang

    Merge branch 'release/1.0.0' of https://github.com/PaddlePaddle/Paddle into release/1.0.0

Parents: 3023eccb, 627bea4f

Showing 14 changed files with 220 additions and 831 deletions (+220, -831)
Changed files:

  cmake/flags.cmake                                         +6    -2
  paddle/fluid/framework/details/reference_count_pass.cc    +9    -9
  paddle/fluid/framework/parallel_executor.cc               +7    -0
  paddle/fluid/operators/CMakeLists.txt                     +0    -1
  paddle/fluid/operators/conv_op.h                          +3    -4
  paddle/fluid/operators/conv_transpose_op.h                +3    -4
  paddle/fluid/operators/cub_reduce.h                       +0    -322
  paddle/fluid/operators/math/depthwise_conv.cu             +156  -323
  paddle/fluid/operators/math/depthwise_conv.h              +1    -4
  paddle/fluid/operators/reduce_mean_op.cu                  +9    -56
  paddle/fluid/operators/reduce_sum_op.cu                   +9    -51
  python/paddle/fluid/layers/io.py                          +7    -0
  python/paddle/fluid/layers/nn.py                          +3    -3
  python/paddle/fluid/tests/unittests/test_conv2d_op.py     +7    -52
cmake/flags.cmake  (+6, -2)

@@ -138,13 +138,17 @@ if (APPLE)
         # On Mac OS X build fat binaries with x86_64 architectures by default.
         set(CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
     endif()
 else()
+    # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0
+    set(COMMON_FLAGS -Wno-deprecated-register)
+endif(APPLE)
+
+if(LINUX)
     set(GPU_COMMON_FLAGS
         -Wall
         -Wextra
         -Werror
         ${GPU_COMMON_FLAGS})
-endif()
+endif(LINUX)
 
 if(UNIX AND NOT APPLE)
   # except apple from nix*Os family
paddle/fluid/framework/details/reference_count_pass.cc  (+9, -9)

@@ -80,16 +80,16 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
         // This is weird but there is really some variables without var_desc
         // in computation_op
         if (var_desc == nullptr) {
-          if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr)
-            continue;
-        } else {
-          if (var_desc->Persistable()) continue;
-          auto var_type = var_desc->Proto()->type().type();
-          if (var_type != proto::VarType::LOD_TENSOR &&
-              var_type != proto::VarType::SELECTED_ROWS) {
-            continue;
-          }
+          var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name);
+          if (var_desc == nullptr) continue;
         }
+
+        if (var_desc->Persistable()) continue;
+        auto var_type = var_desc->Proto()->type().type();
+        if (var_type != proto::VarType::LOD_TENSOR &&
+            var_type != proto::VarType::SELECTED_ROWS) {
+          continue;
+        }
 
         // compute op only runs in one device
         if (ref_cnts[place.device]->count(var_name))
paddle/fluid/framework/parallel_executor.cc  (+7, -0)

@@ -319,6 +319,13 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
 #ifdef PADDLE_WITH_CUDA
   if (!gcs_.empty()) {
     ResetReferenceCount();
+    for (auto &pair : cur_ref_cnts_) {
+      auto &name_map = *(pair.second);
+      for (auto &fetch_name : fetch_tensors) {
+        name_map.erase(fetch_name);
+      }
+      name_map.erase(fetched_var_name);
+    }
   }
 #endif
   auto fetch_data = member_->executor_->Run(fetch_tensors);
paddle/fluid/operators/CMakeLists.txt  (+0, -1)

@@ -301,7 +301,6 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute)
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
     op_library(layer_norm_op DEPS cub)
-    op_library(reduce_mean_op DEPS cub)
 else()
     op_library(conv_op DEPS vol2col im2col)
 endif()
paddle/fluid/operators/conv_op.h  (+3, -4)

@@ -380,8 +380,7 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
     math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
 
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
+    depthwiseConv(dev_ctx, *input, filter, strides, paddings,
                   output);
   }
 };

@@ -416,14 +415,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
       input_grad->mutable_data<T>(context.GetPlace());
       set_zero(dev_ctx, input_grad, static_cast<T>(0));
       depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
-                             paddings, dilations, input_grad);
+                             paddings, input_grad);
     }
 
     if (filter_grad) {
       filter_grad->mutable_data<T>(context.GetPlace());
       set_zero(dev_ctx, filter_grad, static_cast<T>(0));
       depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
-                              dilations, filter_grad);
+                              filter_grad);
     }
   }
 };
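A side note on what the dropped dilations argument controls: dilation stretches the effective kernel window, which changes the output size of the convolution. A minimal Python sketch of that arithmetic (illustrative only, not code from this patch; the helper name is invented):

    # Output size of a convolution along one spatial axis, with and without
    # dilation. The effective kernel extent grows to dilation*(k-1)+1.
    def conv_output_size(input_size, kernel_size, stride, padding, dilation=1):
        effective_kernel = dilation * (kernel_size - 1) + 1
        return (input_size + 2 * padding - effective_kernel) // stride + 1

    assert conv_output_size(5, 3, stride=2, padding=1, dilation=1) == 3
    assert conv_output_size(5, 3, stride=1, padding=1, dilation=2) == 3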
paddle/fluid/operators/conv_transpose_op.h  (+3, -4)

@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
     math::DepthwiseConvInputGradFunctor<DeviceContext, T>
         depthwiseConvInputGrad;
     depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
-                           dilations, output);
+                           output);
   }
 };

@@ -367,11 +367,10 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
     auto& dev_ctx = context.template device_context<DeviceContext>();
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
 
     if (input_grad) {
       math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
-      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations,
+      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
                     input_grad);
     }

@@ -383,7 +382,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
     math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
         depthwiseConvFilterGrad;
     depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
-                            dilations, filter_grad);
+                            filter_grad);
     }
   }
 };
paddle/fluid/operators/cub_reduce.h  (deleted, 100644 → 0; -322)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <cmath>
#include <numeric>
#include <set>
#include <vector>

#include <cub/cub.cuh>  // NOLINT
#include "paddle/fluid/framework/tensor.h"

namespace paddle {
namespace operators {
namespace detail {

template <typename T, size_t ElementCount>
struct Array {
 public:
  HOSTDEVICE inline Array() {}

  HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; }

  HOSTDEVICE inline const T& operator[](size_t index) const {
    return data_[index];
  }

  HOSTDEVICE constexpr inline size_t size() const { return ElementCount; }

  template <typename VectorLikeType>
  static inline Array<T, ElementCount> From(const VectorLikeType& vec) {
    PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match");
    size_t n = static_cast<size_t>(vec.size());
    Array<T, ElementCount> ret;
    for (size_t i = 0; i < n; ++i) ret[i] = vec[i];
    return ret;
  }

 private:
  T data_[ElementCount];
};

// reduce the last axis of 2d array
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
          int BlockDim>
__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
                               TransformOp transformer, Ty init,
                               int reduce_num) {
  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
  int idx_x = blockIdx.x * reduce_num;
  int idx_y = threadIdx.x;
  Ty reduce_var = init;
  for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim)
    reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y]));

  reduce_var =
      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);

  if (threadIdx.x == 0) {
    y[blockIdx.x] = reduce_var;
  }
}

template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
          int BlockDim, int Rank, int ReduceRank>
__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
                             TransformOp transformer, Ty init, int reduce_num,
                             Array<int, Rank> x_strides,
                             Array<int, ReduceRank> reduce_dim,
                             Array<int, ReduceRank> reduce_strides,
                             Array<int, Rank - ReduceRank> left_dim,
                             Array<int, Rank - ReduceRank> left_strides) {
  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
  Array<int, Rank> sub_index;
  int left_idx = blockIdx.x;
  for (int i = 0; i < Rank - ReduceRank; ++i) {
    sub_index[left_dim[i]] = left_idx / left_strides[i];
    left_idx %= left_strides[i];
  }

  int reduce_idx = threadIdx.x;
  for (int j = 0; j < ReduceRank; ++j) {
    sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
    reduce_idx %= reduce_strides[j];
  }

  int idx_x = 0;
  for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
  Ty reduce_var = static_cast<Ty>(transformer(x[idx_x]));

  for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) {
    int reduce_idx = i;
    for (int j = 0; j < ReduceRank; ++j) {
      sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
      reduce_idx %= reduce_strides[j];
    }

    int idx_x = 0;
    for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
    reduce_var = static_cast<Ty>(reducer(reduce_var, transformer(x[idx_x])));
  }

  reduce_var =
      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);

  if (threadIdx.x == 0) {
    y[blockIdx.x] = reduce_var;
  }
}

static inline std::vector<int> GetStrides(const std::vector<int>& dims) {
  int n = static_cast<int>(dims.size());
  if (n == 0) return std::vector<int>();

  std::vector<int> strides(n);
  strides.back() = 1;
  for (int i = n - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * dims[i + 1];
  }
  return strides;
}

static inline std::vector<int> GetStrides(const std::vector<int>& dims,
                                          const std::vector<int>& idx) {
  int n = static_cast<int>(idx.size());
  if (n == 0) return std::vector<int>();

  std::vector<int> strides(n);
  strides.back() = 1;
  for (int i = n - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * dims[idx[i + 1]];
  }
  return strides;
}

constexpr int kMaxBlockDim = 512;

static inline int GetDesiredBlockDim(int block_dim) {
  return block_dim >= kMaxBlockDim
             ? kMaxBlockDim
             : (1 << static_cast<int>(std::log2(block_dim)));
}

template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
          typename TransformOp>
static void TensorReduceImpl(
    const Tx* x_data, Ty* y_data, const platform::Place& place,
    const ReduceOp& reducer, const TransformOp& transformer, const Ty& init,
    int left_num, int reduce_num, const std::vector<int>& x_strides,
    const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
    const std::vector<int>& left_dim, const std::vector<int>& left_strides,
    cudaStream_t stream) {
#define CUB_RANK_CASE(i, ...)             \
  case i: {                               \
    constexpr auto kRank = i;             \
    switch (reduce_rank) { __VA_ARGS__; } \
  } break

#define CUB_REDUCE_RANK_CASE(i, ...)                              \
  case i: {                                                       \
    constexpr auto kReduceRank = i;                               \
    ReduceKernel<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank,  \
                 kReduceRank><<<left_num, BlockDim, 0, stream>>>( \
        x_data, y_data, reducer, transformer, init, reduce_num,   \
        Array<int, kRank>::From(x_strides),                       \
        Array<int, kReduceRank>::From(reduce_dim),                \
        Array<int, kReduceRank>::From(reduce_strides),            \
        Array<int, kRank - kReduceRank>::From(left_dim),          \
        Array<int, kRank - kReduceRank>::From(left_strides));     \
  } break

  int rank = x_strides.size();
  int reduce_rank = reduce_strides.size();
  if (rank == reduce_rank) {
    cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
        x_data, transformer);
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
                              reduce_num, reducer, init, stream);
    framework::Tensor tmp;
    auto* temp_storage = tmp.mutable_data<uint8_t>(
        framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
        place);
    cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
                              reduce_num, reducer, init, stream);
    return;
  }
  if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
    ReduceKernel2D<Tx, Ty, ReduceOp, TransformOp,
                   BlockDim><<<left_num, BlockDim, 0, stream>>>(
        x_data, y_data, reducer, transformer, init, reduce_num);
    return;
  }
  /*
  if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) {
    // TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced.
    // Currently, it is handled by code below, but inefficient
    return;
  }
  */

  switch (rank) {
    CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););

    CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2););

    CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3););

    CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4););

    CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
                  CUB_REDUCE_RANK_CASE(5););

    CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););

    CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););

    CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);
                  CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8););
  }

#undef CUB_REDUCE_RANK_CASE
#undef CUB_RANK_CASE
}

}  // namespace detail

template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
                  std::vector<int> origin_reduce_dims, const Ty& init,
                  const ReduceOp& reducer, const TransformOp& transformer,
                  cudaStream_t stream) {
  auto x_dim = framework::vectorize2int(x.dims());
  std::vector<int> new_x_dim, new_reduce_dims;
  int is_reduced = 0;
  for (auto e : origin_reduce_dims) {
    auto pos = e >= 0 ? e : e + x_dim.size();
    is_reduced |= 1 << e;
  }
  for (int i = 0; i < x_dim.size(); i++) {
    if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
      new_x_dim.push_back(x_dim[i]);
      if ((is_reduced >> i) & 1)
        new_reduce_dims.push_back(new_x_dim.size() - 1);
    } else {
      new_x_dim[new_x_dim.size() - 1] *= x_dim[i];
    }
  }
  x_dim = new_x_dim;
  origin_reduce_dims = new_reduce_dims;
  int x_rank = static_cast<int>(x_dim.size());
  std::set<int> left_set, reduce_set;
  for (int i = 0; i < x_rank; ++i) left_set.insert(i);

  for (auto e : origin_reduce_dims) {
    left_set.erase(e);
    reduce_set.insert(e);
  }

  std::vector<int> reduce_dim(reduce_set.begin(), reduce_set.end());
  std::vector<int> left_dim(left_set.begin(), left_set.end());

  std::vector<int> x_strides = detail::GetStrides(x_dim);
  std::vector<int> reduce_strides = detail::GetStrides(x_dim, reduce_dim);
  std::vector<int> left_strides = detail::GetStrides(x_dim, left_dim);
  int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
  int left_num = 1;
  if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]];

  std::vector<int> y_dim(left_dim.size());
  for (int i = 0; i < left_dim.size(); ++i) {
    y_dim[i] = x_dim[left_dim[i]];
  }
  auto x_data = x.data<Tx>();
  auto y_data = y->mutable_data<Ty>(x.place());
  if (reduce_num == 1) return;

#define CUB_BLOCK_DIM_CASE(block_dim)                                  \
  case block_dim: {                                                    \
    constexpr auto kBlockDim = block_dim;                              \
    detail::TensorReduceImpl<Tx, Ty, block_dim, ReduceOp, TransformOp>( \
        x_data, y_data, x.place(), reducer, transformer, init,          \
        left_num, reduce_num, x_strides, reduce_dim, reduce_strides,    \
        left_dim, left_strides, stream);                                \
  } break

  switch (detail::GetDesiredBlockDim(reduce_num)) {
    CUB_BLOCK_DIM_CASE(512);
    CUB_BLOCK_DIM_CASE(256);
    CUB_BLOCK_DIM_CASE(128);
    CUB_BLOCK_DIM_CASE(64);
    CUB_BLOCK_DIM_CASE(32);
    CUB_BLOCK_DIM_CASE(16);
    CUB_BLOCK_DIM_CASE(8);
    CUB_BLOCK_DIM_CASE(4);
    CUB_BLOCK_DIM_CASE(2);
  }
#undef CUB_BLOCK_DIM_CASE
}

}  // namespace operators
}  // namespace paddle
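For reference, the GetStrides helpers in the removed header implement ordinary row-major strides, and ReduceKernel flattens an N-d index with idx_x += sub_index[k] * x_strides[k]. A minimal Python sketch of the same arithmetic (illustrative only, not part of the Paddle sources; names are invented):

    # Row-major strides: strides[i] = strides[i+1] * dims[i+1], last stride is 1.
    def get_strides(dims):
        strides = [1] * len(dims)
        for i in range(len(dims) - 2, -1, -1):
            strides[i] = strides[i + 1] * dims[i + 1]
        return strides

    # Flatten an N-d index with those strides.
    def flatten_index(sub_index, strides):
        return sum(i * s for i, s in zip(sub_index, strides))

    dims = [2, 3, 4]
    assert get_strides(dims) == [12, 4, 1]
    assert flatten_index([1, 2, 3], get_strides(dims)) == 23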
paddle/fluid/operators/math/depthwise_conv.cu  (+156, -323)

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <algorithm>
 #include <vector>
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@@ -21,54 +20,31 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-template <typename T>
-__inline__ __device__ T warpReduceSum(T val) {
-#if CUDA_VERSION < 9000
-  for (int offset = 16; offset > 0; offset /= 2)
-    val += __shfl_down(val, offset);
-  return val;
-#else
-#define FULL_MASK 0xffffffff
-  for (int offset = 16; offset > 0; offset /= 2)
-    val += __shfl_down_sync(FULL_MASK, val, offset);
-  return val;
-#endif
-}
-
-__forceinline__ __device__ unsigned lane_id() {
-  unsigned ret;
-  asm volatile("mov.u32 %0, %laneid;" : "=r"(ret));
-  return ret;
-}
-
-__forceinline__ __device__ unsigned warp_id() {
-  unsigned ret;
-  asm volatile("mov.u32 %0, %warpid;" : "=r"(ret));
-  return ret;
-}
-
 // A Cuda kernel to compute the depthwise convolution forward pass
 // in NCHW format.
 template <typename T>
-__device__ __inline__ void KernelDepthwiseConv(
-    const T* const input_data, const T* const filter_data,
-    const int batch_size, const int output_channels, const int output_height,
-    const int output_width, const int input_channels, const int input_height,
-    const int input_width, const int filter_multiplier,
-    const int filter_height, const int filter_width, const int stride_height,
-    const int stride_width, const int padding_height, const int padding_width,
-    const int dilate_height, const int dilate_width, T* const output_data) {
-  for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
-    for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
-      const int batch = blockIdx.y;
-      const int c_out = blockIdx.x;
+__global__ void KernelDepthwiseConv(
+    const int nthreads, const T* const input_data, const T* const filter_data,
+    const int batch_size, const int output_channels, const int output_height,
+    const int output_width, const int input_channels, const int input_height,
+    const int input_width, const int filter_multiplier,
+    const int filter_height, const int filter_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    T* const output_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int batch = index / output_channels / output_height / output_width;
+    const int c_out = (index / output_height / output_width) % output_channels;
+    const int h_out = (index / output_width) % output_height;
+    const int w_out = index % output_width;
     const int c_in = c_out / filter_multiplier;
     const T* weight = filter_data + c_out * filter_height * filter_width;
     T value = 0;
     const int h_in_start = -padding_height + h_out * stride_height;
     const int w_in_start = -padding_width + w_out * stride_width;
-    const int h_in_end = h_in_start + filter_height * dilate_height;
-    const int w_in_end = w_in_start + filter_width * dilate_width;
+    const int h_in_end = h_in_start + filter_height;
+    const int w_in_end = w_in_start + filter_width;
     const int in_offset =
         ((batch * input_channels + c_in) * input_height) * input_width;
@@ -77,212 +53,116 @@ __device__ __inline__ void KernelDepthwiseConv(
     const int h_end = h_in_end < input_height ? h_in_end : input_height;
     const int w_end = w_in_end < input_width ? w_in_end : input_width;
     const int h_start = h_in_start > 0 ? h_in_start : 0;
     const int w_start = w_in_start > 0 ? w_in_start : 0;
 
-      int weight_offset = 0;
-      for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) {
-        for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) {
-          if (h_in >= h_start && h_in < h_end && w_in >= w_start &&
-              w_in < w_end) {
-            const int offset = in_offset + h_in * input_width + w_in;
-            value += weight[weight_offset] * input_data[offset];
-          }
-          weight_offset++;
-        }
-      }
-      int index =
-          ((batch * gridDim.x + c_out) * output_height + h_out) *
-              output_width +
-          w_out;
-      output_data[index] = value;
-    }
-  }
-}
-
-template <typename T, int c_filter_multiplier, int c_stride>
-__global__ void KernelDepthwiseConvSp(
-    const T* const input_data, const T* const filter_data,
-    const int batch_size, const int output_channels, const int output_height,
-    const int output_width, const int input_channels, const int input_height,
-    const int input_width, const int filter_multiplier,
-    const int filter_height, const int filter_width, const int stride_height,
-    const int stride_width, const int padding_height, const int padding_width,
-    const int dilate_height, const int dilate_width, T* const output_data) {
-  if (c_filter_multiplier == 0)
-    KernelDepthwiseConv<T>(input_data, filter_data, batch_size,
-                           output_channels, output_height, output_width,
-                           input_channels, input_height, input_width,
-                           filter_multiplier, filter_height, filter_width,
-                           stride_height, stride_width, padding_height,
-                           padding_width, dilate_height, dilate_width,
-                           output_data);
-  else
-    KernelDepthwiseConv<T>(input_data, filter_data, batch_size,
-                           output_channels, output_height, output_width,
-                           input_channels, input_height, input_width,
-                           c_filter_multiplier, filter_height, filter_height,
-                           c_stride, c_stride, padding_height, padding_width,
-                           dilate_height, dilate_width, output_data);
-}
+    for (int h_in = h_start; h_in < h_end; h_in++) {
+      for (int w_in = w_start; w_in < w_end; w_in++) {
+        const int offset = in_offset + h_in * input_width + w_in;
+        value += weight[(h_in - h_in_start) * filter_width +
+                        (w_in - w_in_start)] *
+                 input_data[offset];
+      }
+    }
+    output_data[index] = value;
+  }
+}
 
 // CUDA kernel to compute the depthwise convolution backprop w.r.t input.
 template <typename T>
-__device__ __inline__ void KernelDepthwiseConvInputGrad(
-    const T* const output_grad_data, const T* const filter_data,
-    const int batch_size, const int output_channels, const int output_height,
-    const int output_width, const int input_channels, const int input_height,
-    const int input_width, const int filter_multiplier,
-    const int filter_height, const int filter_width, const int stride_height,
-    const int stride_width, const int padding_height, const int padding_width,
-    const int dilate_height, const int dilate_width,
-    T* const input_grad_data) {
-  for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
-    for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
-      const int batch = blockIdx.y;
-      const int c_in = blockIdx.x;
-      const int c_out_start = c_in * filter_multiplier;
-      int h_out_start =
-          h_in - (filter_height - 1) * dilate_height + padding_height;
-      int h_out_end = h_in + padding_height;
-      int w_out_start =
-          w_in - (filter_width - 1) * dilate_width + padding_width;
-      int w_out_end = w_in + padding_width;
-      T value = 0;
-      for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
-           c_out++) {
-        int filter_offset = (c_out + 1) * filter_height * filter_width;
-        for (int h_out = h_out_start; h_out <= h_out_end;
-             h_out += dilate_height) {
-          for (int w_out = w_out_start; w_out <= w_out_end;
-               w_out += dilate_width) {
-            filter_offset--;
-            int s_h_out = h_out / stride_height;
-            int s_w_out = w_out / stride_width;
-            if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
-                s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
-                s_w_out < output_width) {
-              const int output_grad_offset =
-                  ((batch * output_channels + c_out) * output_height +
-                   s_h_out) *
-                      output_width +
-                  s_w_out;
-              value += output_grad_data[output_grad_offset] *
-                       filter_data[filter_offset];
-            }
-          }
-        }
-      }
-      int index =
-          ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
-          w_in;
-      input_grad_data[index] += value;
-    }
-  }
-}
-
-template <typename T, int c_filter_multiplier, int c_stride>
-__global__ void KernelDepthwiseConvInputGradSp(
-    const T* const output_grad_data, const T* const filter_data,
-    const int batch_size, const int output_channels, const int output_height,
-    const int output_width, const int input_channels, const int input_height,
-    const int input_width, const int filter_multiplier,
-    const int filter_height, const int filter_width, const int stride_height,
-    const int stride_width, const int padding_height, const int padding_width,
-    const int dilate_height, const int dilate_width,
-    T* const input_grad_data) {
-  if (c_filter_multiplier == 0)
-    KernelDepthwiseConvInputGrad<T>(
-        output_grad_data, filter_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height,
-        input_width, filter_multiplier, filter_height, filter_width,
-        stride_height, stride_width, padding_height, padding_width,
-        dilate_height, dilate_width, input_grad_data);
-  else
-    KernelDepthwiseConvInputGrad<T>(
-        output_grad_data, filter_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height,
-        input_width, c_filter_multiplier, filter_height, filter_width,
-        c_stride, c_stride, padding_height, padding_width, dilate_height,
-        dilate_width, input_grad_data);
-}
+__global__ void KernelDepthwiseConvInputGrad(
+    const int nthreads, const T* const output_grad_data,
+    const T* const filter_data, const int batch_size,
+    const int output_channels, const int output_height,
+    const int output_width, const int input_channels, const int input_height,
+    const int input_width, const int filter_multiplier,
+    const int filter_height, const int filter_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    T* const input_grad_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int batch = index / input_channels / input_height / input_width;
+    const int c_in = (index / input_height / input_width) % input_channels;
+    const int h_in = (index / input_width) % input_height;
+    const int w_in = index % input_width;
+    const int c_out_start = c_in * filter_multiplier;
+    int h_out_start =
+        (h_in - filter_height + padding_height + stride_height) /
+        stride_height;
+    h_out_start = 0 > h_out_start ? 0 : h_out_start;
+    int h_out_end = (h_in + padding_height) / stride_height;
+    h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end;
+    int w_out_start =
+        (w_in - filter_width + padding_width + stride_width) / stride_width;
+    w_out_start = 0 > w_out_start ? 0 : w_out_start;
+    int w_out_end = (w_in + padding_width) / stride_width;
+    w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end;
+
+    T value = 0;
+    for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
+         c_out++) {
+      for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
+        const int filter_h = h_in + padding_height - h_out * stride_height;
+        for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
+          const int filter_w = w_in + padding_width - w_out * stride_width;
+          const int filter_offset = c_out * filter_height * filter_width +
+                                    filter_h * filter_width + filter_w;
+          const int output_grad_offset =
+              ((batch * output_channels + c_out) * output_height + h_out) *
+                  output_width +
+              w_out;
+          value += output_grad_data[output_grad_offset] *
+                   filter_data[filter_offset];
+        }
+      }
+    }
+    input_grad_data[index] = value;
+  }
+}
 
 // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
 template <typename T>
-__device__ __inline__ void KernelDepthwiseConvFilterGrad(
-    const T* output_grad_data, const T* input_data, const int num,
-    const int output_channels, const int output_height,
-    const int output_width, const int input_channels, const int input_height,
-    const int input_width, const int filter_multiplier,
-    const int filter_height, const int filter_width, const int stride_height,
-    const int stride_width, const int padding_height, const int padding_width,
-    const int dilate_height, const int dilate_width, T* filter_grad_data) {
-  T s = 0;
-  int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
-  int lid = lane_id();
-  for (int image_w = threadIdx.x; image_w < output_width;
-       image_w += blockDim.x) {
-    for (int bid = 0; bid < num; bid++) {
-      for (int image_h = threadIdx.y; image_h < output_height;
-           image_h += blockDim.y) {
-        int kernel_id = blockIdx.z;
-        int kernel_h = blockIdx.y * dilate_height - padding_height;
-        int kernel_w = blockIdx.x * dilate_width - padding_width;
-        int image_hk = image_h * stride_height + kernel_h;
-        int image_wk = image_w * stride_width + kernel_w;
-        if (image_hk < 0 || image_hk >= input_height) continue;
-        if (image_wk < 0 || image_wk >= input_width) continue;
-#define gaid(N, C, H, W) \
-  ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W))
-        s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
-             input_data[((bid * (gridDim.z / filter_multiplier) +
-                          kernel_id / filter_multiplier) *
-                             input_height +
-                         image_hk) *
-                            input_width +
-                        image_wk];
-#undef gaid
-      }
-    }
-  }
-#if __CUDA_ARCH__ >= 530
-  s = warpReduceSum<T>(s);
-  if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
-#else
-  paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
-#endif
-}
-
-template <typename T, int c_filter_multiplier>
-__global__ void KernelDepthwiseConvFilterGradSp(
-    const T* output_grad_data, const T* input_data, const int num,
-    const int output_channels, const int output_height,
-    const int output_width, const int input_channels, const int input_height,
-    const int input_width, const int filter_multiplier,
-    const int filter_height, const int filter_width, const int stride_height,
-    const int stride_width, const int padding_height, const int padding_width,
-    const int dilate_height, const int dilate_width, T* filter_grad_data) {
-  if (c_filter_multiplier == 0)
-    KernelDepthwiseConvFilterGrad<T>(
-        output_grad_data, input_data, num, output_channels, output_height,
-        output_width, input_channels, input_height, input_width,
-        filter_multiplier, filter_height, filter_width, stride_height,
-        stride_width, padding_height, padding_width, dilate_height,
-        dilate_width, filter_grad_data);
-  else
-    KernelDepthwiseConvFilterGrad<T>(
-        output_grad_data, input_data, num, output_channels, output_height,
-        output_width, input_channels, input_height, input_width,
-        c_filter_multiplier, filter_height, filter_width, stride_height,
-        stride_width, padding_height, padding_width, dilate_height,
-        dilate_width, filter_grad_data);
-}
+__global__ void KernelDepthwiseConvFilterGrad(
+    const int nthreads, const T* const output_grad_data,
+    const T* const input_data, const int num, const int output_channels,
+    const int output_height, const int output_width,
+    const int input_channels, const int input_height, const int input_width,
+    const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width,
+    T* const filter_grad_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int w_out = index % output_width;
+    const int h_out = (index / output_width) % output_height;
+    const int c_out = (index / output_width / output_height) % output_channels;
+    const int batch = (index / output_width / output_height / output_channels);
+    const int c_in = c_out / filter_multiplier;
+    const int h_in_start = -padding_height + h_out * stride_height;
+    const int w_in_start = -padding_width + w_out * stride_width;
+    const int h_in_end =
+        -padding_height + h_out * stride_height + filter_height;
+    const int w_in_end = -padding_width + w_out * stride_width + filter_width;
+    const int in_offset =
+        (batch * input_channels + c_in) * input_height * input_width;
+
+    T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
+    const int h_end = h_in_end < input_height ? h_in_end : input_height;
+    const int w_end = w_in_end < input_width ? w_in_end : input_width;
+    const int h_start = h_in_start > 0 ? h_in_start : 0;
+    const int w_start = w_in_start > 0 ? w_in_start : 0;
+
+    for (int h_in = h_start; h_in < h_end; h_in++) {
+      for (int w_in = w_start; w_in < w_end; w_in++) {
+        const int offset = in_offset + h_in * input_width + w_in;
+        const T diff_temp = output_grad_data[index] * input_data[offset];
+        T* addr = addr_offset + (h_in - h_in_start) * filter_width +
+                  (w_in - w_in_start);
+        paddle::platform::CudaAtomicAdd(addr, diff_temp);
+      }
+    }
+  }
+}
 
 /*
@@ -297,9 +177,7 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
                   const framework::Tensor& input,
                   const framework::Tensor& filter,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations, framework::Tensor* output) {
+                  const std::vector<int>& paddings, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];

@@ -313,37 +191,22 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
     const int stride_width = strides[1];
     const int padding_height = paddings[0];
     const int padding_width = paddings[1];
-    const int dilate_height = dilations[0];
-    const int dilate_width = dilations[1];
 
     const T* input_data = input.data<T>();
     const T* filter_data = filter.data<T>();
     T* output_data = output->mutable_data<T>(context.GetPlace());
 
-    int thread = 512;
-    int blocks = std::min(std::max(thread / output_width, 1), output_height);
-    dim3 threads(std::min(output_width, thread), blocks, 1);
-    dim3 grid(output_channels, batch_size, 1);
-    int filter_multiplier = output_channels / input_channels;
-#define check_case(c_filter_multiplier, c_stride)                            \
-  if (c_filter_multiplier == 0 ||                                            \
-      filter_multiplier == c_filter_multiplier &&                            \
-          stride_height == stride_width && stride_height == c_stride) {      \
-    KernelDepthwiseConvSp<T, c_filter_multiplier,                            \
-                          c_stride><<<grid, threads, 0, context.stream()>>>( \
-        input_data, filter_data, batch_size, output_channels, output_height, \
-        output_width, input_channels, input_height, input_width,             \
-        filter_multiplier, ksize_height, ksize_width, stride_height,         \
-        stride_width, padding_height, padding_width, dilate_height,          \
-        dilate_width, output_data);                                          \
-    return;                                                                  \
-  }
-    check_case(1, 1);
-    check_case(1, 2);
-    // NOTE(liangdun): 0,0 for other case
-    // add other case if needed, e.g. check_case(2^n,1)
-    check_case(0, 0);
-#undef check_case
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, filter_data, batch_size, output_channels,
+        output_height, output_width, input_channels, input_height,
+        input_width, output_channels / input_channels, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height,
+        padding_width, output_data);
   }
 };

@@ -356,7 +219,6 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
                   const framework::Tensor& output_grad,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
                   framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];

@@ -371,39 +233,22 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
     const int stride_width = strides[1];
     const int padding_height = paddings[0];
     const int padding_width = paddings[1];
-    const int dilate_height = dilations[0];
-    const int dilate_width = dilations[1];
 
     const T* filter_data = filter.data<T>();
     const T* output_grad_data = output_grad.data<T>();
     T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
-    int thread = 512;
-    int blocks = std::min(std::max(thread / input_width, 1), input_height);
-    dim3 threads(std::min(input_width, thread), blocks, 1);
-    dim3 grid(input_channels, batch_size, 1);
-    int filter_multiplier = output_channels / input_channels;
-#define check_case(c_filter_multiplier, c_stride)                        \
-  if (c_filter_multiplier == 0 ||                                        \
-      filter_multiplier == c_filter_multiplier &&                        \
-          stride_height == stride_width && stride_height == c_stride) {  \
-    KernelDepthwiseConvInputGradSp<                                      \
-        T, c_filter_multiplier,                                          \
-        c_stride><<<grid, threads, 0, context.stream()>>>(               \
-        output_grad_data, filter_data, batch_size, output_channels,      \
-        output_height, output_width, input_channels, input_height,       \
-        input_width, filter_multiplier, ksize_height, ksize_width,       \
-        stride_height, stride_width, padding_height, padding_width,      \
-        dilate_height, dilate_width, input_grad_data);                   \
-    return;                                                              \
-  }
-    check_case(1, 1);
-    check_case(1, 2);
-    // NOTE(liangdun): 0,0 for other case
-    // add other case if needed, e.g. check_case(2^n,1)
-    check_case(0, 0);
-#undef check_case
+    int nthreads = batch_size * input_channels * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelDepthwiseConvInputGrad<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, output_grad_data, filter_data, batch_size, output_channels,
+        output_height, output_width, input_channels, input_height,
+        input_width, output_channels / input_channels, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height,
+        padding_width, input_grad_data);
   }
 };

@@ -415,7 +260,6 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
                   const framework::Tensor& output_grad,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
                   framework::Tensor* filter_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];

@@ -430,34 +274,23 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
     const int stride_width = strides[1];
     const int padding_height = paddings[0];
     const int padding_width = paddings[1];
-    const int dilate_height = dilations[0];
-    const int dilate_width = dilations[1];
 
     const T* input_data = input.data<T>();
     const T* output_grad_data = output_grad.data<T>();
     T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
 
-    int block_size = 512;
-    int crop_output_height =
-        std::min(std::max(block_size / output_width, 1), output_height);
-    dim3 grid(ksize_width, ksize_height, output_channels);
-    dim3 threads(std::min(output_width, block_size), crop_output_height, 1);
-    int filter_multiplier = output_channels / input_channels;
-#define check_case(c_filter_multiplier)                                       \
-  if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \
-    KernelDepthwiseConvFilterGradSp<                                          \
-        T, c_filter_multiplier><<<grid, threads, 0, context.stream()>>>(      \
-        output_grad_data, input_data, batch_size, output_channels,            \
-        output_height, output_width, input_channels, input_height,            \
-        input_width, filter_multiplier, ksize_height, ksize_width,            \
-        stride_height, stride_width, padding_height, padding_width,           \
-        dilate_height, dilate_width, filter_grad_data);                       \
-    return;                                                                   \
-  }
-    check_case(1);
-    check_case(0);
-#undef check_case
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelDepthwiseConvFilterGrad<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, output_grad_data, input_data, batch_size, output_channels,
+        output_height, output_width, input_channels, input_height,
+        input_width, output_channels / input_channels, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height,
+        padding_width, filter_grad_data);
   }
 };
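The removed kernels step through the input with dilate_height / dilate_width, so each filter tap lands dilate rows or columns apart. A small Python sketch of that index walk (illustrative only, not code from the patch; the helper name is invented):

    # Which input rows one output row reads, mirroring the removed loop
    #   for (h_in = h_in_start; h_in < h_in_end; h_in += dilate_height)
    def input_rows_for_output_row(h_out, stride, padding, filter_h, dilate, input_h):
        h_in_start = -padding + h_out * stride
        h_in_end = h_in_start + filter_h * dilate
        return [h for h in range(h_in_start, h_in_end, dilate) if 0 <= h < input_h]

    # With dilate=2 the three taps are spread two rows apart.
    assert input_rows_for_output_row(1, stride=1, padding=1, filter_h=3, dilate=2, input_h=5) == [0, 2, 4]
    assert input_rows_for_output_row(0, stride=2, padding=1, filter_h=3, dilate=1, input_h=5) == [0, 1]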
paddle/fluid/operators/math/depthwise_conv.h  (+1, -4)

@@ -32,8 +32,7 @@ class DepthwiseConvFunctor {
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& filter,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations, framework::Tensor* output);
+                  const std::vector<int>& paddings, framework::Tensor* output);
 };

 template <typename DeviceContext, typename T>

@@ -44,7 +43,6 @@ class DepthwiseConvInputGradFunctor {
                   const framework::Tensor& output_grad,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
                   framework::Tensor* input_grad);
 };

@@ -55,7 +53,6 @@ class DepthwiseConvFilterGradFunctor {
                   const framework::Tensor& output_grad,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
                   framework::Tensor* filter_grad);
 };
paddle/fluid/operators/reduce_mean_op.cu  (+9, -56)

@@ -12,64 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <vector>
-#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_mean_op.h"
 
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct DivideFunctor {
-  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
-
-  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
-
- private:
-  T n_inv;
-};
-
-template <typename T>
-class ReduceMeanKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool keep_dim = context.Attr<bool>("keep_dim");
-
-    std::vector<int> reduce_dims;
-    if (reduce_all) {
-      reduce_dims.resize(input->dims().size());
-      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
-    } else {
-      for (auto e : dims) {
-        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
-      }
-    }
-
-    int reduce_num = 1;
-    for (int i = 0; i < reduce_dims.size(); ++i) {
-      reduce_num *= input->dims()[reduce_dims[i]];
-    }
-
-    auto stream = context.cuda_device_context().stream();
-    TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
-        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
-        DivideFunctor<T>(reduce_num), stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
-                        ops::ReduceMeanKernel<double>,
-                        ops::ReduceMeanKernel<int>,
-                        ops::ReduceMeanKernel<int64_t>);
+REGISTER_OP_CUDA_KERNEL(reduce_mean,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MeanFunctor>);
 
 REGISTER_OP_CUDA_KERNEL(
     reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                             float, ops::MeanGradFunctor>,
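The removed ReduceMeanKernel normalizes negative entries of the "dim" attribute and multiplies the reduced extents together before calling TensorReduce. A minimal Python sketch of that preprocessing (illustrative only; names are invented):

    # Map negative reduce dims onto [0, ndim) and count reduced elements.
    def normalize_reduce_dims(dims, ndim, reduce_all=False):
        if reduce_all:
            return list(range(ndim))
        return [d if d >= 0 else d + ndim for d in dims]

    shape = [4, 8, 16]
    reduce_dims = normalize_reduce_dims([-1], ndim=len(shape))  # -> [2]
    reduce_num = 1
    for d in reduce_dims:
        reduce_num *= shape[d]                                  # -> 16
    assert reduce_dims == [2] and reduce_num == 16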
paddle/fluid/operators/reduce_sum_op.cu  (+9, -51)

@@ -12,59 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_sum_op.h"
 
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct IdentityFunctor {
-  HOSTDEVICE explicit inline IdentityFunctor() {}
-
-  HOSTDEVICE inline T operator()(const T& x) const { return x; }
-};
-
-template <typename T>
-class ReduceSumKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool keep_dim = context.Attr<bool>("keep_dim");
-
-    std::vector<int> reduce_dims;
-    if (reduce_all) {
-      reduce_dims.resize(input->dims().size());
-      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
-    } else {
-      for (auto e : dims) {
-        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
-      }
-    }
-
-    int reduce_num = 1;
-    for (int i = 0; i < reduce_dims.size(); ++i) {
-      reduce_num *= input->dims()[reduce_dims[i]];
-    }
-
-    auto stream = context.cuda_device_context().stream();
-    TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
-        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
-        IdentityFunctor<T>(), stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
-                        ops::ReduceSumKernel<double>,
-                        ops::ReduceSumKernel<int>,
-                        ops::ReduceSumKernel<int64_t>);
+REGISTER_OP_CUDA_KERNEL(reduce_sum,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::SumFunctor>);
 
 REGISTER_OP_CUDA_KERNEL(
     reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                            float, ops::SumGradFunctor>,
python/paddle/fluid/layers/io.py  (+7, -0)

@@ -842,6 +842,13 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
 def shuffle(reader, buffer_size):
     """
     Shuffle the reader.
+
+    Args:
+        reader(Variable): The reader to be decorated with 'shuffling'.
+        buffer_size(int): The pre-read number of data in :code:`reader`.
+
+    Returns:
+        Variable: The reader which has been decorated with 'shuffling'.
     """
     return __create_unshared_decorated_reader__(
         'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
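The docstring added above describes a buffered shuffle: the reader pre-reads buffer_size samples and yields them in shuffled order. A plain-Python sketch of that buffering idea (illustrative only, not Paddle's create_shuffle_reader implementation; names are invented):

    import random

    # Pre-read `buffer_size` samples, shuffle the buffer, then yield it.
    def buffered_shuffle(reader, buffer_size):
        def decorated():
            buf = []
            for sample in reader():
                buf.append(sample)
                if len(buf) >= buffer_size:
                    random.shuffle(buf)
                    for s in buf:
                        yield s
                    buf = []
            random.shuffle(buf)
            for s in buf:
                yield s
        return decorated

    shuffled = buffered_shuffle(lambda: iter(range(10)), buffer_size=4)
    assert sorted(shuffled()) == list(range(10))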
python/paddle/fluid/layers/nn.py  (+3, -3)

@@ -6367,7 +6367,7 @@ def stack(x, axis=0):
     if not isinstance(x, list) and not isinstance(x, tuple):
         x = [x]
-    out = helper.create_tmp_variable(x[0].dtype)
+    out = helper.create_tmp_variable(dtype=x[0].dtype)
     helper.append_op(
         type='stack', inputs={'X': x}, outputs={'Y': out},
         attrs={'axis': axis})

@@ -6404,8 +6404,8 @@ def unstack(x, axis=0, num=None):
         num = x.shape[axis]
 
     outs = []
-    for _ in num:
-        outs.append(helper.create_tmp_variable(x.dtype))
+    for _ in xrange(num):
+        outs.append(helper.create_tmp_variable(dtype=x.dtype))
 
     helper.append_op(
         type='unstack',
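For context on the stack/unstack fixes above: stack joins inputs along a new axis, and unstack splits a tensor into x.shape[axis] pieces, which is where num = x.shape[axis] comes from. A NumPy sketch of the shape behaviour (illustrative only):

    import numpy as np

    # stack: five (3, 4) arrays joined along a new leading axis.
    xs = [np.zeros((3, 4)) for _ in range(5)]
    y = np.stack(xs, axis=0)
    assert y.shape == (5, 3, 4)

    # unstack: split back into y.shape[0] pieces of shape (3, 4).
    num = y.shape[0]
    pieces = [np.take(y, i, axis=0) for i in range(num)]
    assert len(pieces) == num and pieces[0].shape == (3, 4)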
python/paddle/fluid/tests/unittests/test_conv2d_op.py  (+7, -52)

@@ -67,7 +67,6 @@ class TestConv2dOp(OpTest):
     def setUp(self):
         self.op_type = "conv2d"
         self.use_cudnn = False
-        self.use_cuda = False
         self.use_mkldnn = False
         self.data_format = "AnyLayout"
         self.dtype = np.float32

@@ -102,25 +101,24 @@ class TestConv2dOp(OpTest):
         }
         self.outputs = {'Output': output}
 
-    def testcuda(self):
-        return core.is_compiled_with_cuda() and (self.use_cudnn or
-                                                 self.use_cuda)
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
 
     def test_check_output(self):
-        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
         self.check_output_with_place(place, atol=1e-5)
 
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
         self.check_grad_with_place(
             place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
             return
-        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
         self.check_grad_with_place(
             place, ['Input'],
             'Output',

@@ -130,7 +128,7 @@ class TestConv2dOp(OpTest):
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
             return
-        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
         self.check_grad_with_place(
             place, ['Filter'],
             'Output',

@@ -327,65 +325,22 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
 class TestDepthwiseConv(TestConv2dOp):
     def init_test_case(self):
-        self.use_cuda = True
         self.pad = [1, 1]
         self.stride = [2, 2]
         self.input_size = [2, 3, 5, 5]  # NCHW
         self.groups = 3
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [3, f_c, 3, 3]
         self.op_type = "depthwise_conv2d"
 
 
-class TestDepthwiseConv2(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [3, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConv3(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConvWithDilation(TestConv2dOp):
+class TestDepthwiseConv2(TestConv2dOp):
     def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        self.dilations = [2, 2]
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConvWithDilation2(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
         self.pad = [1, 1]
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
         self.groups = 3
-        self.dilations = [2, 2]
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]