Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
s920243400
PaddleDetection
提交
df4a3544
P
PaddleDetection
项目概览
s920243400
/
PaddleDetection
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleDetection
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
df4a3544
编写于
11月 01, 2018
作者:
D
dengkaipeng
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
nearest neighbor interp add cuda kernel. test=develop
上级
97556119
变更
5
显示空白变更内容
内联
并排
Showing
5 changed file
with
111 addition
and
93 deletion
+111
-93
paddle/fluid/API.spec
paddle/fluid/API.spec
+1
-0
paddle/fluid/operators/nearest_neighbor_interp_op.cc
paddle/fluid/operators/nearest_neighbor_interp_op.cc
+5
-4
paddle/fluid/operators/nearest_neighbor_interp_op.cu
paddle/fluid/operators/nearest_neighbor_interp_op.cu
+63
-86
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+32
-3
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+10
-0
未找到文件。
paddle/fluid/API.spec
浏览文件 @
df4a3544
...
...
@@ -121,6 +121,7 @@ paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], vararg
paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR'))
paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
...
...
paddle/fluid/operators/nearest_neighbor_interp_op.cc
浏览文件 @
df4a3544
...
...
@@ -25,9 +25,9 @@ class NearestNeighborInterpOp : public framework::OperatorWithKernel {
protected:
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of
Bilinea
rInterOp should not be null."
);
"Input(X) of
NearestNeighbo
rInterOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of
Bilinea
rInterOp should not be null."
);
"Output(Out) of
NearestNeighbo
rInterOp should not be null."
);
auto
dim_x
=
ctx
->
GetInputDim
(
"X"
);
// NCHW format
int
out_h
=
ctx
->
Attrs
().
Get
<
int
>
(
"out_h"
);
...
...
@@ -64,8 +64,9 @@ class NearestNeighborInterpOpMaker : public framework::OpProtoAndCheckerMaker {
.
AsDispensable
();
AddOutput
(
"Out"
,
"The dimension of output is (N x C x out_h x out_w)"
);
AddAttr
<
int
>
(
"out_h"
,
"output height of bilinear interpolation op."
);
AddAttr
<
int
>
(
"out_w"
,
"output width of bilinear interpolation op."
);
AddAttr
<
int
>
(
"out_h"
,
"output height of nearest neighbor interpolation op."
);
AddAttr
<
int
>
(
"out_w"
,
"output width of nearest neighbor interpolation op."
);
AddComment
(
R"DOC(
Nearest neighbor interpolation is to perform nearest neighbor interpolation
in bot the 3rd dimention(in height direction) and the 4th dimention(in width
...
...
paddle/fluid/operators/nearest_neighbor_interp_op.cu
浏览文件 @
df4a3544
...
...
@@ -15,17 +15,14 @@
namespace
paddle
{
namespace
operators
{
template
<
typename
T
,
size_t
D
,
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenTensor
=
framework
::
EigenTensor
<
T
,
D
,
MajorType
,
IndexType
>
;
using
framework
::
Tensor
;
template
<
typename
T
>
__global__
void
Ke
Bilinea
rInterpFw
(
__global__
void
Ke
NearestNeighbo
rInterpFw
(
const
T
*
in
,
const
size_t
in_img_h
,
const
size_t
in_img_w
,
const
size_t
input_h
,
const
size_t
input_w
,
T
*
out
,
const
size_t
out_img_h
,
const
size_t
out_img_w
,
const
size_t
output_h
,
const
size_t
output_w
,
const
size_t
num_channels
,
const
T
ratio_h
,
const
T
ratio
W
)
{
const
size_t
num_channels
,
const
T
ratio_h
,
const
T
ratio
_w
)
{
int
nthreads
=
output_h
*
output_w
;
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
tid
<
nthreads
)
{
...
...
@@ -36,34 +33,22 @@ __global__ void KeBilinearInterpFw(
int
channel_id
=
out_id_w
/
out_img_size
;
int
out_img_idy
=
(
out_id_w
%
out_img_size
)
/
out_img_w
;
int
in_img_idy
=
ratio_h
*
out_img_idy
;
int
h_id
=
(
in_img_idy
<
in_img_h
-
1
)
?
1
:
0
;
T
h1lambda
=
ratio_h
*
out_img_idy
-
in_img_idy
;
T
h2lambda
=
1.
f
-
h1lambda
;
int
in_img_idy
=
static_cast
<
int
>
(
round
(
ratio_h
*
out_img_idy
));
int
out_img_idx
=
tid
%
out_img_w
;
int
in_img_idx
=
ratioW
*
out_img_idx
;
int
w_id
=
(
in_img_idx
<
in_img_w
-
1
)
?
1
:
0
;
T
w1lambda
=
ratioW
*
out_img_idx
-
in_img_idx
;
T
w2lambda
=
1.
f
-
w1lambda
;
int
in_img_idx
=
static_cast
<
int
>
(
round
(
ratio_w
*
out_img_idx
));
const
T
*
in_pos
=
&
in
[
out_id_h
*
input_w
+
channel_id
*
in_img_size
+
out
[
tid
]
=
in
[
out_id_h
*
input_w
+
channel_id
*
in_img_size
+
in_img_idy
*
in_img_w
+
in_img_idx
];
// bilinear interpolation
out
[
out_id_h
*
output_w
+
out_id_w
]
=
h2lambda
*
(
w2lambda
*
in_pos
[
0
]
+
w1lambda
*
in_pos
[
w_id
])
+
h1lambda
*
(
w2lambda
*
in_pos
[
h_id
*
in_img_w
]
+
w1lambda
*
in_pos
[
h_id
*
in_img_w
+
w_id
]);
}
}
template
<
typename
T
>
__global__
void
Ke
Bilinea
rInterpBw
(
__global__
void
Ke
NearestNeighbo
rInterpBw
(
T
*
in
,
const
size_t
in_img_h
,
const
size_t
in_img_w
,
const
size_t
input_h
,
const
size_t
input_w
,
const
T
*
out
,
const
size_t
out_img_h
,
const
size_t
out_img_w
,
const
size_t
output_h
,
const
size_t
output_w
,
const
size_t
num_channels
,
const
T
ratio_h
,
const
T
ratio
W
)
{
const
size_t
num_channels
,
const
T
ratio_h
,
const
T
ratio
_w
)
{
int
nthreads
=
output_h
*
output_w
;
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
tid
<
nthreads
)
{
...
...
@@ -74,25 +59,15 @@ __global__ void KeBilinearInterpBw(
int
channel_id
=
out_id_w
/
out_img_size
;
int
out_img_idy
=
(
out_id_w
%
out_img_size
)
/
out_img_w
;
int
in_img_idy
=
ratio_h
*
out_img_idy
;
int
h_id
=
(
in_img_idy
<
in_img_h
-
1
)
?
1
:
0
;
T
h1lambda
=
ratio_h
*
out_img_idy
-
in_img_idy
;
T
h2lambda
=
1.
f
-
h1lambda
;
int
in_img_idy
=
static_cast
<
int
>
(
round
(
ratio_h
*
out_img_idy
));
int
out_img_idx
=
tid
%
out_img_w
;
int
in_img_idx
=
ratioW
*
out_img_idx
;
int
w_id
=
(
in_img_idx
<
in_img_w
-
1
)
?
1
:
0
;
T
w1lambda
=
ratioW
*
out_img_idx
-
in_img_idx
;
T
w2lambda
=
1.
f
-
w1lambda
;
int
in_img_idx
=
static_cast
<
int
>
(
round
(
ratio_w
*
out_img_idx
));
T
*
in_pos
=
&
in
[
out_id_h
*
input_w
+
channel_id
*
in_img_size
+
in_img_idy
*
in_img_w
+
in_img_idx
];
const
T
*
out_pos
=
&
out
[
out_id_h
*
output_w
+
out_id_w
];
atomicAdd
(
&
in_pos
[
0
],
h2lambda
*
w2lambda
*
out_pos
[
0
]);
atomicAdd
(
&
in_pos
[
w_id
],
h2lambda
*
w1lambda
*
out_pos
[
0
]);
atomicAdd
(
&
in_pos
[
h_id
*
in_img_w
],
h1lambda
*
w2lambda
*
out_pos
[
0
]);
atomicAdd
(
&
in_pos
[
h_id
*
in_img_w
+
w_id
],
h1lambda
*
w1lambda
*
out_pos
[
0
]);
const
T
out_pos
=
out
[
out_id_h
*
output_w
+
out_id_w
];
atomicAdd
(
in_pos
,
out_pos
);
}
}
...
...
@@ -102,48 +77,49 @@ class NearestNeighborInterpOpCUDAKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()),
"This kernel only runs on GPU device."
);
auto
*
input
_t
=
ctx
.
Input
<
Tensor
>
(
"X"
);
// float tensor
auto
*
output
_t
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
// float tensor
auto
*
input
=
input_
t
->
data
<
T
>
();
auto
*
input
=
ctx
.
Input
<
Tensor
>
(
"X"
);
// float tensor
auto
*
output
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
// float tensor
auto
*
input
_data
=
inpu
t
->
data
<
T
>
();
int
out_h
=
ctx
.
Attr
<
int
>
(
"out_h"
);
int
out_w
=
ctx
.
Attr
<
int
>
(
"out_w"
);
auto
out_dims
=
output_t
->
dims
();
auto
out_size_t
=
ctx
.
Input
<
Tensor
>
(
"OutSize"
);
if
(
out_size_t
!=
nullptr
)
{
auto
out_size
=
ctx
.
Input
<
Tensor
>
(
"OutSize"
);
if
(
out_size
!=
nullptr
)
{
Tensor
sizes
;
framework
::
TensorCopy
(
*
out_size
_t
,
platform
::
CPUPlace
(),
&
sizes
);
framework
::
TensorCopy
(
*
out_size
,
platform
::
CPUPlace
(),
&
sizes
);
auto
size_data
=
sizes
.
data
<
int
>
();
out_h
=
size_data
[
0
];
out_w
=
size_data
[
1
];
}
auto
*
output
=
output_t
->
mutable_data
<
T
>
(
{
out_dims
[
0
],
out_dims
[
1
],
out_h
,
out_w
},
ctx
.
GetPlace
());
int
batch_size
=
input_t
->
dims
()[
0
];
int
channels
=
input_t
->
dims
()[
1
];
int
in_h
=
input_t
->
dims
()[
2
];
int
in_w
=
input_t
->
dims
()[
3
];
int
n
=
input
->
dims
()[
0
];
int
c
=
input
->
dims
()[
1
];
int
in_h
=
input
->
dims
()[
2
];
int
in_w
=
input
->
dims
()[
3
];
auto
*
output_data
=
output
->
mutable_data
<
T
>
({
n
,
c
,
out_h
,
out_w
},
ctx
.
GetPlace
());
int
in_hw
=
in_h
*
in_w
;
int
out_hw
=
out_h
*
out_w
;
int
in_chw
=
c
hannels
*
in_hw
;
int
out_chw
=
c
hannels
*
out_hw
;
int
in_chw
=
c
*
in_hw
;
int
out_chw
=
c
*
out_hw
;
T
ratio_h
=
(
out_h
>
1
)
?
static_cast
<
T
>
(
in_h
-
1
)
/
(
out_h
-
1
)
:
0.
f
;
T
ratio_w
=
(
out_w
>
1
)
?
static_cast
<
T
>
(
in_w
-
1
)
/
(
out_w
-
1
)
:
0.
f
;
if
(
in_h
==
out_h
&&
in_w
==
out_w
)
{
memcpy
(
output
,
input
,
input_t
->
numel
()
*
sizeof
(
T
));
}
else
{
int
threadNum
=
batch_size
*
out_chw
;
memcpy
(
output_data
,
input_data
,
input
->
numel
()
*
sizeof
(
T
));
return
;
}
int
threadNum
=
n
*
out_chw
;
int
blocks
=
(
threadNum
+
1024
-
1
)
/
1024
;
KeBilinea
rInterpFw
<
KeNearestNeighbo
rInterpFw
<
T
><<<
blocks
,
1024
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
input
,
in_h
,
in_w
,
batch_size
,
in_chw
,
output
,
out_h
,
out_w
,
batch_size
,
out_chw
,
channels
,
ratio_h
,
ratio_w
);
}
input_data
,
in_h
,
in_w
,
n
,
in_chw
,
output_data
,
out_h
,
out_w
,
n
,
out_chw
,
c
,
ratio_h
,
ratio_w
);
}
};
...
...
@@ -151,52 +127,53 @@ template <typename T>
class
NearestNeighborInterpGradOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
d_input_t
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
d_output_t
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
d_output
=
d_output_t
->
data
<
T
>
();
auto
*
d_input
=
d_input_t
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
input_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
output_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
output_grad_data
=
output_grad
->
data
<
T
>
();
auto
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
&
device_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
zero
;
zero
(
device_ctx
,
d_input_t
,
static_cast
<
T
>
(
0.0
));
zero
(
device_ctx
,
input_grad
,
static_cast
<
T
>
(
0.0
));
int
out_h
=
ctx
.
Attr
<
int
>
(
"out_h"
);
int
out_w
=
ctx
.
Attr
<
int
>
(
"out_w"
);
auto
out_size
_t
=
ctx
.
Input
<
Tensor
>
(
"OutSize"
);
if
(
out_size
_t
!=
nullptr
)
{
auto
out_size
=
ctx
.
Input
<
Tensor
>
(
"OutSize"
);
if
(
out_size
!=
nullptr
)
{
Tensor
sizes
;
framework
::
TensorCopy
(
*
out_size
_t
,
platform
::
CPUPlace
(),
&
sizes
);
framework
::
TensorCopy
(
*
out_size
,
platform
::
CPUPlace
(),
&
sizes
);
auto
size_data
=
sizes
.
data
<
int
>
();
out_h
=
size_data
[
0
];
out_w
=
size_data
[
1
];
}
int
batch_size
=
d_input_t
->
dims
()[
0
];
int
c
hannels
=
d_input_t
->
dims
()[
1
];
int
in_h
=
d_input_t
->
dims
()[
2
];
int
in_w
=
d_input_t
->
dims
()[
3
];
int
n
=
input_grad
->
dims
()[
0
];
int
c
=
input_grad
->
dims
()[
1
];
int
in_h
=
input_grad
->
dims
()[
2
];
int
in_w
=
input_grad
->
dims
()[
3
];
int
in_hw
=
in_h
*
in_w
;
int
out_hw
=
out_h
*
out_w
;
int
in_chw
=
c
hannels
*
in_hw
;
int
out_chw
=
c
hannels
*
out_hw
;
int
in_chw
=
c
*
in_hw
;
int
out_chw
=
c
*
out_hw
;
T
ratio_h
=
(
out_h
>
1
)
?
static_cast
<
T
>
(
in_h
-
1
)
/
(
out_h
-
1
)
:
0.
f
;
T
ratio_w
=
(
out_w
>
1
)
?
static_cast
<
T
>
(
in_w
-
1
)
/
(
out_w
-
1
)
:
0.
f
;
if
(
in_h
==
out_h
&&
in_w
==
out_w
)
{
memcpy
(
d_input
,
d_output
,
d_input_t
->
numel
()
*
sizeof
(
T
));
}
else
{
int
threadNum
=
batch_size
*
out_chw
;
memcpy
(
input_grad
,
output_grad
,
input_grad
->
numel
()
*
sizeof
(
T
));
return
;
}
int
threadNum
=
n
*
out_chw
;
int
blocks
=
(
threadNum
+
1024
-
1
)
/
1024
;
KeBilinea
rInterpBw
<
KeNearestNeighbo
rInterpBw
<
T
><<<
blocks
,
1024
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
d_input
,
in_h
,
in_w
,
batch_size
,
in_chw
,
d_output
,
out_h
,
out_w
,
batch_size
,
out_chw
,
channels
,
ratio_h
,
ratio_w
);
}
input_grad_data
,
in_h
,
in_w
,
n
,
in_chw
,
output_grad_data
,
out_h
,
out_w
,
n
,
out_chw
,
c
,
ratio_h
,
ratio_w
);
}
};
...
...
@@ -206,5 +183,5 @@ class NearestNeighborInterpGradOpCUDAKernel : public framework::OpKernel<T> {
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
nearest_neighbor_interp
,
ops
::
NearestNeighborInterpOpCUDAKernel
<
float
>
);
REGISTER_OP_CUDA_KERNEL
(
nearest_neighborinterp_grad
,
REGISTER_OP_CUDA_KERNEL
(
nearest_neighbor
_
interp_grad
,
ops
::
NearestNeighborInterpGradOpCUDAKernel
<
float
>
);
python/paddle/fluid/layers/nn.py
浏览文件 @
df4a3544
...
...
@@ -101,6 +101,7 @@ __all__ = [
'image_resize'
,
'image_resize_short'
,
'resize_bilinear'
,
'resize_nearest'
,
'gather'
,
'scatter'
,
'sequence_scatter'
,
...
...
@@ -5584,6 +5585,7 @@ def image_resize(input,
Supporting resample methods:
'BILINEAR' : Bilinear interpolation
'NEAREST' : Nearest neighbor interpolation
Args:
input (Variable): The input tensor of image resize layer,
...
...
@@ -5610,13 +5612,17 @@ def image_resize(input,
out = fluid.layers.image_resize(input, out_shape=[12, 12])
"""
resample_methods
=
{
'BILINEAR'
:
'bilinear_interp'
}
resample_methods
=
{
'BILINEAR'
:
'bilinear_interp'
,
'NEAREST'
:
'nearest_neighbor_interp'
}
if
resample
not
in
resample_methods
:
raise
ValueError
(
"The 'resample' of image_resize can only be 'BILINEAR' currently."
)
"The 'resample' of image_resize can only be 'BILINEAR' and 'NEAREST' currently."
)
if
out_shape
is
None
and
scale
is
None
:
raise
ValueError
(
"One of out_shape and scale must not be None"
)
helper
=
LayerHelper
(
'bilinear_interp'
,
**
locals
())
helper
=
LayerHelper
(
resample_methods
[
resample
]
,
**
locals
())
dtype
=
helper
.
input_dtype
()
def
_is_list_or_turple_
(
data
):
...
...
@@ -5672,6 +5678,29 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
return
image_resize
(
input
,
out_shape
,
scale
,
name
,
'BILINEAR'
)
@
templatedoc
(
op_type
=
"bilinear_interp"
)
def
resize_nearest
(
input
,
out_shape
=
None
,
scale
=
None
,
name
=
None
):
"""
${comment}
Args:
input(${x_type}): ${x_comment}.
out_shape(${out_size_type}): ${out_size_comment}.
scale(float|None): The multiplier for the input height or width. At
least one of out_shape or scale must be set. And out_shape has
a higher priority than scale. Default: None.
name(str|None): The output variable name.
Returns:
${out_comment}.
"""
return
image_resize
(
input
,
out_shape
,
scale
,
name
,
'NEAREST'
)
def
image_resize_short
(
input
,
out_short_len
,
resample
=
'BILINEAR'
):
"""
Resize a batch of images. The short edge of input images will be
...
...
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
df4a3544
...
...
@@ -485,6 +485,16 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
output
)
print
(
str
(
program
))
def
test_resize_bilinear
(
self
):
program
=
Program
()
with
program_guard
(
program
):
x
=
layers
.
data
(
name
=
'x'
,
shape
=
[
3
,
9
,
6
],
dtype
=
"float32"
)
output
=
layers
.
resize_nearest
(
x
,
out_shape
=
[
12
,
12
])
self
.
assertIsNotNone
(
output
)
output
=
layers
.
resize_nearest
(
x
,
scale
=
3
)
self
.
assertIsNotNone
(
output
)
print
(
str
(
program
))
def
test_polygon_box_transform
(
self
):
program
=
Program
()
with
program_guard
(
program
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录