Commit 31363c3f
Authored on Mar 23, 2022 by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

Parents: da478d1e, 3980e222
Showing 50 changed files with 3,144 additions and 3,606 deletions.
paddle/fluid/framework/infershape_utils.cc  +158 -197
paddle/fluid/framework/infershape_utils.h  +59 -1
paddle/fluid/framework/new_executor/workqueue/workqueue.h  +58 -0
paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc  +6 -0
paddle/fluid/operators/deformable_conv_func.h  +0 -149
paddle/fluid/operators/deformable_conv_op.cc  +9 -159
paddle/fluid/operators/deformable_conv_op.cu  +0 -643
paddle/fluid/operators/deformable_conv_op.h  +0 -509
paddle/fluid/operators/deformable_conv_v1_op.cc  +9 -132
paddle/fluid/operators/deformable_conv_v1_op.cu  +0 -604
paddle/fluid/operators/deformable_conv_v1_op.h  +0 -556
paddle/fluid/operators/flatten_op.cc  +13 -83
paddle/fluid/operators/flatten_op.cu.cc  +0 -31
paddle/fluid/operators/flatten_op.h  +0 -41
paddle/fluid/operators/flatten_op_xpu.cc  +0 -23
paddle/fluid/pybind/eager_method.cc  +56 -9
paddle/fluid/pybind/imperative.cc  +1 -0
paddle/fluid/pybind/pybind.cc  +2 -0
paddle/phi/api/include/tensor.h  +1 -3
paddle/phi/api/lib/tensor_method.cc  +6 -6
paddle/phi/infermeta/multiary.cc  +209 -0
paddle/phi/infermeta/multiary.h  +13 -0
paddle/phi/infermeta/unary.cc  +16 -0
paddle/phi/infermeta/unary.h  +6 -0
paddle/phi/kernels/CMakeLists.txt  +3 -1
paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc  +333 -0
paddle/phi/kernels/cpu/deformable_conv_kernel.cc  +0 -120
paddle/phi/kernels/deformable_conv_grad_kernel.h  +39 -0
paddle/phi/kernels/deformable_conv_kernel.h  +2 -1
paddle/phi/kernels/flatten_grad_kernel.cc  +1 -0
paddle/phi/kernels/flatten_kernel.cc  +1 -1
paddle/phi/kernels/funcs/CMakeLists.txt  +1 -0
paddle/phi/kernels/funcs/deformable_conv_functor.cc  +172 -0
paddle/phi/kernels/funcs/deformable_conv_functor.cu  +185 -0
paddle/phi/kernels/funcs/deformable_conv_functor.h  +74 -0
paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu  +366 -0
paddle/phi/kernels/gpu/deformable_conv_kernel.cu  +0 -134
paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h  +364 -0
paddle/phi/kernels/impl/deformable_conv_kernel_impl.h  +22 -68
paddle/phi/ops/compat/deformable_conv_sig.cc  +28 -0
python/paddle/distributed/auto_parallel/dist_loader.py  +26 -6
python/paddle/distributed/auto_parallel/dist_saver.py  +241 -0
python/paddle/distributed/auto_parallel/engine.py  +236 -117
python/paddle/distributed/auto_parallel/utils.py  +8 -0
python/paddle/fluid/dygraph/varbase_patch_methods.py  +32 -0
python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py  +10 -5
python/paddle/fluid/tests/unittests/auto_parallel/engine_predict_api.py  +122 -0
python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py  +28 -0
python/paddle/fluid/tests/unittests/test_egr_python_api.py  +54 -7
python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py  +174 -0
paddle/fluid/framework/infershape_utils.cc  (+158 -197)

@@ -27,7 +27,6 @@ limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/core/tensor_utils.h"
(#include "paddle/phi/core/meta_tensor.h" is removed here; it is added to infershape_utils.h below)

namespace paddle {

@@ -101,235 +100,197 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext {
  const InferShapeContext& ctx_;
};

The in-file definition of class CompatMetaTensor (its constructors, the private helpers GetRuntimeLoD / GetCompileTimeLoD / GetSelectedRows, and the members var_ / is_runtime_) is removed from this file — the declaration moves to infershape_utils.h below — and its member functions become out-of-class definitions:

int64_t CompatMetaTensor::numel() const {
  if (is_runtime_) {
    auto* var = BOOST_GET_CONST(Variable*, var_);
    return var->Get<Tensor>().numel();
  } else {
    auto* var = BOOST_GET_CONST(VarDesc*, var_);
    return var->ElementSize();
  }
}

DDim CompatMetaTensor::dims() const {
  if (is_runtime_) {
    auto* var = BOOST_GET_CONST(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      return var->Get<phi::DenseTensor>().dims();
    } else if (var->IsType<phi::SelectedRows>()) {
      return var->Get<phi::SelectedRows>().dims();
    } else if (var->IsType<framework::LoDTensorArray>()) {
      // use tensor array size as dims
      auto& tensor_array = var->Get<framework::LoDTensorArray>();
      return phi::make_ddim({static_cast<int64_t>(tensor_array.size())});
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently, only can get dims from DenseTensor or SelectedRows or "
          "DenseTensorArray."));
    }
  } else {
    auto* var = BOOST_GET_CONST(VarDesc*, var_);
    return var->GetShape().empty() ? phi::make_ddim({0UL})
                                   : phi::make_ddim(var->GetShape());
  }
}

phi::DataType CompatMetaTensor::dtype() const {
  if (is_runtime_) {
    auto* var = BOOST_GET_CONST(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      return var->Get<phi::DenseTensor>().dtype();
    } else if (var->IsType<phi::SelectedRows>()) {
      return var->Get<phi::SelectedRows>().dtype();
    } else if (var->IsType<framework::LoDTensorArray>()) {
      // NOTE(chenweihang): do nothing
      // Unsupported get dtype from LoDTensorArray now
      return phi::DataType::UNDEFINED;
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently, only can get dtype from DenseTensor or SelectedRows."));
    }
  } else {
    auto* var = BOOST_GET_CONST(VarDesc*, var_);
    return paddle::framework::TransToPhiDataType(var->GetDataType());
  }
}

DataLayout CompatMetaTensor::layout() const {
  if (is_runtime_) {
    auto* var = BOOST_GET_CONST(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      return var->Get<phi::DenseTensor>().layout();
    } else if (var->IsType<phi::SelectedRows>()) {
      return var->Get<phi::SelectedRows>().layout();
    } else if (var->IsType<framework::LoDTensorArray>()) {
      // NOTE(chenweihang): do nothing
      // Unsupported get layout from LoDTensorArray now
      return phi::DataLayout::UNDEFINED;
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently, only can get layout from DenseTensor or "
          "SelectedRows."));
    }
  } else {
    // NOTE(chenweihang): do nothing
    // Unsupported get layout for VarDesc now
    return DataLayout::UNDEFINED;
  }
}

void CompatMetaTensor::set_dims(const DDim& dims) {
  if (is_runtime_) {
    auto* var = BOOST_GET(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      auto* tensor = var->GetMutable<phi::DenseTensor>();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
    } else if (var->IsType<phi::SelectedRows>()) {
      auto* tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
    } else if (var->IsType<framework::LoDTensorArray>()) {
      auto* tensor_array = var->GetMutable<framework::LoDTensorArray>();
      // Note: Here I want enforce `tensor_array->size() == 0UL`, because
      // inplace using on LoDTensorArray is dangerous, but the unittest
      // `test_list` contains this behavior
      PADDLE_ENFORCE_EQ(dims.size(), 1UL,
                        platform::errors::InvalidArgument(
                            "LoDTensorArray can only have one dimension."));
      // only set the array size for LoDTensorArray input
      tensor_array->resize(dims[0]);
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently, only can set dims from DenseTensor or SelectedRows."));
    }
  } else {
    auto* var = BOOST_GET(VarDesc*, var_);
    var->SetShape(vectorize(dims));
  }
}

void CompatMetaTensor::set_dtype(phi::DataType dtype) {
  if (is_runtime_) {
    auto* var = BOOST_GET(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      auto* tensor = var->GetMutable<phi::DenseTensor>();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype;
    } else if (var->IsType<phi::SelectedRows>()) {
      auto* tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype;
    } else if (var->IsType<framework::LoDTensorArray>()) {
      // NOTE(chenweihang): do nothing
      // Unsupported set dtype for LoDTensorArray now
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently, only can set dtype from DenseTensor or SelectedRows."));
    }
  } else {
    auto* var = BOOST_GET(VarDesc*, var_);
    var->SetDataType(paddle::framework::TransToProtoVarType(dtype));
  }
}

void CompatMetaTensor::set_layout(DataLayout layout) {
  if (is_runtime_) {
    auto* var = BOOST_GET(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      auto* tensor = var->GetMutable<phi::DenseTensor>();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout;
    } else if (var->IsType<phi::SelectedRows>()) {
      auto* tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout;
    } else if (var->IsType<framework::LoDTensorArray>()) {
      // NOTE(chenweihang): do nothing
      // Unsupported set dtype for LoDTensorArray now
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently, only can set layout from DenseTensor or "
          "SelectedRows."));
    }
  } else {
    // NOTE(chenweihang): do nothing
    // Unsupported set layout for VarDesc now
  }
}

void CompatMetaTensor::share_lod(const MetaTensor& meta_tensor) {
  if (is_runtime_) {
    auto* var = BOOST_GET(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      auto* tensor = var->GetMutable<phi::DenseTensor>();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->lod =
          static_cast<const CompatMetaTensor&>(meta_tensor).GetRuntimeLoD();
    } else {
      // NOTE(chenweihang): do nothing
      // only LoDTensor need to share lod
    }
  } else {
    auto* var = BOOST_GET(VarDesc*, var_);
    var->SetLoDLevel(
        static_cast<const CompatMetaTensor&>(meta_tensor).GetCompileTimeLoD());
  }
}

void CompatMetaTensor::share_dims(const MetaTensor& meta_tensor) {
  set_dims(meta_tensor.dims());
  if (is_runtime_) {
    auto* var = BOOST_GET(Variable*, var_);
    if (var->IsType<phi::SelectedRows>()) {
      auto* selected_rows = var->GetMutable<phi::SelectedRows>();
      auto& input_selected_rows =
          static_cast<const CompatMetaTensor&>(meta_tensor).GetSelectedRows();
      selected_rows->set_rows(input_selected_rows.rows());
      selected_rows->set_height(input_selected_rows.height());
    }
  }
}

void CompatMetaTensor::share_meta(const MetaTensor& meta_tensor) {
  share_dims(meta_tensor);
  set_dtype(meta_tensor.dtype());
  set_layout(meta_tensor.layout());
  // special case: share lod of LoDTensor
  share_lod(meta_tensor);
}

phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
                                            const std::string& op_type) {
...
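Editor's note: the pattern above is easier to follow once you see that the same MetaTensor-style interface is being served either from a runtime Variable (which holds a real tensor) or from a compile-time VarDesc (which only records a declared shape). A minimal, self-contained sketch of that idea — with hypothetical RuntimeVar / CompileTimeDesc types standing in for Paddle's Variable and VarDesc, not Paddle's actual classes — looks like this:

#include <cstdint>
#include <iostream>
#include <variant>
#include <vector>

// Hypothetical stand-ins for Paddle's Variable (runtime) and VarDesc (compile time).
struct RuntimeVar { std::vector<int64_t> dims; };        // has a concrete tensor
struct CompileTimeDesc { std::vector<int64_t> shape; };  // only a declared shape

// One "meta tensor" view over either holder, mirroring what CompatMetaTensor
// does for InferShapeVarPtr: the caller never needs to know which case it is.
class MetaView {
 public:
  explicit MetaView(std::variant<RuntimeVar*, CompileTimeDesc*> var) : var_(var) {}

  std::vector<int64_t> dims() const {
    if (auto* v = std::get_if<RuntimeVar*>(&var_)) return (*v)->dims;
    return std::get<CompileTimeDesc*>(var_)->shape;
  }

  void set_dims(const std::vector<int64_t>& d) {
    if (auto* v = std::get_if<RuntimeVar*>(&var_)) (*v)->dims = d;
    else std::get<CompileTimeDesc*>(var_)->shape = d;
  }

 private:
  std::variant<RuntimeVar*, CompileTimeDesc*> var_;
};

int main() {
  RuntimeVar rv{{2, 3}};
  CompileTimeDesc cd{{-1, 3}};       // -1: unknown batch dimension at compile time
  MetaView a(&rv), b(&cd);
  a.set_dims({4, 3});
  std::cout << a.dims()[0] << " " << b.dims()[0] << "\n";  // prints: 4 -1
}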
paddle/fluid/framework/infershape_utils.h  (+59 -1)

@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/phi/core/meta_tensor.h"   // added

namespace phi {
class InferMetaContext;
}  // namespace phi

@@ -39,5 +39,63 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
  }                                                                       \
  }

The CompatMetaTensor declaration (previously an implementation detail of infershape_utils.cc) is added here:

// TODO(chenweihang): Support TensorArray later
class CompatMetaTensor : public phi::MetaTensor {
 public:
  CompatMetaTensor(InferShapeVarPtr var, bool is_runtime)
      : var_(std::move(var)), is_runtime_(is_runtime) {}

  CompatMetaTensor() = default;
  CompatMetaTensor(const CompatMetaTensor&) = default;
  CompatMetaTensor(CompatMetaTensor&&) = default;
  CompatMetaTensor& operator=(const CompatMetaTensor&) = delete;
  CompatMetaTensor& operator=(CompatMetaTensor&&) = delete;

  int64_t numel() const override;

  DDim dims() const override;

  phi::DataType dtype() const override;

  DataLayout layout() const override;

  void set_dims(const DDim& dims) override;

  void set_dtype(phi::DataType dtype) override;

  void set_layout(DataLayout layout) override;

  void share_lod(const MetaTensor& meta_tensor) override;

  void share_dims(const MetaTensor& meta_tensor) override;

  void share_meta(const MetaTensor& meta_tensor) override;

 private:
  const LoD& GetRuntimeLoD() const {
    auto* var = BOOST_GET_CONST(Variable*, var_);
    return var->Get<LoDTensor>().lod();
  }

  int32_t GetCompileTimeLoD() const {
    auto* var = BOOST_GET_CONST(VarDesc*, var_);
    return var->GetLoDLevel();
  }

  const phi::SelectedRows& GetSelectedRows() const {
    PADDLE_ENFORCE_EQ(is_runtime_, true,
                      platform::errors::Unavailable(
                          "Only can get Tensor from MetaTensor in rumtime."));
    auto* var = BOOST_GET_CONST(Variable*, var_);
    PADDLE_ENFORCE_EQ(var->IsType<phi::SelectedRows>(), true,
                      platform::errors::Unavailable(
                          "The Tensor in MetaTensor is not SelectedRows."));
    return var->Get<phi::SelectedRows>();
  }

  InferShapeVarPtr var_;
  bool is_runtime_;
};

}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/new_executor/workqueue/workqueue.h  (+58 -0)

@@ -15,9 +15,12 @@
#pragma once

Added includes: <future>, <type_traits>, and "paddle/fluid/platform/enforce.h", alongside the existing <functional>, <memory>, <string>, <vector>.

namespace paddle {
namespace framework {

@@ -25,6 +28,29 @@ namespace framework {
constexpr const char* kQueueEmptyEvent = "QueueEmpty";
constexpr const char* kQueueDestructEvent = "QueueDestruct";

A FakeCopyable helper is added after the event-name constants:

// For std::function
// https://stackoverflow.com/questions/25421346/how-to-create-an-stdfunction-from-a-move-capturing-lambda-expression
template <typename OnlyMovable>
class FakeCopyable {
 public:
  explicit FakeCopyable(OnlyMovable&& obj) : obj_(std::move(obj)) {
    static_assert(std::is_copy_constructible<OnlyMovable>::value == false,
                  "Need not to use FakeCopyable");
  }

  FakeCopyable(FakeCopyable&& other) : obj_(std::move(other.obj_)) {}

  FakeCopyable(const FakeCopyable& other) {
    PADDLE_THROW(platform::errors::Unavailable(
        "Never use the copy constructor of FakeCopyable."));
  }

  OnlyMovable& Get() { return obj_; }

 private:
  OnlyMovable obj_;
};

class EventsWaiter;

struct WorkQueueOptions {

@@ -78,6 +104,22 @@ class WorkQueue {
  virtual void AddTask(std::function<void()> fn) = 0;

WorkQueue gains an AddAwaitableTask template below the pure virtual AddTask:

  // Higher cost than AddTask
  template <typename F, typename... Args>
  std::future<typename std::result_of<F(Args...)>::type> AddAwaitableTask(
      F&& f, Args&&... args) {
    using ReturnType = typename std::result_of<F(Args...)>::type;
    std::function<ReturnType()> task =
        std::bind(std::forward<F>(f), std::forward<Args>(args)...);
    std::promise<ReturnType> prom;
    std::future<ReturnType> res = prom.get_future();
    AddTask([t = std::move(task),
             p = FakeCopyable<std::promise<ReturnType>>(
                 std::move(prom))]() mutable { p.Get().set_value(t()); });
    return res;
  }

  // See WorkQueueOptions.track_task for details
  // virtual void WaitQueueEmpty() = 0;

@@ -102,6 +144,22 @@ class WorkQueueGroup {
  virtual void AddTask(size_t queue_idx, std::function<void()> fn) = 0;

WorkQueueGroup gains the same helper, taking the target queue index:

  // Higher cost than AddTask
  template <typename F, typename... Args>
  std::future<typename std::result_of<F(Args...)>::type> AddAwaitableTask(
      size_t queue_idx, F&& f, Args&&... args) {
    using ReturnType = typename std::result_of<F(Args...)>::type;
    std::function<ReturnType()> task =
        std::bind(std::forward<F>(f), std::forward<Args>(args)...);
    std::promise<ReturnType> prom;
    std::future<ReturnType> res = prom.get_future();
    AddTask(queue_idx,
            [t = std::move(task),
             p = FakeCopyable<std::promise<ReturnType>>(
                 std::move(prom))]() mutable { p.Get().set_value(t()); });
    return res;
  }

  // See WorkQueueOptions.track_task for details
  // virtual void WaitQueueGroupEmpty() = 0;
...
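Editor's note: the AddAwaitableTask additions lean on one C++ detail worth spelling out — std::function requires a copy-constructible callable, while std::promise is move-only, so the promise is smuggled into the lambda through the FakeCopyable wrapper, whose copy constructor exists only to satisfy the type check and must never actually run. A small self-contained sketch of the same trick (independent of WorkQueue; a plain task vector stands in for the real queue, and PretendCopyable is a renamed illustration, not Paddle's class):

#include <functional>
#include <future>
#include <iostream>
#include <stdexcept>
#include <utility>
#include <vector>

// Same idea as FakeCopyable: pretend to be copyable so the move-capturing
// lambda fits inside std::function; the copy constructor must never run.
template <typename OnlyMovable>
class PretendCopyable {
 public:
  explicit PretendCopyable(OnlyMovable&& obj) : obj_(std::move(obj)) {}
  PretendCopyable(PretendCopyable&& other) : obj_(std::move(other.obj_)) {}
  PretendCopyable(const PretendCopyable&) { throw std::logic_error("never copy"); }
  OnlyMovable& Get() { return obj_; }

 private:
  OnlyMovable obj_;
};

int main() {
  std::vector<std::function<void()>> queue;  // stand-in for WorkQueue::AddTask

  std::promise<int> prom;                    // move-only
  std::future<int> fut = prom.get_future();
  queue.emplace_back(
      [p = PretendCopyable<std::promise<int>>(std::move(prom))]() mutable {
        p.Get().set_value(42);               // run the "task", publish the result
      });

  queue.back()();                            // a worker thread would do this
  std::cout << fut.get() << "\n";            // prints 42
}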
paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc  (+6 -0)

@@ -60,11 +60,13 @@ TEST(WorkQueue, TestSingleThreadedWorkQueue) {
    }
    finished = true;
  });
  auto handle = work_queue->AddAwaitableTask([]() { return 1234; });  // added
  // WaitQueueEmpty
  EXPECT_EQ(finished.load(), false);
  events_waiter.WaitEvent();
  EXPECT_EQ(finished.load(), true);
  EXPECT_EQ(counter.load(), kLoopNum);
  EXPECT_EQ(handle.get(), 1234);                                      // added
}

TEST(WorkQueue, TestMultiThreadedWorkQueue) {

@@ -146,6 +148,9 @@ TEST(WorkQueue, TestWorkQueueGroup) {
      ++counter;
    }
  });
  int random_num = 123456;                                            // added
  auto handle = queue_group->AddAwaitableTask(                        // added
      1, [random_num]() { return random_num; });
  // WaitQueueGroupEmpty
  events_waiter.WaitEvent();
  EXPECT_EQ(counter.load(), kLoopNum * kExternalLoopNum + kLoopNum);

@@ -154,4 +159,5 @@ TEST(WorkQueue, TestWorkQueueGroup) {
  events_waiter.WaitEvent();
  queue_group.reset();
  EXPECT_EQ(events_waiter.WaitEvent(), paddle::framework::kQueueDestructEvent);
  EXPECT_EQ(handle.get(), random_num);                                // added
}
paddle/fluid/operators/deformable_conv_func.h  (deleted, 100644 → 0, -149)

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Part of the following code in this file refs to
// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu
//
// Copyright (c) 2017 Microsoft
// Licensed under The Apache-2.0 License [see LICENSE for details]
// \file deformable_psroi_pooling.cu
// \brief
// \author Yi Li, Guodong Zhang, Jifeng Dai

#pragma once
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"

template <typename T>
HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, T argmax_w, const int h,
                                   const int w, const int height,
                                   const int width) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    return 0;
  }

  int argmax_h_low = floor(argmax_h);
  int argmax_w_low = floor(argmax_w);
  int argmax_h_high = argmax_h_low + 1;
  int argmax_w_high = argmax_w_low + 1;

  T weight = 0;

  weight = (h == argmax_h_low && w == argmax_w_low)
               ? (h + 1 - argmax_h) * (w + 1 - argmax_w)
               : weight;
  weight = (h == argmax_h_low && w == argmax_w_high)
               ? (h + 1 - argmax_h) * (argmax_w + 1 - w)
               : weight;
  weight = (h == argmax_h_high && w == argmax_w_low)
               ? (argmax_h + 1 - h) * (w + 1 - argmax_w)
               : weight;
  weight = (h == argmax_h_high && w == argmax_w_high)
               ? (argmax_h + 1 - h) * (argmax_w + 1 - w)
               : weight;

  return weight;
}

template <typename T>
HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h, T argmax_w, const int height,
                                     const int width, const T* im_data,
                                     const int data_width, const int bp_dir) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    return 0;
  }

  int argmax_h_low = floor(argmax_h);
  int argmax_w_low = floor(argmax_w);
  int argmax_h_high = argmax_h_low + 1;
  int argmax_w_high = argmax_w_low + 1;

  T weight = 0;

  if (bp_dir == 0) {
    weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
                  ? -1 * (argmax_w_low + 1 - argmax_w) *
                        im_data[argmax_h_low * data_width + argmax_w_low]
                  : 0;
    weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
                  ? -1 * (argmax_w - argmax_w_low) *
                        im_data[argmax_h_low * data_width + argmax_w_high]
                  : 0;
    weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
                  ? (argmax_w_low + 1 - argmax_w) *
                        im_data[argmax_h_high * data_width + argmax_w_low]
                  : 0;
    weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
                  ? (argmax_w - argmax_w_low) *
                        im_data[argmax_h_high * data_width + argmax_w_high]
                  : 0;
  } else if (bp_dir == 1) {
    weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
                  ? -1 * (argmax_h_low + 1 - argmax_h) *
                        im_data[argmax_h_low * data_width + argmax_w_low]
                  : 0;
    weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
                  ? (argmax_h_low + 1 - argmax_h) *
                        im_data[argmax_h_low * data_width + argmax_w_high]
                  : 0;
    weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
                  ? -1 * (argmax_h - argmax_h_low) *
                        im_data[argmax_h_high * data_width + argmax_w_low]
                  : 0;
    weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
                  ? (argmax_h - argmax_h_low) *
                        im_data[argmax_h_high * data_width + argmax_w_high]
                  : 0;
  }

  return weight;
}

template <typename T>
HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, const int data_width,
                                const int height, const int width, T h, T w) {
  int h_low = floor(h);
  int w_low = floor(w);
  int h_high = h_low + 1;
  int w_high = w_low + 1;

  T lh = h - h_low;
  T lw = w - w_low;
  T hh = 1 - lh;
  T hw = 1 - lw;

  T v1 = (h_low >= 0 && w_low >= 0)
             ? bottom_data[h_low * data_width + w_low] : 0;
  T v2 = (h_low >= 0 && w_high <= width - 1)
             ? bottom_data[h_low * data_width + w_high] : 0;
  T v3 = (h_high <= height - 1 && w_low >= 0)
             ? bottom_data[h_high * data_width + w_low] : 0;
  T v4 = (h_high <= height - 1 && w_high <= width - 1)
             ? bottom_data[h_high * data_width + w_high] : 0;

  T w1 = hh * hw;
  T w2 = hh * lw;
  T w3 = lh * hw;
  T w4 = lh * lw;

  return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
}
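Editor's note: DmcnIm2colBilinear above is ordinary bilinear interpolation of an image I at a fractional location (h, w). With l_h = h - ⌊h⌋, l_w = w - ⌊w⌋, and neighbours taken as 0 outside the image, it computes

v(h, w) = (1 - l_h)(1 - l_w)\,v_1 + (1 - l_h)\,l_w\,v_2 + l_h(1 - l_w)\,v_3 + l_h\,l_w\,v_4,
\qquad
v_1 = I(\lfloor h\rfloor,\ \lfloor w\rfloor),\;
v_2 = I(\lfloor h\rfloor,\ \lfloor w\rfloor + 1),\;
v_3 = I(\lfloor h\rfloor + 1,\ \lfloor w\rfloor),\;
v_4 = I(\lfloor h\rfloor + 1,\ \lfloor w\rfloor + 1).

DmcnGetGradientWeight returns the coefficient of a given integer pixel in this sum (used to scatter the output gradient back onto the image), and DmcnGetCoordinateWeight returns the derivative of v with respect to the sampling coordinate, with bp_dir selecting h or w.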
paddle/fluid/operators/deformable_conv_op.cc  (+9 -159)

@@ -12,9 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

Removed includes: "paddle/fluid/operators/deformable_conv_op.h" and "paddle/fluid/operators/conv_op.h" (<memory> is unchanged).
Added includes: "paddle/fluid/framework/infershape_utils.h", "paddle/fluid/framework/op_registry.h", "paddle/phi/core/infermeta_utils.h", and "paddle/phi/infermeta/multiary.h".

namespace paddle {
namespace operators {

@@ -108,158 +110,6 @@
The hand-written InferShape of DeformableConvOp is removed (shape inference now goes through phi::DeformableConvInferMeta, registered below); the deleted method was:

class DeformableConvOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "deformable_conv");
    OP_INOUT_CHECK(ctx->HasInput("Offset"), "Input", "Offset", "deformable_conv)");
    OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "deformable_conv");
    OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "deformable_conv");
    OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "deformable_conv");

    auto in_dims = ctx->GetInputDim("Input");
    auto filter_dims = ctx->GetInputDim("Filter");
    auto offset_dims = ctx->GetInputDim("Offset");
    auto mask_dims = ctx->GetInputDim("Mask");

    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
    std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
    int groups = ctx->Attrs().Get<int>("groups");
    int deformable_groups = ctx->Attrs().Get<int>("deformable_groups");
    int im2col_step = ctx->Attrs().Get<int>("im2col_step");

    PADDLE_ENFORCE_EQ(
        in_dims.size(), 4,
        platform::errors::InvalidArgument(
            "Conv input should be 4-D tensor, get %u", in_dims.size()));
    PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
                      platform::errors::InvalidArgument(
                          "Conv input dimension and filter dimension should be "
                          "the same. The difference is [%d]: [%d]",
                          in_dims.size(), filter_dims.size()));
    PADDLE_ENFORCE_EQ(in_dims.size() - strides.size(), 2U,
                      platform::errors::InvalidArgument(
                          "Conv input dimension and strides "
                          "dimension should be consistent. But received input "
                          "dimension:[%d], strides dimension:[%d]",
                          in_dims.size(), strides.size()));
    PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
                      platform::errors::InvalidArgument(
                          "Conv paddings dimension and Conv strides dimension "
                          "should be the same. The difference is [%d]: [%d]",
                          paddings.size(), strides.size()));

    PADDLE_ENFORCE_EQ(
        in_dims[1], filter_dims[1] * groups,
        platform::errors::InvalidArgument(
            "The number of input channels should be equal to filter "
            "channels * groups. The difference is [%d]: [%d]",
            in_dims[1], filter_dims[1] * groups));
    PADDLE_ENFORCE_EQ(
        filter_dims[0] % groups, 0,
        platform::errors::InvalidArgument(
            "The number of output channels should be divided by groups. But "
            "received output channels:[%d], groups:[%d]",
            filter_dims[0], groups));
    PADDLE_ENFORCE_EQ(
        filter_dims[0] % deformable_groups, 0,
        platform::errors::InvalidArgument(
            "The number of output channels should be "
            "divided by deformable groups. The difference is [%d]: [%d]",
            filter_dims[0] % groups, 0));

    if (in_dims[0] > im2col_step) {
      PADDLE_ENFORCE_EQ(
          in_dims[0] % im2col_step, 0U,
          platform::errors::InvalidArgument(
              "Input batchsize must be smaller than or divide im2col_step. But "
              "received Input batchsize:[%d], im2col_step:[%d]",
              in_dims[0], im2col_step));
    }

    for (size_t i = 0; i < strides.size(); ++i) {
      PADDLE_ENFORCE_GT(strides[i], 0U,
                        platform::errors::InvalidArgument(
                            "stride %d size incorrect", i));
    }
    for (size_t i = 0; i < dilations.size(); ++i) {
      PADDLE_ENFORCE_GT(dilations[i], 0U,
                        platform::errors::InvalidArgument(
                            "dilation %d size incorrect", i));
    }

    std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
    for (size_t i = 0; i < strides.size(); ++i) {
      if ((!ctx->IsRuntime()) &&
          (in_dims[i + 2] <= 0 || filter_dims[i + 2] <= 0)) {
        output_shape.push_back(-1);
      } else {
        output_shape.push_back(ConvOutputSize(in_dims[i + 2],
                                              filter_dims[i + 2], dilations[i],
                                              paddings[i], strides[i]));
      }
    }

    PADDLE_ENFORCE_EQ(
        output_shape[1] % deformable_groups, 0U,
        platform::errors::InvalidArgument(
            "output num_filter must divide deformable group size. But received "
            "output num_filter:[%d], deformable group size:[%d]",
            output_shape[1], deformable_groups));

    if (ctx->IsRuntime()) {
      PADDLE_ENFORCE_EQ(output_shape[2], offset_dims[2],
                        platform::errors::InvalidArgument(
                            "output height must equal to offset map height. "
                            "The difference is [%d]: [%d]",
                            output_shape[2], offset_dims[2]));
      PADDLE_ENFORCE_EQ(output_shape[3], offset_dims[3],
                        platform::errors::InvalidArgument(
                            "output width must equal to offset map width. The "
                            "difference is [%d]: [%d]",
                            output_shape[3], offset_dims[3]));
      PADDLE_ENFORCE_EQ(offset_dims[1] % (filter_dims[2] * filter_dims[3]), 0U,
                        platform::errors::InvalidArgument(
                            "offset filter must divide deformable group size. "
                            "But received [%d]: [%d]",
                            offset_dims[1], filter_dims[2] * filter_dims[3]));
      PADDLE_ENFORCE_EQ(
          offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
          deformable_groups,
          platform::errors::InvalidArgument(
              "offset filter must divide deformable group size. But received "
              "[%d]: [%d]",
              offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
              deformable_groups));
      PADDLE_ENFORCE_EQ(output_shape[2], mask_dims[2],
                        platform::errors::InvalidArgument(
                            "output height must equal to mask map height. The "
                            "difference is [%d] vs [%d]",
                            output_shape[2], mask_dims[2]));
      PADDLE_ENFORCE_EQ(output_shape[3], mask_dims[3],
                        platform::errors::InvalidArgument(
                            "output width must equal to mask map width. The "
                            "difference is [%d] vs [%d]",
                            output_shape[3], mask_dims[3]));
      PADDLE_ENFORCE_EQ(mask_dims[1] % (filter_dims[2] * filter_dims[3]), 0U,
                        platform::errors::InvalidArgument(
                            "mask filter must divide deformable group size. "
                            "But received [%d]: [%d]",
                            mask_dims[1], filter_dims[2] * filter_dims[3]));
      PADDLE_ENFORCE_EQ(mask_dims[1] / (filter_dims[2] * filter_dims[3]),
                        deformable_groups,
                        platform::errors::InvalidArgument(
                            "mask filter must divide deformable group size. "
                            "But received [%d]: [%d]",
                            mask_dims[1] / (filter_dims[2] * filter_dims[3]),
                            deformable_groups));
    }

    ctx->SetOutputDim("Output", phi::make_ddim(output_shape));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(

@@ -331,13 +181,13 @@ class DeformableConvGradOp : public framework::OperatorWithKernel {
}  // namespace paddle

namespace ops = paddle::operators;

DECLARE_INFER_SHAPE_FUNCTOR(deformable_conv, DeformableConvInferShapeFunctor,
                            PD_INFER_META(phi::DeformableConvInferMeta));   // added

REGISTER_OPERATOR(deformable_conv, ops::DeformableConvOp,
                  ops::DeformableConvOpMaker,
                  ops::DeformableConvGradOpMaker<paddle::framework::OpDesc>,
                  ops::DeformableConvGradOpMaker<paddle::imperative::OpBase>,
                  DeformableConvInferShapeFunctor);   // functor appended to the registration

REGISTER_OPERATOR(deformable_conv_grad, ops::DeformableConvGradOp);

REGISTER_OP_CPU_KERNEL(deformable_conv_grad,                       // removed: the CPU
                       ops::DeformableConvGradCPUKernel<float>,    // grad-kernel registration
                       ops::DeformableConvGradCPUKernel<double>);  // is deleted here
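Editor's note: the net effect of the registration change above is that the hand-written InferShape is replaced by a functor generated from phi::DeformableConvInferMeta, so shape inference lives next to the phi kernel rather than inside the operator. As a rough, self-contained analogy of what DECLARE_INFER_SHAPE_FUNCTOR + PD_INFER_META provide (not Paddle's actual macros — MetaTensor and MatmulInferMeta here are simplified stand-ins):

#include <cstdint>
#include <iostream>
#include <vector>

// Simplified stand-in for phi::MetaTensor: just a mutable shape.
struct MetaTensor { std::vector<int64_t> dims; };

// A free "infer meta" function in the style of phi::DeformableConvInferMeta:
// it derives the output shape purely from the input metadata.
void MatmulInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
  out->dims = {x.dims[0], y.dims[1]};
}

// Roughly what the macro pair generates: an object with a uniform interface
// that forwards its arguments to the chosen meta function.
template <void (*MetaFn)(const MetaTensor&, const MetaTensor&, MetaTensor*)>
struct InferShapeFunctor {
  void operator()(const MetaTensor& a, const MetaTensor& b, MetaTensor* out) const {
    MetaFn(a, b, out);
  }
};

int main() {
  MetaTensor x{{8, 16}}, y{{16, 4}}, out;
  InferShapeFunctor<&MatmulInferMeta> functor;  // "registered" once per op
  functor(x, y, &out);
  std::cout << out.dims[0] << "x" << out.dims[1] << "\n";  // prints 8x4
}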
paddle/fluid/operators/deformable_conv_op.cu  (deleted, 100644 → 0, -643)

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// (standard Apache License 2.0 header, as in the file above)
//
// Part of the following code in this file refs to
// https://github.com/msracver/Deformable-ConvNets/blob/master/DCNv2_op/nn/modulated_deformable_im2col.cuh
//
// Copyright (c) 2018 Microsoft
// Licensed under The MIT License [see LICENSE for details]
// \file modulated_deformable_im2col.cuh
// \brief
// \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu

#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/deformable_conv_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;

static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaximumNumBlocks);
}

template <typename T>
__device__ T DmcnGetGradientWeight(T argmax_h, T argmax_w, const int h,
                                   const int w, const int height,
                                   const int width) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    return 0;
  }
  int argmax_h_low = floor(argmax_h);
  int argmax_w_low = floor(argmax_w);
  int argmax_h_high = argmax_h_low + 1;
  int argmax_w_high = argmax_w_low + 1;

  T weight = 0;
  if (h == argmax_h_low && w == argmax_w_low)
    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
  if (h == argmax_h_low && w == argmax_w_high)
    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
  if (h == argmax_h_high && w == argmax_w_low)
    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
  if (h == argmax_h_high && w == argmax_w_high)
    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
  return weight;
}

template <typename T>
__global__ void ModulatedDeformableCol2imGpuKernel(
    const int nthreads, const T* data_col, const T* data_offset,
    const T* data_mask, const int channels, const int height, const int width,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int channel_per_deformable_group,
    const int batch_size, const int deformable_group, const int height_col,
    const int width_col, T* grad_im) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (size_t thread = index; thread < nthreads; thread += offset) {
    const int j = (thread / width_col / height_col / batch_size) % kernel_w;
    const int i =
        (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
    const int c =
        thread / width_col / height_col / batch_size / kernel_w / kernel_h;

    const int deformable_group_index = c / channel_per_deformable_group;

    int w_out = thread % width_col;
    int h_out = (thread / width_col) % height_col;
    int b = (thread / width_col / height_col) % batch_size;
    int w_in = w_out * stride_w - pad_w;
    int h_in = h_out * stride_h - pad_h;

    const T* data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T* data_mask_ptr =
        data_mask + (b * deformable_group + deformable_group_index) *
                        kernel_h * kernel_w * height_col * width_col;
    const int data_offset_h_ptr =
        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
    const int data_offset_w_ptr =
        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
    const int data_mask_hw_ptr =
        ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
    const T offset_h = data_offset_ptr[data_offset_h_ptr];
    const T offset_w = data_offset_ptr[data_offset_w_ptr];
    const T mask = data_mask_ptr[data_mask_hw_ptr];
    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;

    const T cur_top_grad = data_col[thread] * mask;
    const int cur_h = static_cast<int>(cur_inv_h_data);
    const int cur_w = static_cast<int>(cur_inv_w_data);
    for (int dy = -2; dy <= 2; dy++) {
      for (int dx = -2; dx <= 2; dx++) {
        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
          int cur_bottom_grad_pos =
              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
          T weight =
              DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy,
                                    cur_w + dx, height, width);
          platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos,
                                  weight * cur_top_grad);
        }
      }
    }
  }
}

template <typename T>
inline void ModulatedDeformableCol2im(
    const platform::DeviceContext& ctx, const T* data_col,
    const T* data_offset, const T* data_mask,
    const std::vector<int64_t> im_shape, const std::vector<int64_t> col_shape,
    const std::vector<int64_t> kernel_shape, const std::vector<int> pad,
    const std::vector<int> stride, const std::vector<int> dilation,
    const int deformable_group, T* grad_im) {
  int channel_per_deformable_group = im_shape[0] / deformable_group;
  int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
  int blocks = NumBlocks(num_kernels);
  int threads = kNumCUDAThreads;

  ModulatedDeformableCol2imGpuKernel<T><<<
      blocks, threads, 0,
      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
      num_kernels, data_col, data_offset, data_mask, im_shape[0], im_shape[1],
      im_shape[2], kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0],
      stride[1], dilation[0], dilation[1], channel_per_deformable_group,
      col_shape[1], deformable_group, col_shape[2], col_shape[3], grad_im);
}

template <typename T>
__device__ T DmcnGetCoordinateWeight(T argmax_h, T argmax_w, const int height,
                                     const int width, const T* im_data,
                                     const int data_width, const int bp_dir) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    return 0;
  }
  int argmax_h_low = floor(argmax_h);
  int argmax_w_low = floor(argmax_w);
  int argmax_h_high = argmax_h_low + 1;
  int argmax_w_high = argmax_w_low + 1;

  T weight = 0;
  if (bp_dir == 0) {
    if (argmax_h_low >= 0 && argmax_w_low >= 0)
      weight += -1 * (argmax_w_low + 1 - argmax_w) *
                im_data[argmax_h_low * data_width + argmax_w_low];
    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
      weight += -1 * (argmax_w - argmax_w_low) *
                im_data[argmax_h_low * data_width + argmax_w_high];
    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
      weight += (argmax_w_low + 1 - argmax_w) *
                im_data[argmax_h_high * data_width + argmax_w_low];
    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
      weight += (argmax_w - argmax_w_low) *
                im_data[argmax_h_high * data_width + argmax_w_high];
  } else if (bp_dir == 1) {
    if (argmax_h_low >= 0 && argmax_w_low >= 0)
      weight += -1 * (argmax_h_low + 1 - argmax_h) *
                im_data[argmax_h_low * data_width + argmax_w_low];
    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
      weight += (argmax_h_low + 1 - argmax_h) *
                im_data[argmax_h_low * data_width + argmax_w_high];
    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
      weight += -1 * (argmax_h - argmax_h_low) *
                im_data[argmax_h_high * data_width + argmax_w_low];
    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
      weight += (argmax_h - argmax_h_low) *
                im_data[argmax_h_high * data_width + argmax_w_high];
  }
  return weight;
}

template <typename T>
__device__ T DmcnIm2colBilinear(const T* bottom_data, const int data_width,
                                const int height, const int width, T h, T w) {
  int h_low = floor(h);
  int w_low = floor(w);
  int h_high = h_low + 1;
  int w_high = w_low + 1;

  T lh = h - h_low;
  T lw = w - w_low;
  T hh = 1 - lh, hw = 1 - lw;

  T v1 = 0;
  if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low];
  T v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
    v2 = bottom_data[h_low * data_width + w_high];
  T v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
    v3 = bottom_data[h_high * data_width + w_low];
  T v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
    v4 = bottom_data[h_high * data_width + w_high];

  T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  return val;
}

template <typename T>
__global__ void ModulatedDeformableCol2imCoordGpuKernel(
    const int nthreads, const T* data_col, const T* data_im,
    const T* data_offset, const T* data_mask, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int offset_channels, const int deformable_group,
    const int height_col, const int width_col, T* grad_offset, T* grad_mask) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (size_t i = index; i < nthreads; i += offset) {
    T val = 0, mval = 0;
    const int w = i % width_col;
    const int h = (i / width_col) % height_col;
    const int c = (i / width_col / height_col) % offset_channels;
    const int b = (i / width_col / height_col) / offset_channels;

    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    const int col_step = kernel_h * kernel_w;
    int cnt = 0;
    const T* data_col_ptr =
        data_col + deformable_group_index * channel_per_deformable_group *
                       batch_size * width_col * height_col;
    const T* data_im_ptr =
        data_im + (b * deformable_group + deformable_group_index) *
                      channel_per_deformable_group / kernel_h / kernel_w *
                      height * width;
    const T* data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T* data_mask_ptr =
        data_mask + (b * deformable_group + deformable_group_index) *
                        kernel_h * kernel_w * height_col * width_col;

    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
         col_c += col_step) {
      const int col_pos =
          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      const int bp_dir = offset_c % 2;

      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i =
          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr =
          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr =
          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
           w_out);
      const int data_mask_hw_ptr =
          (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
      const T offset_h = data_offset_ptr[data_offset_h_ptr];
      const T offset_w = data_offset_ptr[data_offset_w_ptr];
      const T mask = data_mask_ptr[data_mask_hw_ptr];
      T inv_h = h_in + i * dilation_h + offset_h;
      T inv_w = w_in + j * dilation_w + offset_w;
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
        inv_h = inv_w = -2;
      } else {
        mval += data_col_ptr[col_pos] *
                DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width,
                                   height, width, inv_h, inv_w);
      }
      const T weight = DmcnGetCoordinateWeight(
          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
          width, bp_dir);
      val += weight * data_col_ptr[col_pos] * mask;
      cnt += 1;
    }
    grad_offset[i] = val;
    if (offset_c % 2 == 0)
      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
                      kernel_w +
                  offset_c / 2) *
                     height_col +
                 h) *
                    width_col +
                w] = mval;
  }
}

template <typename T>
inline void ModulatedDeformableCol2imCoord(
    const platform::DeviceContext& ctx, const T* data_col, const T* data_im,
    const T* data_offset, const T* data_mask,
    const std::vector<int64_t> im_shape, const std::vector<int64_t> col_shape,
    const std::vector<int64_t> kernel_shape, const std::vector<int> paddings,
    const std::vector<int> strides, const std::vector<int> dilations,
    const int deformable_groups, T* grad_offset, T* grad_mask) {
  int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
                    col_shape[2] * col_shape[3] * deformable_groups;
  int channel_per_deformable_group = col_shape[0] / deformable_groups;
  int blocks = NumBlocks(num_kernels);
  int threads = kNumCUDAThreads;

  ModulatedDeformableCol2imCoordGpuKernel<T><<<
      blocks, threads, 0,
      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
      num_kernels, data_col, data_im, data_offset, data_mask, im_shape[0],
      im_shape[1], im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0],
      paddings[1], strides[0], strides[1], dilations[0], dilations[1],
      channel_per_deformable_group, col_shape[1],
      2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
      deformable_groups, col_shape[2], col_shape[3], grad_offset, grad_mask);
}

template <typename T>
__global__ void ModulatedDeformableIm2colGpuKernel(
    const int nthreads, const T* data_im, const T* data_offset,
    const T* data_mask, const int height, const int width, const int kernel_h,
    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col,
    const int width_col, T* data_col) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (size_t i = index; i < nthreads; i += offset) {
    const int w_col = i % width_col;
    const int h_col = (i / width_col) % height_col;
    const int b_col = (i / width_col) / height_col % batch_size;
    const int c_im = (i / width_col / height_col) / batch_size;
    const int c_col = c_im * kernel_h * kernel_w;

    const int deformable_group_index = c_im / channel_per_deformable_group;

    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    T* data_col_ptr =
        data_col +
        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
    const T* data_im_ptr =
        data_im + (b_col * num_channels + c_im) * height * width;
    const T* data_offset_ptr =
        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T* data_mask_ptr =
        data_mask + (b_col * deformable_group + deformable_group_index) *
                        kernel_h * kernel_w * height_col * width_col;

    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        const int data_offset_h_ptr =
            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
        const int data_offset_w_ptr =
            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
            w_col;
        const int data_mask_hw_ptr =
            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
        const T offset_h = data_offset_ptr[data_offset_h_ptr];
        const T offset_w = data_offset_ptr[data_offset_w_ptr];
        const T mask = data_mask_ptr[data_mask_hw_ptr];
        T val = static_cast<T>(0);
        const T h_im = h_in + i * dilation_h + offset_h;
        const T w_im = w_in + j * dilation_w + offset_w;
...
<
height
&&
w_im
<
width
)
{
val
=
DmcnIm2colBilinear
(
data_im_ptr
,
width
,
height
,
width
,
h_im
,
w_im
);
}
*
data_col_ptr
=
val
*
mask
;
data_col_ptr
+=
batch_size
*
height_col
*
width_col
;
}
}
}
}
template
<
typename
T
>
inline
void
ModulatedDeformableIm2col
(
const
platform
::
DeviceContext
&
ctx
,
const
T
*
data_im
,
const
T
*
data_offset
,
const
T
*
data_mask
,
const
std
::
vector
<
int64_t
>
im_shape
,
const
std
::
vector
<
int64_t
>
col_shape
,
const
std
::
vector
<
int64_t
>
filter_shape
,
const
std
::
vector
<
int
>
paddings
,
const
std
::
vector
<
int
>
strides
,
const
std
::
vector
<
int
>
dilations
,
const
int
deformable_groups
,
T
*
data_col
)
{
int
channel_per_deformable_group
=
im_shape
[
0
]
/
deformable_groups
;
int
num_kernels
=
im_shape
[
0
]
*
col_shape
[
1
]
*
col_shape
[
2
]
*
col_shape
[
3
];
int
blocks
=
NumBlocks
(
num_kernels
);
int
threads
=
kNumCUDAThreads
;
ModulatedDeformableIm2colGpuKernel
<
T
><<<
blocks
,
threads
,
0
,
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
()
>>>
(
num_kernels
,
data_im
,
data_offset
,
data_mask
,
im_shape
[
1
],
im_shape
[
2
],
filter_shape
[
2
],
filter_shape
[
3
],
paddings
[
0
],
paddings
[
1
],
strides
[
0
],
strides
[
1
],
dilations
[
0
],
dilations
[
1
],
channel_per_deformable_group
,
col_shape
[
1
],
im_shape
[
0
],
deformable_groups
,
col_shape
[
2
],
col_shape
[
3
],
data_col
);
}
template
<
typename
T
>
__global__
void
FilterGradAddupGpuKernel
(
const
int
nthreads
,
const
int
n
,
const
int
height
,
const
int
width
,
const
T
*
dweight_3d
,
T
*
filter_grad
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
offset
=
blockDim
.
x
*
gridDim
.
x
;
for
(
size_t
i
=
index
;
i
<
nthreads
;
i
+=
offset
)
{
filter_grad
[
i
]
=
filter_grad
[
i
]
+
dweight_3d
[
i
];
}
}
template
<
typename
DeviceContext
,
typename
T
>
class
DeformableConvGradCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
Tensor
*
output_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Output"
));
Tensor
*
input_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Input"
));
Tensor
*
filter_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Filter"
));
Tensor
*
offset_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Offset"
));
Tensor
*
mask_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Mask"
));
const
Tensor
*
input
=
ctx
.
Input
<
Tensor
>
(
"Input"
);
Tensor
offset
=
*
ctx
.
Input
<
Tensor
>
(
"Offset"
);
Tensor
mask
=
*
ctx
.
Input
<
Tensor
>
(
"Mask"
);
Tensor
filter
=
*
ctx
.
Input
<
Tensor
>
(
"Filter"
);
if
(
!
input_grad
&&
!
filter_grad
&&
!
offset_grad
&&
!
mask_grad
)
return
;
int
groups
=
ctx
.
Attr
<
int
>
(
"groups"
);
int
deformable_groups
=
ctx
.
Attr
<
int
>
(
"deformable_groups"
);
int
im2col_step
=
ctx
.
Attr
<
int
>
(
"im2col_step"
);
std
::
vector
<
int
>
strides
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
std
::
vector
<
int
>
dilations
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"dilations"
);
auto
&
dev_ctx
=
ctx
.
cuda_device_context
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
framework
::
DDim
input_shape
=
phi
::
slice_ddim
(
input
->
dims
(),
1
,
input
->
dims
().
size
());
std
::
vector
<
int64_t
>
input_shape_vec
=
phi
::
vectorize
(
input_shape
);
std
::
vector
<
int64_t
>
filter_shape_vec
(
phi
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
phi
::
vectorize
(
output_grad
->
dims
()));
std
::
vector
<
int64_t
>
col_buffer_shape_vec
(
filter_shape_vec
.
size
());
col_buffer_shape_vec
[
0
]
=
input
->
dims
()[
1
]
*
filter
.
dims
()[
2
]
*
filter
.
dims
()[
3
];
col_buffer_shape_vec
[
1
]
=
im2col_step
;
for
(
size_t
j
=
0
;
j
<
filter_shape_vec
.
size
()
-
2
;
++
j
)
{
col_buffer_shape_vec
[
j
+
2
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
phi
::
make_ddim
(
col_buffer_shape_vec
));
std
::
vector
<
int64_t
>
output_buffer_shape_vec
(
1
);
output_buffer_shape_vec
[
0
]
=
batch_size
*
output_shape_vec
[
1
]
*
output_shape_vec
[
2
]
*
output_shape_vec
[
3
];
framework
::
DDim
output_shape
(
phi
::
make_ddim
(
output_buffer_shape_vec
));
Tensor
col_buffer
;
Tensor
output_buffer
;
col_buffer
=
ctx
.
AllocateTmpTensor
<
T
,
DeviceContext
>
(
col_shape
,
dev_ctx
);
output_buffer
=
ctx
.
AllocateTmpTensor
<
T
,
DeviceContext
>
(
output_shape
,
dev_ctx
);
output_buffer
.
ShareDataWith
(
*
output_grad
);
int64_t
M
=
input_shape_vec
[
0
]
/
groups
*
filter_shape_vec
[
2
]
*
filter_shape_vec
[
3
];
int64_t
N
=
im2col_step
*
output_shape_vec
[
2
]
*
output_shape_vec
[
3
];
int64_t
K
=
output_shape_vec
[
1
]
/
groups
;
framework
::
DDim
weight_3d_shape
=
{
groups
,
K
,
M
};
framework
::
DDim
out_grad_4d_shape
=
{
batch_size
/
im2col_step
,
groups
,
K
,
N
};
framework
::
DDim
col_buffer_3d_shape
=
{
groups
,
M
,
N
};
framework
::
DDim
filter_grad_shape
=
{
groups
,
K
,
M
};
Tensor
weight_3d
;
weight_3d
.
ShareDataWith
(
filter
).
Resize
(
weight_3d_shape
);
Tensor
out_grad_4d
;
out_grad_4d
.
ShareDataWith
(
output_buffer
).
Resize
(
out_grad_4d_shape
);
Tensor
col_buffer_3d
;
col_buffer_3d
.
ShareDataWith
(
col_buffer
).
Resize
(
col_buffer_3d_shape
);
phi
::
funcs
::
SetConstant
<
DeviceContext
,
T
>
set_zero
;
auto
blas
=
phi
::
funcs
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
col_buffer
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
col_buffer_3d
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
out_grad_4d
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
int
input_dim
=
input
->
numel
()
/
input
->
dims
()[
0
];
int
input_offset_dim
=
offset
.
numel
()
/
offset
.
dims
()[
0
];
int
input_mask_dim
=
mask
.
numel
()
/
mask
.
dims
()[
0
];
if
(
filter_grad
)
{
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
filter_grad
->
Resize
(
filter_grad_shape
);
set_zero
(
dev_ctx
,
filter_grad
,
static_cast
<
T
>
(
0
));
}
if
(
input_grad
)
{
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
set_zero
(
dev_ctx
,
input_grad
,
static_cast
<
T
>
(
0
));
}
if
(
offset_grad
&&
mask_grad
)
{
offset_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
mask_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
set_zero
(
dev_ctx
,
offset_grad
,
static_cast
<
T
>
(
0
));
set_zero
(
dev_ctx
,
mask_grad
,
static_cast
<
T
>
(
0
));
}
for
(
int
i
=
0
;
i
<
batch_size
/
im2col_step
;
++
i
)
{
Tensor
out_grad_3d
=
out_grad_4d
.
Slice
(
i
,
i
+
1
).
Resize
(
phi
::
slice_ddim
(
out_grad_4d
.
dims
(),
1
,
out_grad_4d
.
dims
().
size
()));
for
(
int
g
=
0
;
g
<
groups
;
++
g
)
{
Tensor
weight_3d_slice
=
weight_3d
.
Slice
(
g
,
g
+
1
).
Resize
(
phi
::
slice_ddim
(
weight_3d
.
dims
(),
1
,
weight_3d
.
dims
().
size
()));
Tensor
out_grad_3d_slice
=
out_grad_3d
.
Slice
(
g
,
g
+
1
).
Resize
(
phi
::
slice_ddim
(
out_grad_3d
.
dims
(),
1
,
out_grad_3d
.
dims
().
size
()));
Tensor
col_buffer_3d_slice
=
col_buffer_3d
.
Slice
(
g
,
g
+
1
).
Resize
(
phi
::
slice_ddim
(
col_buffer_3d
.
dims
(),
1
,
col_buffer_3d
.
dims
().
size
()));
blas
.
MatMul
(
weight_3d_slice
,
true
,
out_grad_3d_slice
,
false
,
T
(
1.0
),
&
col_buffer_3d_slice
,
T
(
0.0
));
}
col_buffer
.
Resize
(
col_shape
);
T
*
col_buffer_ptr
=
col_buffer
.
data
<
T
>
();
const
T
*
input_ptr
=
input
->
data
<
T
>
();
const
T
*
offset_ptr
=
offset
.
data
<
T
>
();
const
T
*
mask_ptr
=
mask
.
data
<
T
>
();
if
(
mask_grad
&&
offset_grad
)
{
T
*
offset_grad_ptr
=
offset_grad
->
data
<
T
>
();
T
*
mask_grad_ptr
=
mask_grad
->
data
<
T
>
();
ModulatedDeformableCol2imCoord
(
ctx
.
device_context
(),
col_buffer_ptr
,
input_ptr
+
i
*
im2col_step
*
input_dim
,
offset_ptr
+
i
*
im2col_step
*
input_offset_dim
,
mask_ptr
+
i
*
im2col_step
*
input_mask_dim
,
input_shape_vec
,
col_buffer_shape_vec
,
filter_shape_vec
,
paddings
,
strides
,
dilations
,
deformable_groups
,
offset_grad_ptr
+
i
*
im2col_step
*
input_offset_dim
,
mask_grad_ptr
+
i
*
im2col_step
*
input_mask_dim
);
}
if
(
input_grad
)
{
T
*
input_grad_ptr
=
input_grad
->
data
<
T
>
();
ModulatedDeformableCol2im
(
ctx
.
device_context
(),
col_buffer_ptr
,
offset_ptr
+
i
*
im2col_step
*
input_offset_dim
,
mask_ptr
+
i
*
im2col_step
*
input_mask_dim
,
input_shape_vec
,
col_buffer_shape_vec
,
filter_shape_vec
,
paddings
,
strides
,
dilations
,
deformable_groups
,
input_grad_ptr
+
i
*
im2col_step
*
input_dim
);
input_grad
->
Resize
(
input
->
dims
());
}
ModulatedDeformableIm2col
(
ctx
.
device_context
(),
input_ptr
+
i
*
im2col_step
*
input_dim
,
offset_ptr
+
i
*
im2col_step
*
input_offset_dim
,
mask_ptr
+
i
*
im2col_step
*
input_mask_dim
,
input_shape_vec
,
col_buffer_shape_vec
,
filter_shape_vec
,
paddings
,
strides
,
dilations
,
deformable_groups
,
col_buffer_ptr
);
col_buffer_3d
.
Resize
(
col_buffer_3d_shape
);
if
(
filter_grad
)
{
Tensor
dweight_3d
;
dweight_3d
=
ctx
.
AllocateTmpTensor
<
T
,
DeviceContext
>
(
filter_grad_shape
,
dev_ctx
);
for
(
int
g
=
0
;
g
<
groups
;
++
g
)
{
Tensor
out_grad_3d_slice
=
out_grad_3d
.
Slice
(
g
,
g
+
1
).
Resize
(
phi
::
slice_ddim
(
out_grad_3d
.
dims
(),
1
,
out_grad_3d
.
dims
().
size
()));
Tensor
col_buffer_3d_slice
=
col_buffer_3d
.
Slice
(
g
,
g
+
1
).
Resize
(
phi
::
slice_ddim
(
col_buffer_3d
.
dims
(),
1
,
col_buffer_3d
.
dims
().
size
()));
Tensor
dweight_3d_slice
=
dweight_3d
.
Slice
(
g
,
g
+
1
).
Resize
(
phi
::
slice_ddim
(
dweight_3d
.
dims
(),
1
,
dweight_3d
.
dims
().
size
()));
blas
.
MatMul
(
out_grad_3d_slice
,
false
,
col_buffer_3d_slice
,
true
,
T
(
1.0
),
&
dweight_3d_slice
,
T
(
0.0
));
}
FilterGradAddupGpuKernel
<
T
><<<
NumBlocks
(
dweight_3d
.
numel
()),
kNumCUDAThreads
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
dweight_3d
.
numel
(),
groups
,
K
,
M
,
dweight_3d
.
data
<
T
>
(),
filter_grad
->
data
<
T
>
());
}
}
if
(
filter_grad
)
{
filter_grad
->
Resize
(
filter
.
dims
());
}
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
using
CUDA
=
paddle
::
platform
::
CUDADeviceContext
;
REGISTER_OP_CUDA_KERNEL
(
deformable_conv_grad
,
ops
::
DeformableConvGradCUDAKernel
<
CUDA
,
float
>
,
ops
::
DeformableConvGradCUDAKernel
<
CUDA
,
double
>
);
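For reference, every CUDA kernel in the file above iterates with the same grid-stride pattern: a fixed-size grid is launched and each thread strides over nthreads work items, with NumBlocks/kNumCUDAThreads choosing the launch size. The following self-contained sketch shows only that launch recipe; the 512-thread block size, the 4096-block cap, and the AddUp kernel are illustrative assumptions, not code from this commit.

// Illustrative sketch of the grid-stride launch used by the kernels above.
// kThreads / kMaxBlocks mirror what kNumCUDAThreads / NumBlocks are assumed
// to use; AddUp is only a placeholder kernel body.
#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

static constexpr int kThreads = 512;
static constexpr int kMaxBlocks = 4096;

static inline int LaunchBlocks(const int n) {
  return std::min((n + kThreads - 1) / kThreads, kMaxBlocks);
}

template <typename T>
__global__ void AddUp(const int nthreads, const T* src, T* dst) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  // Each thread covers index, index + stride, index + 2 * stride, ...
  for (int i = index; i < nthreads; i += stride) {
    dst[i] += src[i];
  }
}

int main() {
  const int n = 1 << 20;
  float *src = nullptr, *dst = nullptr;
  cudaMalloc(&src, n * sizeof(float));
  cudaMalloc(&dst, n * sizeof(float));
  cudaMemset(src, 0, n * sizeof(float));
  cudaMemset(dst, 0, n * sizeof(float));
  AddUp<float><<<LaunchBlocks(n), kThreads>>>(n, src, dst);
  cudaDeviceSynchronize();
  std::printf("launched %d blocks of %d threads\n", LaunchBlocks(n), kThreads);
  cudaFree(src);
  cudaFree(dst);
  return 0;
}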
paddle/fluid/operators/deformable_conv_op.h    deleted  100644 → 0  @ da478d1e
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Part of the following code in this file refs to
// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu
//
// Copyright (c) 2017 Microsoft
// Licensed under The Apache-2.0 License [see LICENSE for details]
// \file deformable_psroi_pooling.cu
// \brief
// \author Yi Li, Guodong Zhang, Jifeng Dai
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/deformable_conv_func.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using CPUDeviceContext = platform::CPUDeviceContext;

template <typename T>
void ModulatedDeformableCol2imCPUKernel(
    const int num_kernels, const T* data_col, const T* data_offset,
    const T* data_mask, const int channels, const int height, const int width,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int channel_per_deformable_group,
    const int batch_size, const int deformable_group, const int height_col,
    const int width_col, T* grad_im) {
  for (int thread = 0; thread < num_kernels; thread++) {
    const int j = (thread / width_col / height_col / batch_size) % kernel_w;
    const int i =
        (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
    const int c =
        thread / width_col / height_col / batch_size / kernel_w / kernel_h;

    const int deformable_group_index = c / channel_per_deformable_group;

    int w_out = thread % width_col;
    int h_out = (thread / width_col) % height_col;
    int b = (thread / width_col / height_col) % batch_size;
    int w_in = w_out * stride_w - pad_w;
    int h_in = h_out * stride_h - pad_h;

    const T* data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T* data_mask_ptr =
        data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
                        kernel_w * height_col * width_col;
    const int data_offset_h_ptr =
        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
    const int data_offset_w_ptr =
        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
    const int data_mask_hw_ptr =
        ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
    const T offset_h = data_offset_ptr[data_offset_h_ptr];
    const T offset_w = data_offset_ptr[data_offset_w_ptr];
    const T mask = data_mask_ptr[data_mask_hw_ptr];
    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;

    const T cur_top_grad = data_col[thread] * mask;
    const int cur_h = static_cast<int>(cur_inv_h_data);
    const int cur_w = static_cast<int>(cur_inv_w_data);
    for (int dy = -2; dy <= 2; dy++) {
      for (int dx = -2; dx <= 2; dx++) {
        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
          int cur_bottom_grad_pos =
              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
          T weight = DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data,
                                           cur_h + dy, cur_w + dx, height,
                                           width);
          *(grad_im + cur_bottom_grad_pos) =
              *(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad;
        }
      }
    }
  }
}

template <typename T>
static inline void ModulatedDeformableCol2imCPU(
    const platform::CPUDeviceContext& ctx, const T* data_col,
    const T* data_offset, const T* data_mask,
    const std::vector<int64_t> im_shape, const std::vector<int64_t> col_shape,
    const std::vector<int64_t> kernel_shape, const std::vector<int> pad,
    const std::vector<int> stride, const std::vector<int> dilation,
    const int deformable_group, T* grad_im) {
  int channel_per_deformable_group = im_shape[0] / deformable_group;
  int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];

  ModulatedDeformableCol2imCPUKernel(
      num_kernels, data_col, data_offset, data_mask, im_shape[0], im_shape[1],
      im_shape[2], kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0],
      stride[1], dilation[0], dilation[1], channel_per_deformable_group,
      col_shape[1], deformable_group, col_shape[2], col_shape[3], grad_im);
}

template <typename T>
void ModulatedDeformableCol2imCoordCPUKernel(
    const int num_kernels, const T* data_col, const T* data_im,
    const T* data_offset, const T* data_mask, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int offset_channels, const int deformable_group,
    const int height_col, const int width_col, T* grad_offset, T* grad_mask) {
  for (int i = 0; i < num_kernels; i++) {
    T val = 0, mval = 0;
    const int w = i % width_col;
    const int h = (i / width_col) % height_col;
    const int c = (i / width_col / height_col) % offset_channels;
    const int b = (i / width_col / height_col) / offset_channels;

    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    const int col_step = kernel_h * kernel_w;
    int cnt = 0;
    const T* data_col_ptr = data_col + deformable_group_index *
                                           channel_per_deformable_group *
                                           batch_size * width_col * height_col;
    const T* data_im_ptr =
        data_im + (b * deformable_group + deformable_group_index) *
                      channel_per_deformable_group / kernel_h / kernel_w *
                      height * width;
    const T* data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T* data_mask_ptr =
        data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
                        kernel_w * height_col * width_col;
    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
         col_c += col_step) {
      const int col_pos =
          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      const int bp_dir = offset_c % 2;
      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i =
          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr =
          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr =
          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
           w_out);
      const int data_mask_hw_ptr =
          (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
      const T offset_h = data_offset_ptr[data_offset_h_ptr];
      const T offset_w = data_offset_ptr[data_offset_w_ptr];
      const T mask = data_mask_ptr[data_mask_hw_ptr];
      T inv_h = h_in + i * dilation_h + offset_h;
      T inv_w = w_in + j * dilation_w + offset_w;
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
        inv_h = inv_w = -2;
      } else {
        mval += data_col_ptr[col_pos] *
                DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width,
                                   height, width, inv_h, inv_w);
      }
      const T weight = DmcnGetCoordinateWeight(
          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
          width, bp_dir);
      val += weight * data_col_ptr[col_pos] * mask;
      cnt += 1;
    }
    grad_offset[i] = val;
    if (offset_c % 2 == 0)
      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
                      kernel_w +
                  offset_c / 2) *
                     height_col +
                 h) *
                    width_col +
                w] = mval;
  }
}

template <typename T>
static inline void ModulatedDeformableCol2imCoordCPU(
    const platform::CPUDeviceContext& ctx, const T* data_col, const T* data_im,
    const T* data_offset, const T* data_mask,
    const std::vector<int64_t> im_shape, const std::vector<int64_t> col_shape,
    const std::vector<int64_t> kernel_shape, const std::vector<int> paddings,
    const std::vector<int> strides, const std::vector<int> dilations,
    const int deformable_groups, T* grad_offset, T* grad_mask) {
  int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
                    col_shape[2] * col_shape[3] * deformable_groups;
  int channel_per_deformable_group = col_shape[0] / deformable_groups;

  ModulatedDeformableCol2imCoordCPUKernel(
      num_kernels, data_col, data_im, data_offset, data_mask, im_shape[0],
      im_shape[1], im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0],
      paddings[1], strides[0], strides[1], dilations[0], dilations[1],
      channel_per_deformable_group, col_shape[1],
      2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
      deformable_groups, col_shape[2], col_shape[3], grad_offset, grad_mask);
}

template <typename T>
void ModulatedDeformableIm2colCPUKernel(
    const int num_kernels, const T* data_im, const T* data_offset,
    const T* data_mask, const int height, const int width, const int kernel_h,
    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col,
    const int width_col, T* data_col) {
  for (int i = 0; i < num_kernels; i++) {
    const int w_col = i % width_col;
    const int h_col = (i / width_col) % height_col;
    const int b_col = (i / width_col) / height_col % batch_size;
    const int c_im = (i / width_col / height_col) / batch_size;
    const int c_col = c_im * kernel_h * kernel_w;

    const int deformable_group_index = c_im / channel_per_deformable_group;

    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    T* data_col_ptr =
        data_col +
        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
    const T* data_im_ptr =
        data_im + (b_col * num_channels + c_im) * height * width;
    const T* data_offset_ptr =
        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T* data_mask_ptr =
        data_mask + (b_col * deformable_group + deformable_group_index) *
                        kernel_h * kernel_w * height_col * width_col;

    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        const int data_offset_h_ptr =
            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
        const int data_offset_w_ptr =
            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
            w_col;
        const int data_mask_hw_ptr =
            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;

        const T offset_h = data_offset_ptr[data_offset_h_ptr];
        const T offset_w = data_offset_ptr[data_offset_w_ptr];
        const T mask = data_mask_ptr[data_mask_hw_ptr];
        T val = static_cast<T>(0);
        const T h_im = h_in + i * dilation_h + offset_h;
        const T w_im = w_in + j * dilation_w + offset_w;
        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
          val =
              DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
        }
        *data_col_ptr = val * mask;
        data_col_ptr += batch_size * height_col * width_col;
      }
    }
  }
}

template <typename T>
static inline void ModulatedDeformableIm2colCPU(
    const platform::CPUDeviceContext& ctx, const T* data_im,
    const T* data_offset, const T* data_mask,
    const std::vector<int64_t> im_shape, const std::vector<int64_t> col_shape,
    const std::vector<int64_t> filter_shape, const std::vector<int> paddings,
    const std::vector<int> strides, const std::vector<int> dilations,
    const int deformable_groups, T* data_col) {
  int channel_per_deformable_group = im_shape[0] / deformable_groups;
  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];

  // get outputs of im2col with offset by bilinear interpolation
  ModulatedDeformableIm2colCPUKernel(
      num_kernels, data_im, data_offset, data_mask, im_shape[1], im_shape[2],
      filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
      strides[1], dilations[0], dilations[1], channel_per_deformable_group,
      col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
      data_col);
}

template <typename T>
void FilterGradAddupCPUKernel(const int nthreads, const int n,
                              const int height, const int width,
                              const T* dweight_3d, T* filter_grad) {
  for (int i = 0; i < nthreads; i++) {
    filter_grad[i] = filter_grad[i] + dweight_3d[i];
  }
}

template <typename T>
class DeformableConvGradCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Tensor* output_grad =
        ctx.Input<Tensor>(framework::GradVarName("Output"));
    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
    Tensor* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Offset"));
    Tensor* mask_grad = ctx.Output<Tensor>(framework::GradVarName("Mask"));

    const Tensor* input = ctx.Input<Tensor>("Input");
    Tensor offset = *ctx.Input<Tensor>("Offset");
    Tensor mask = *ctx.Input<Tensor>("Mask");
    Tensor filter = *ctx.Input<Tensor>("Filter");
    if (!input_grad && !filter_grad && !offset_grad && !mask_grad) return;

    int groups = ctx.Attr<int>("groups");
    int deformable_groups = ctx.Attr<int>("deformable_groups");
    int im2col_step = ctx.Attr<int>("im2col_step");
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");

    auto& dev_ctx = ctx.template device_context<CPUDeviceContext>();
    const int batch_size = static_cast<int>(input->dims()[0]);

    framework::DDim input_shape =
        phi::slice_ddim(input->dims(), 1, input->dims().size());
    std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
    std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
    std::vector<int64_t> output_shape_vec(phi::vectorize(output_grad->dims()));

    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
    col_buffer_shape_vec[0] =
        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
    col_buffer_shape_vec[1] = im2col_step;
    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
    }
    framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec));
    std::vector<int64_t> output_buffer_shape_vec(1);
    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
                                 output_shape_vec[2] * output_shape_vec[3];
    framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec));
    Tensor col_buffer;
    Tensor output_buffer;
    col_buffer = ctx.AllocateTmpTensor<T, CPUDeviceContext>(col_shape, dev_ctx);
    output_buffer =
        ctx.AllocateTmpTensor<T, CPUDeviceContext>(output_shape, dev_ctx);
    output_buffer.ShareDataWith(*output_grad);

    int64_t M =
        input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
    int64_t K = output_shape_vec[1] / groups;

    framework::DDim weight_3d_shape = {groups, K, M};
    framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K,
                                         N};
    framework::DDim col_buffer_3d_shape = {groups, M, N};
    framework::DDim filter_grad_shape = {groups, K, M};

    Tensor weight_3d;
    weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
    Tensor out_grad_4d;
    out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
    Tensor col_buffer_3d;
    col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);

    phi::funcs::SetConstant<CPUDeviceContext, T> set_zero;
    auto blas = phi::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);

    col_buffer.mutable_data<T>(ctx.GetPlace());
    col_buffer_3d.mutable_data<T>(ctx.GetPlace());
    out_grad_4d.mutable_data<T>(ctx.GetPlace());

    int input_dim = input->numel() / input->dims()[0];
    int input_offset_dim = offset.numel() / offset.dims()[0];
    int input_mask_dim = mask.numel() / mask.dims()[0];

    if (filter_grad) {
      filter_grad->mutable_data<T>(ctx.GetPlace());
      filter_grad->Resize(filter_grad_shape);
      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
    }
    if (input_grad) {
      input_grad->mutable_data<T>(ctx.GetPlace());
      set_zero(dev_ctx, input_grad, static_cast<T>(0));
    }
    if (offset_grad && mask_grad) {
      offset_grad->mutable_data<T>(ctx.GetPlace());
      mask_grad->mutable_data<T>(ctx.GetPlace());
      set_zero(dev_ctx, offset_grad, static_cast<T>(0));
      set_zero(dev_ctx, mask_grad, static_cast<T>(0));
    }

    for (int i = 0; i < batch_size / im2col_step; ++i) {
      Tensor out_grad_3d = out_grad_4d.Slice(i, i + 1).Resize(
          phi::slice_ddim(out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
      for (int g = 0; g < groups; ++g) {
        Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
            phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
        Tensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
            phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
        Tensor col_buffer_3d_slice =
            col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
        blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0),
                    &col_buffer_3d_slice, T(0.0));
      }
      col_buffer.Resize(col_shape);

      T* col_buffer_ptr = col_buffer.data<T>();
      const T* input_ptr = input->data<T>();
      const T* offset_ptr = offset.data<T>();
      const T* mask_ptr = mask.data<T>();

      if (mask_grad && offset_grad) {
        T* offset_grad_ptr = offset_grad->data<T>();
        T* mask_grad_ptr = mask_grad->data<T>();
        // get grad of offset and mask
        ModulatedDeformableCol2imCoordCPU(
            ctx.template device_context<CPUDeviceContext>(), col_buffer_ptr,
            input_ptr + i * im2col_step * input_dim,
            offset_ptr + i * im2col_step * input_offset_dim,
            mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
            col_buffer_shape_vec, filter_shape_vec, paddings, strides,
            dilations, deformable_groups,
            offset_grad_ptr + i * im2col_step * input_offset_dim,
            mask_grad_ptr + i * im2col_step * input_mask_dim);
      }
      if (input_grad) {
        T* input_grad_ptr = input_grad->data<T>();
        // get grad of input
        ModulatedDeformableCol2imCPU(
            ctx.template device_context<CPUDeviceContext>(), col_buffer_ptr,
            offset_ptr + i * im2col_step * input_offset_dim,
            mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
            col_buffer_shape_vec, filter_shape_vec, paddings, strides,
            dilations, deformable_groups,
            input_grad_ptr + i * im2col_step * input_dim);
        input_grad->Resize(input->dims());
      }

      ModulatedDeformableIm2colCPU(
          ctx.template device_context<CPUDeviceContext>(),
          input_ptr + i * im2col_step * input_dim,
          offset_ptr + i * im2col_step * input_offset_dim,
          mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
          col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations,
          deformable_groups, col_buffer_ptr);

      col_buffer_3d.Resize(col_buffer_3d_shape);

      if (filter_grad) {
        Tensor dweight_3d;
        dweight_3d = ctx.AllocateTmpTensor<T, CPUDeviceContext>(
            filter_grad_shape, dev_ctx);
        for (int g = 0; g < groups; ++g) {
          Tensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
              phi::slice_ddim(out_grad_3d.dims(), 1,
                              out_grad_3d.dims().size()));
          Tensor col_buffer_3d_slice =
              col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
                  col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
          Tensor dweight_3d_slice = dweight_3d.Slice(g, g + 1).Resize(
              phi::slice_ddim(dweight_3d.dims(), 1, dweight_3d.dims().size()));

          blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
                      T(1.0), &dweight_3d_slice, T(0.0));
        }
        // update grad of weights
        FilterGradAddupCPUKernel(dweight_3d.numel(), groups, K, M,
                                 dweight_3d.data<T>(), filter_grad->data<T>());
      }
    }
    if (filter_grad) {
      filter_grad->Resize(filter.dims());
    }
  }
};

}  // namespace operators
}  // namespace paddle
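Both the CPU path above and the GPU path rely on DmcnIm2colBilinear and the other Dmcn* weight helpers from deformable_conv_func.h (also deleted in this commit but not shown in this hunk). A minimal standalone sketch of the bilinear sampling those helpers are assumed to perform, following the original msracver Deformable-ConvNets formulation, is below; BilinearSample is an illustrative name, not the deleted Paddle function.

// Assumed semantics only: sample bottom_data at fractional (h, w) with
// bilinear weights, treating corners that fall outside the map as zero.
#include <cmath>
#include <cstdio>

template <typename T>
T BilinearSample(const T* bottom_data, const int data_width, const int height,
                 const int width, T h, T w) {
  int h_low = static_cast<int>(std::floor(h));
  int w_low = static_cast<int>(std::floor(w));
  int h_high = h_low + 1;
  int w_high = w_low + 1;

  T lh = h - h_low, lw = w - w_low;  // fractional parts
  T hh = 1 - lh, hw = 1 - lw;

  // Corner values, zero when the corner is outside the feature map.
  T v1 = (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low]
                                    : static_cast<T>(0);
  T v2 = (h_low >= 0 && w_high <= width - 1)
             ? bottom_data[h_low * data_width + w_high] : static_cast<T>(0);
  T v3 = (h_high <= height - 1 && w_low >= 0)
             ? bottom_data[h_high * data_width + w_low] : static_cast<T>(0);
  T v4 = (h_high <= height - 1 && w_high <= width - 1)
             ? bottom_data[h_high * data_width + w_high] : static_cast<T>(0);

  return hh * hw * v1 + hh * lw * v2 + lh * hw * v3 + lh * lw * v4;
}

int main() {
  // 2x2 map; sampling at (0.5, 0.5) averages the four corners -> 2.5.
  const float data[4] = {1.f, 2.f, 3.f, 4.f};
  std::printf("%f\n", BilinearSample(data, 2, 2, 2, 0.5f, 0.5f));
  return 0;
}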
paddle/fluid/operators/deformable_conv_v1_op.cc    @ 31363c3f
@@ -12,9 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/operators/deformable_conv_v1_op.h"
 #include <memory>
-#include "paddle/fluid/operators/conv_op.h"
+
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/multiary.h"

 namespace paddle {
 namespace operators {

@@ -113,128 +115,6 @@ $$
 class DeformableConvV1Op : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input",
-                   "deformable_conv_v1");
-    OP_INOUT_CHECK(ctx->HasInput("Offset"), "Input", "Offset",
-                   "deformable_conv_v1");
-    OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter",
-                   "deformable_conv_v1");
-    OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output",
-                   "deformable_conv_v1");
-
-    auto in_dims = ctx->GetInputDim("Input");
-    auto filter_dims = ctx->GetInputDim("Filter");
-    auto offset_dims = ctx->GetInputDim("Offset");
-
-    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    std::vector<int> dilations =
-        ctx->Attrs().Get<std::vector<int>>("dilations");
-    int groups = ctx->Attrs().Get<int>("groups");
-    int deformable_groups = ctx->Attrs().Get<int>("deformable_groups");
-    int im2col_step = ctx->Attrs().Get<int>("im2col_step");
-
-    PADDLE_ENFORCE_EQ(
-        in_dims.size(), 4,
-        platform::errors::InvalidArgument(
-            "Conv input should be 4-D tensor, get %u", in_dims.size()));
-    PADDLE_ENFORCE_EQ(
-        in_dims.size(), filter_dims.size(),
-        platform::errors::InvalidArgument(
-            "Conv input dimension and filter dimension should be "
-            "the same. the difference is [%d] vs [%d]",
-            in_dims.size(), filter_dims.size()));
-    PADDLE_ENFORCE_EQ(
-        in_dims.size() - strides.size(), 2U,
-        platform::errors::InvalidArgument(
-            "Conv input dimension and strides "
-            "dimension should be consistent., But received [%d]: [%d]",
-            in_dims.size(), strides.size()));
-    PADDLE_ENFORCE_EQ(
-        paddings.size(), strides.size(),
-        platform::errors::InvalidArgument(
-            "Conv paddings dimension and Conv strides dimension "
-            "should be the same. The difference is [%d] vs [%d]",
-            paddings.size(), strides.size()));
-    PADDLE_ENFORCE_EQ(
-        in_dims[1], filter_dims[1] * groups,
-        platform::errors::InvalidArgument(
-            "The number of input channels should be equal to filter "
-            "channels * groups. The difference is [%d]: [%d]",
-            in_dims[1], filter_dims[1] * groups));
-    PADDLE_ENFORCE_EQ(
-        filter_dims[0] % groups, 0,
-        platform::errors::InvalidArgument(
-            "The number of output channels should be divided by groups. But"
-            "received output channels: [%d], groups: [%d]",
-            filter_dims[0], groups));
-    PADDLE_ENFORCE_EQ(
-        filter_dims[0] % deformable_groups, 0,
-        platform::errors::InvalidArgument(
-            "The number of output channels should be "
-            "divided by deformable groups. But received [%d]: [%d]",
-            filter_dims[0], deformable_groups));
-
-    if (in_dims[0] > im2col_step) {
-      PADDLE_ENFORCE_EQ(
-          in_dims[0] % im2col_step, 0U,
-          platform::errors::InvalidArgument(
-              "Input batchsize must be smaller than or divide "
-              "im2col_step, But received [%d]: [%d]",
-              in_dims[0], im2col_step));
-    }
-
-    for (size_t i = 0; i < strides.size(); ++i) {
-      PADDLE_ENFORCE_GT(strides[i], 0U,
-                        platform::errors::InvalidArgument(
-                            "stride %d size incorrect", i));
-    }
-    for (size_t i = 0; i < dilations.size(); ++i) {
-      PADDLE_ENFORCE_GT(dilations[i], 0U,
-                        platform::errors::InvalidArgument(
-                            "dilation %d size incorrect", i));
-    }
-
-    std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
-    for (size_t i = 0; i < strides.size(); ++i) {
-      if ((!ctx->IsRuntime()) &&
-          (in_dims[i + 2] <= 0 || filter_dims[i + 2] <= 0)) {
-        output_shape.push_back(-1);
-      } else {
-        output_shape.push_back(ConvOutputSize(in_dims[i + 2],
-                                              filter_dims[i + 2], dilations[i],
-                                              paddings[i], strides[i]));
-      }
-    }
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(output_shape[1] % deformable_groups, 0U,
-                        platform::errors::InvalidArgument(
-                            "output num_filter must divide deformable group "
-                            "size. But received [%d]: [%d]",
-                            output_shape[1], deformable_groups));
-      PADDLE_ENFORCE_EQ(output_shape[2], offset_dims[2],
-                        platform::errors::InvalidArgument(
-                            "output height must equal to offset map height. "
-                            "The difference is [%d]: [%d]",
-                            output_shape[2], offset_dims[2]));
-      PADDLE_ENFORCE_EQ(output_shape[3], offset_dims[3],
-                        platform::errors::InvalidArgument(
-                            "output width must equal to offset map width. The "
-                            "difference is [%d]: [%d]",
-                            output_shape[3], offset_dims[3]));
-      PADDLE_ENFORCE_EQ(offset_dims[1] % (filter_dims[2] * filter_dims[3]), 0U,
-                        platform::errors::InvalidArgument(
-                            "offset filter must divide deformable group size. "
-                            "But received [%d]: [%d]",
-                            offset_dims[1], filter_dims[2] * filter_dims[3]));
-      PADDLE_ENFORCE_EQ(
-          offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
-          deformable_groups,
-          platform::errors::InvalidArgument(
-              "offset filter must divide deformable group size. But received "
-              "[%d]: [%d]",
-              offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
-              deformable_groups));
-    }
-
-    ctx->SetOutputDim("Output", phi::make_ddim(output_shape));
-  }

  protected:
   framework::OpKernelType GetExpectedKernelType(

@@ -300,15 +180,12 @@ class DeformableConvV1GradOp : public framework::OperatorWithKernel {
 }  // namespace paddle

 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(deformable_conv, DeformableConvV1InferShapeFunctor,
+                            PD_INFER_META(phi::DeformableConvInferMeta));
 REGISTER_OPERATOR(deformable_conv_v1, ops::DeformableConvV1Op,
                   ops::DeformableConvV1OpMaker,
                   ops::DeformableConvV1GradOpMaker<paddle::framework::OpDesc>,
-                  ops::DeformableConvV1GradOpMaker<paddle::imperative::OpBase>);
+                  ops::DeformableConvV1GradOpMaker<paddle::imperative::OpBase>,
+                  DeformableConvV1InferShapeFunctor);

 REGISTER_OPERATOR(deformable_conv_v1_grad, ops::DeformableConvV1GradOp);
-
-REGISTER_OP_CPU_KERNEL(deformable_conv_v1,
-                       ops::DeformableConvV1CPUKernel<float>,
-                       ops::DeformableConvV1CPUKernel<double>);
-REGISTER_OP_CPU_KERNEL(deformable_conv_v1_grad,
-                       ops::DeformableConvV1GradCPUKernel<float>,
-                       ops::DeformableConvV1GradCPUKernel<double>);
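This hunk shows the pattern the whole commit follows: the hand-written InferShape is deleted and shape inference is delegated to phi::DeformableConvInferMeta through DECLARE_INFER_SHAPE_FUNCTOR / PD_INFER_META, with the functor appended to REGISTER_OPERATOR. As a hedged illustration only, wiring a hypothetical operator would use the same shape; every my_op name below is invented and not part of this commit.

// Hypothetical example of the same wiring; "my_op", MyOpInferShapeFunctor and
// phi::MyOpInferMeta are illustration-only names.
DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(phi::MyOpInferMeta));
REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker,
                  ops::MyOpGradOpMaker<paddle::framework::OpDesc>,
                  ops::MyOpGradOpMaker<paddle::imperative::OpBase>,
                  MyOpInferShapeFunctor);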
paddle/fluid/operators/deformable_conv_v1_op.cu    deleted  100644 → 0  @ da478d1e
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Part of the following code in this file refs to
// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu
//
// Copyright (c) 2017 Microsoft
// Licensed under The Apache-2.0 License [see LICENSE for details]
// \file deformable_psroi_pooling.cu
// \brief
// \author Yi Li, Guodong Zhang, Jifeng Dai
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/deformable_conv_filter.cu.h"
#include "paddle/fluid/operators/deformable_conv_func.h"
#include "paddle/fluid/operators/deformable_conv_v1_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
CUDADeviceContext
=
paddle
::
platform
::
CUDADeviceContext
;
static
constexpr
int
kNumCUDAThread
=
512
;
static
constexpr
int
kNumMaximumNumBlock
=
4096
;
static
inline
int
NumBlock
(
const
int
N
)
{
return
std
::
min
((
N
+
kNumCUDAThread
-
1
)
/
kNumCUDAThread
,
kNumMaximumNumBlock
);
}
template
<
typename
T
>
__global__
void
DeformableCol2imCUDAKernel
(
const
int
nthreads
,
const
T
*
data_col
,
const
T
*
data_offset
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
channel_per_deformable_group
,
const
int
batch_size
,
const
int
deformable_group
,
const
int
height_col
,
const
int
width_col
,
T
*
grad_im
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
offset
=
blockDim
.
x
*
gridDim
.
x
;
for
(
size_t
thread
=
index
;
thread
<
nthreads
;
thread
+=
offset
)
{
const
int
j
=
(
thread
/
width_col
/
height_col
/
batch_size
)
%
kernel_w
;
const
int
i
=
(
thread
/
width_col
/
height_col
/
batch_size
/
kernel_w
)
%
kernel_h
;
const
int
c
=
thread
/
width_col
/
height_col
/
batch_size
/
kernel_w
/
kernel_h
;
const
int
deformable_group_index
=
c
/
channel_per_deformable_group
;
int
w_out
=
thread
%
width_col
;
int
h_out
=
(
thread
/
width_col
)
%
height_col
;
int
b
=
(
thread
/
width_col
/
height_col
)
%
batch_size
;
int
w_in
=
w_out
*
stride_w
-
pad_w
;
int
h_in
=
h_out
*
stride_h
-
pad_h
;
const
T
*
data_offset_ptr
=
data_offset
+
(
b
*
deformable_group
+
deformable_group_index
)
*
2
*
kernel_h
*
kernel_w
*
height_col
*
width_col
;
const
int
data_offset_h_ptr
=
((
2
*
(
i
*
kernel_w
+
j
))
*
height_col
+
h_out
)
*
width_col
+
w_out
;
const
int
data_offset_w_ptr
=
((
2
*
(
i
*
kernel_w
+
j
)
+
1
)
*
height_col
+
h_out
)
*
width_col
+
w_out
;
const
T
offset_h
=
data_offset_ptr
[
data_offset_h_ptr
];
const
T
offset_w
=
data_offset_ptr
[
data_offset_w_ptr
];
const
T
cur_inv_h_data
=
h_in
+
i
*
dilation_h
+
offset_h
;
const
T
cur_inv_w_data
=
w_in
+
j
*
dilation_w
+
offset_w
;
const
T
cur_top_grad
=
data_col
[
thread
];
const
int
cur_h
=
static_cast
<
int
>
(
cur_inv_h_data
);
const
int
cur_w
=
static_cast
<
int
>
(
cur_inv_w_data
);
for
(
int
dy
=
-
2
;
dy
<=
2
;
dy
++
)
{
for
(
int
dx
=
-
2
;
dx
<=
2
;
dx
++
)
{
if
(
cur_h
+
dy
>=
0
&&
cur_h
+
dy
<
height
&&
cur_w
+
dx
>=
0
&&
cur_w
+
dx
<
width
&&
abs
(
cur_inv_h_data
-
(
cur_h
+
dy
))
<
1
&&
abs
(
cur_inv_w_data
-
(
cur_w
+
dx
))
<
1
)
{
int
cur_bottom_grad_pos
=
((
b
*
channels
+
c
)
*
height
+
cur_h
+
dy
)
*
width
+
cur_w
+
dx
;
T
weight
=
DmcnGetGradientWeight
(
cur_inv_h_data
,
cur_inv_w_data
,
cur_h
+
dy
,
cur_w
+
dx
,
height
,
width
);
platform
::
CudaAtomicAdd
(
grad_im
+
cur_bottom_grad_pos
,
weight
*
cur_top_grad
);
}
}
}
}
}
template
<
typename
T
>
inline
void
DeformableCol2im
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
T
*
data_col
,
const
T
*
data_offset
,
const
std
::
vector
<
int64_t
>
im_shape
,
const
std
::
vector
<
int64_t
>
col_shape
,
const
std
::
vector
<
int64_t
>
kernel_shape
,
const
std
::
vector
<
int
>
pad
,
const
std
::
vector
<
int
>
stride
,
const
std
::
vector
<
int
>
dilation
,
const
int
deformable_group
,
T
*
grad_im
)
{
int
channel_per_deformable_group
=
im_shape
[
0
]
/
deformable_group
;
int
num_kernels
=
col_shape
[
0
]
*
col_shape
[
1
]
*
col_shape
[
2
]
*
col_shape
[
3
];
int
blocks
=
NumBlock
(
num_kernels
);
int
threads
=
kNumCUDAThread
;
DeformableCol2imCUDAKernel
<
T
><<<
blocks
,
threads
,
0
,
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
()
>>>
(
num_kernels
,
data_col
,
data_offset
,
im_shape
[
0
],
im_shape
[
1
],
im_shape
[
2
],
kernel_shape
[
2
],
kernel_shape
[
3
],
pad
[
0
],
pad
[
1
],
stride
[
0
],
stride
[
1
],
dilation
[
0
],
dilation
[
1
],
channel_per_deformable_group
,
col_shape
[
1
],
deformable_group
,
col_shape
[
2
],
col_shape
[
3
],
grad_im
);
}
template
<
typename
T
>
__global__
void
DeformableCol2imCoordCUDAKernel
(
const
int
nthreads
,
const
T
*
data_col
,
const
T
*
data_im
,
const
T
*
data_offset
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
channel_per_deformable_group
,
const
int
batch_size
,
const
int
offset_channels
,
const
int
deformable_group
,
const
int
height_col
,
const
int
width_col
,
T
*
grad_offset
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
offset
=
blockDim
.
x
*
gridDim
.
x
;
for
(
size_t
i
=
index
;
i
<
nthreads
;
i
+=
offset
)
{
T
val
=
0
,
mval
=
0
;
const
int
w
=
i
%
width_col
;
const
int
h
=
(
i
/
width_col
)
%
height_col
;
const
int
c
=
(
i
/
width_col
/
height_col
)
%
offset_channels
;
const
int
b
=
(
i
/
width_col
/
height_col
)
/
offset_channels
;
const
int
deformable_group_index
=
c
/
(
2
*
kernel_h
*
kernel_w
);
const
int
col_step
=
kernel_h
*
kernel_w
;
int
cnt
=
0
;
const
T
*
data_col_ptr
=
data_col
+
deformable_group_index
*
channel_per_deformable_group
*
batch_size
*
width_col
*
height_col
;
const
T
*
data_im_ptr
=
data_im
+
(
b
*
deformable_group
+
deformable_group_index
)
*
channel_per_deformable_group
/
kernel_h
/
kernel_w
*
height
*
width
;
const
T
*
data_offset_ptr
=
data_offset
+
(
b
*
deformable_group
+
deformable_group_index
)
*
2
*
kernel_h
*
kernel_w
*
height_col
*
width_col
;
const
int
offset_c
=
c
-
deformable_group_index
*
2
*
kernel_h
*
kernel_w
;
for
(
int
col_c
=
offset_c
/
2
;
col_c
<
channel_per_deformable_group
;
col_c
+=
col_step
)
{
const
int
col_pos
=
(((
col_c
*
batch_size
+
b
)
*
height_col
)
+
h
)
*
width_col
+
w
;
const
int
bp_dir
=
offset_c
%
2
;
int
j
=
(
col_pos
/
width_col
/
height_col
/
batch_size
)
%
kernel_w
;
int
i
=
(
col_pos
/
width_col
/
height_col
/
batch_size
/
kernel_w
)
%
kernel_h
;
int
w_out
=
col_pos
%
width_col
;
int
h_out
=
(
col_pos
/
width_col
)
%
height_col
;
int
w_in
=
w_out
*
stride_w
-
pad_w
;
int
h_in
=
h_out
*
stride_h
-
pad_h
;
const
int
data_offset_h_ptr
=
(((
2
*
(
i
*
kernel_w
+
j
))
*
height_col
+
h_out
)
*
width_col
+
w_out
);
const
int
data_offset_w_ptr
=
(((
2
*
(
i
*
kernel_w
+
j
)
+
1
)
*
height_col
+
h_out
)
*
width_col
+
w_out
);
const
T
offset_h
=
data_offset_ptr
[
data_offset_h_ptr
];
const
T
offset_w
=
data_offset_ptr
[
data_offset_w_ptr
];
T
inv_h
=
h_in
+
i
*
dilation_h
+
offset_h
;
T
inv_w
=
w_in
+
j
*
dilation_w
+
offset_w
;
if
(
inv_h
<=
-
1
||
inv_w
<=
-
1
||
inv_h
>=
height
||
inv_w
>=
width
)
{
inv_h
=
inv_w
=
-
2
;
}
else
{
mval
+=
data_col_ptr
[
col_pos
]
*
DmcnIm2colBilinear
(
data_im_ptr
+
cnt
*
height
*
width
,
width
,
height
,
width
,
inv_h
,
inv_w
);
}
const
T
weight
=
DmcnGetCoordinateWeight
(
inv_h
,
inv_w
,
height
,
width
,
data_im_ptr
+
cnt
*
height
*
width
,
width
,
bp_dir
);
val
+=
weight
*
data_col_ptr
[
col_pos
];
cnt
+=
1
;
}
grad_offset
[
i
]
=
val
;
}
}
template
<
typename
T
>
inline
void
DeformableCol2imCoord
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
T
*
data_col
,
const
T
*
data_im
,
const
T
*
data_offset
,
const
std
::
vector
<
int64_t
>
im_shape
,
const
std
::
vector
<
int64_t
>
col_shape
,
const
std
::
vector
<
int64_t
>
kernel_shape
,
const
std
::
vector
<
int
>
paddings
,
const
std
::
vector
<
int
>
strides
,
const
std
::
vector
<
int
>
dilations
,
const
int
deformable_groups
,
T
*
grad_offset
)
{
int
num_kernels
=
2
*
kernel_shape
[
2
]
*
kernel_shape
[
3
]
*
col_shape
[
1
]
*
col_shape
[
2
]
*
col_shape
[
3
]
*
deformable_groups
;
int
channel_per_deformable_group
=
col_shape
[
0
]
/
deformable_groups
;
int
blocks
=
NumBlock
(
num_kernels
);
int
threads
=
kNumCUDAThread
;
DeformableCol2imCoordCUDAKernel
<
T
><<<
blocks
,
threads
,
0
,
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
()
>>>
(
num_kernels
,
data_col
,
data_im
,
data_offset
,
im_shape
[
0
],
im_shape
[
1
],
im_shape
[
2
],
kernel_shape
[
2
],
kernel_shape
[
3
],
paddings
[
0
],
paddings
[
1
],
strides
[
0
],
strides
[
1
],
dilations
[
0
],
dilations
[
1
],
channel_per_deformable_group
,
col_shape
[
1
],
2
*
kernel_shape
[
2
]
*
kernel_shape
[
3
]
*
deformable_groups
,
deformable_groups
,
col_shape
[
2
],
col_shape
[
3
],
grad_offset
);
}
template
<
typename
T
>
__global__
void
DeformableIm2colCUDAKernel
(
const
int
nthreads
,
const
T
*
data_im
,
const
T
*
data_offset
,
const
int
height
,
const
int
width
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
channel_per_deformable_group
,
const
int
batch_size
,
const
int
num_channels
,
const
int
deformable_group
,
const
int
height_col
,
const
int
width_col
,
T
*
data_col
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
offset
=
blockDim
.
x
*
gridDim
.
x
;
for
(
size_t
i
=
index
;
i
<
nthreads
;
i
+=
offset
)
{
const
int
w_col
=
i
%
width_col
;
const
int
h_col
=
(
i
/
width_col
)
%
height_col
;
const
int
b_col
=
(
i
/
width_col
)
/
height_col
%
batch_size
;
const
int
c_im
=
(
i
/
width_col
/
height_col
)
/
batch_size
;
const
int
c_col
=
c_im
*
kernel_h
*
kernel_w
;
const
int
deformable_group_index
=
c_im
/
channel_per_deformable_group
;
const
int
h_in
=
h_col
*
stride_h
-
pad_h
;
const
int
w_in
=
w_col
*
stride_w
-
pad_w
;
T
*
data_col_ptr
=
data_col
+
((
c_col
*
batch_size
+
b_col
)
*
height_col
+
h_col
)
*
width_col
+
w_col
;
const
T
*
data_im_ptr
=
data_im
+
(
b_col
*
num_channels
+
c_im
)
*
height
*
width
;
const
T
*
data_offset_ptr
=
data_offset
+
(
b_col
*
deformable_group
+
deformable_group_index
)
*
2
*
kernel_h
*
kernel_w
*
height_col
*
width_col
;
for
(
int
i
=
0
;
i
<
kernel_h
;
++
i
)
{
for
(
int
j
=
0
;
j
<
kernel_w
;
++
j
)
{
const
int
data_offset_h_ptr
=
((
2
*
(
i
*
kernel_w
+
j
))
*
height_col
+
h_col
)
*
width_col
+
w_col
;
const
int
data_offset_w_ptr
=
((
2
*
(
i
*
kernel_w
+
j
)
+
1
)
*
height_col
+
h_col
)
*
width_col
+
w_col
;
const
T
offset_h
=
data_offset_ptr
[
data_offset_h_ptr
];
const
T
offset_w
=
data_offset_ptr
[
data_offset_w_ptr
];
T
val
=
static_cast
<
T
>
(
0
);
const
T
h_im
=
h_in
+
i
*
dilation_h
+
offset_h
;
const
T
w_im
=
w_in
+
j
*
dilation_w
+
offset_w
;
if
(
h_im
>
-
1
&&
w_im
>
-
1
&&
h_im
<
height
&&
w_im
<
width
)
{
val
=
DmcnIm2colBilinear
(
data_im_ptr
,
width
,
height
,
width
,
h_im
,
w_im
);
}
*
data_col_ptr
=
val
;
data_col_ptr
+=
batch_size
*
height_col
*
width_col
;
}
}
}
}
template
<
typename
T
>
inline
void
DeformableIm2col
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
T
*
data_im
,
const
T
*
data_offset
,
const
std
::
vector
<
int64_t
>
im_shape
,
const
std
::
vector
<
int64_t
>
col_shape
,
const
std
::
vector
<
int64_t
>
filter_shape
,
const
std
::
vector
<
int
>
paddings
,
const
std
::
vector
<
int
>
strides
,
const
std
::
vector
<
int
>
dilations
,
const
int
deformable_groups
,
T
*
data_col
)
{
int
channel_per_deformable_group
=
im_shape
[
0
]
/
deformable_groups
;
int
num_kernels
=
im_shape
[
0
]
*
col_shape
[
1
]
*
col_shape
[
2
]
*
col_shape
[
3
];
int
blocks
=
NumBlock
(
num_kernels
);
int
threads
=
kNumCUDAThread
;
// get outputs of im2col with offset by bilinear interpolation
DeformableIm2colCUDAKernel
<
T
><<<
blocks
,
threads
,
0
,
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
()
>>>
(
num_kernels
,
data_im
,
data_offset
,
im_shape
[
1
],
im_shape
[
2
],
filter_shape
[
2
],
filter_shape
[
3
],
paddings
[
0
],
paddings
[
1
],
strides
[
0
],
strides
[
1
],
dilations
[
0
],
dilations
[
1
],
channel_per_deformable_group
,
col_shape
[
1
],
im_shape
[
0
],
deformable_groups
,
col_shape
[
2
],
col_shape
[
3
],
data_col
);
}
template <typename T>
class DeformableConvV1CUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Tensor* input = ctx.Input<Tensor>("Input");
    const Tensor offset = *ctx.Input<Tensor>("Offset");
    Tensor filter = *ctx.Input<Tensor>("Filter");
    Tensor* output = ctx.Output<Tensor>("Output");
    output->mutable_data<T>(ctx.GetPlace());

    auto& dev_ctx = ctx.template device_context<CUDADeviceContext>();

    const int groups = ctx.Attr<int>("groups");
    const int deformable_groups = ctx.Attr<int>("deformable_groups");
    const int im2col_step = ctx.Attr<int>("im2col_step");
    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");

    const int batch_size = static_cast<int>(input->dims()[0]);

    std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
    std::vector<int64_t> output_shape_vec(phi::vectorize(output->dims()));

    // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w}
    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
    col_buffer_shape_vec[0] =
        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
    col_buffer_shape_vec[1] = im2col_step;
    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
    }
    framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec));
    std::vector<int64_t> output_buffer_shape_vec(1);
    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
                                 output_shape_vec[2] * output_shape_vec[3];
    framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec));
    Tensor col_buffer;
    Tensor output_buffer;
    col_buffer = ctx.AllocateTmpTensor<T, CUDADeviceContext>(col_shape, dev_ctx);
    output_buffer =
        ctx.AllocateTmpTensor<T, CUDADeviceContext>(output_shape, dev_ctx);

    int64_t M = output_shape_vec[1] / groups;
    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
    int64_t K =
        input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;

    Tensor weight_3d;
    weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K}));
    Tensor col_buffer_3d;
    col_buffer_3d.ShareDataWith(col_buffer)
        .Resize(phi::make_ddim({groups, K, N}));
    Tensor output_4d;
    output_4d.ShareDataWith(output_buffer)
        .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N}));
    output_4d.mutable_data<T>(ctx.GetPlace());
    framework::DDim input_shape =
        phi::slice_ddim(input->dims(), 1, input->dims().size());
    std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);

    int input_dim = input->numel() / input->dims()[0];
    int input_offset_dim = offset.numel() / offset.dims()[0];

    auto blas = phi::funcs::GetBlas<CUDADeviceContext, T>(dev_ctx);

    const T* input_ptr = input->data<T>();
    const T* offset_ptr = offset.data<T>();
    col_buffer.mutable_data<T>(ctx.GetPlace());
    T* col_buffer_ptr = col_buffer.data<T>();

    for (int i = 0; i < batch_size / im2col_step; ++i) {
      DeformableIm2col(dev_ctx, input_ptr + i * im2col_step * input_dim,
                       offset_ptr + i * im2col_step * input_offset_dim,
                       input_shape_vec, col_buffer_shape_vec, filter_shape_vec,
                       paddings, strides, dilations, deformable_groups,
                       col_buffer_ptr);

      Tensor output_3d = output_4d.Slice(i, i + 1).Resize(
          phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
      // get the product of pixel and weight
      for (int g = 0; g < groups; ++g) {
        Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
            phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
        Tensor col_buffer_3d_slice =
            col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
        Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize(
            phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size()));

        blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0),
                    &output_3d_slice, T(0.0));
      }
    }
    output->ShareDataWith(output_buffer)
        .Resize(phi::make_ddim(output_shape_vec));
  }
};
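The per-group GEMM above multiplies weight_3d of shape {groups, M, K} with col_buffer_3d of shape {groups, K, N}, producing output slices of shape {M, N}. As a concrete check of that shape bookkeeping, a standalone sketch with illustrative numbers (all values here are examples we chose, not part of the diff):

#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative configuration: NCHW input 8x64x32x32, 3x3 filter with 128
  // output channels, stride 1, padding 1, groups = 2, im2col_step = 4.
  const int64_t out_channels = 128, in_channels = 64;
  const int64_t k_h = 3, k_w = 3, o_h = 32, o_w = 32;
  const int64_t groups = 2, im2col_step = 4;

  const int64_t M = out_channels / groups;             // 64
  const int64_t N = im2col_step * o_h * o_w;            // 4096
  const int64_t K = in_channels * k_h * k_w / groups;   // 288

  std::printf("weight_3d      : {%lld, %lld, %lld}\n",
              (long long)groups, (long long)M, (long long)K);
  std::printf("col_buffer_3d  : {%lld, %lld, %lld}\n",
              (long long)groups, (long long)K, (long long)N);
  std::printf("output_4d slice: {%lld, %lld}\n", (long long)M, (long long)N);
  return 0;
}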
template <typename T>
class DeformableConvV1GradCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Tensor* output_grad =
        ctx.Input<Tensor>(framework::GradVarName("Output"));
    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
    Tensor* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Offset"));

    const Tensor* input = ctx.Input<Tensor>("Input");
    Tensor offset = *ctx.Input<Tensor>("Offset");
    Tensor filter = *ctx.Input<Tensor>("Filter");
    if (!input_grad && !filter_grad && !offset_grad) return;

    int groups = ctx.Attr<int>("groups");
    int deformable_groups = ctx.Attr<int>("deformable_groups");
    int im2col_step = ctx.Attr<int>("im2col_step");
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");

    auto& dev_ctx = ctx.template device_context<CUDADeviceContext>();
    const int batch_size = static_cast<int>(input->dims()[0]);

    framework::DDim input_shape =
        phi::slice_ddim(input->dims(), 1, input->dims().size());
    std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
    std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
    std::vector<int64_t> output_shape_vec(phi::vectorize(output_grad->dims()));

    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
    col_buffer_shape_vec[0] =
        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
    col_buffer_shape_vec[1] = im2col_step;
    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
    }
    framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec));
    std::vector<int64_t> output_buffer_shape_vec(1);
    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
                                 output_shape_vec[2] * output_shape_vec[3];
    framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec));
    Tensor col_buffer;
    Tensor output_buffer;
    col_buffer = ctx.AllocateTmpTensor<T, CUDADeviceContext>(col_shape, dev_ctx);
    output_buffer =
        ctx.AllocateTmpTensor<T, CUDADeviceContext>(output_shape, dev_ctx);

    output_buffer.ShareDataWith(*output_grad);

    int64_t M =
        input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
    int64_t K = output_shape_vec[1] / groups;

    framework::DDim weight_3d_shape = {groups, K, M};
    framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K, N};
    framework::DDim col_buffer_3d_shape = {groups, M, N};
    framework::DDim filter_grad_shape = {groups, K, M};

    Tensor weight_3d;
    weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
    Tensor out_grad_4d;
    out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
    Tensor col_buffer_3d;
    col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);

    phi::funcs::SetConstant<CUDADeviceContext, T> set_zero;
    auto blas = phi::funcs::GetBlas<CUDADeviceContext, T>(dev_ctx);

    col_buffer.mutable_data<T>(ctx.GetPlace());
    col_buffer_3d.mutable_data<T>(ctx.GetPlace());
    out_grad_4d.mutable_data<T>(ctx.GetPlace());

    int input_dim = input->numel() / input->dims()[0];
    int input_offset_dim = offset.numel() / offset.dims()[0];

    if (filter_grad) {
      filter_grad->mutable_data<T>(ctx.GetPlace());
      filter_grad->Resize(filter_grad_shape);
      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
    }

    if (input_grad) {
      input_grad->mutable_data<T>(ctx.GetPlace());
      set_zero(dev_ctx, input_grad, static_cast<T>(0));
    }

    if (offset_grad) {
      offset_grad->mutable_data<T>(ctx.GetPlace());
      set_zero(dev_ctx, offset_grad, static_cast<T>(0));
    }

    for (int i = 0; i < batch_size / im2col_step; ++i) {
      Tensor out_grad_3d = out_grad_4d.Slice(i, i + 1).Resize(
          phi::slice_ddim(out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
      for (int g = 0; g < groups; ++g) {
        Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
            phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
        Tensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
            phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
        Tensor col_buffer_3d_slice =
            col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));

        blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0),
                    &col_buffer_3d_slice, T(0.0));
      }
      col_buffer.Resize(col_shape);

      T* col_buffer_ptr = col_buffer.data<T>();
      const T* input_ptr = input->data<T>();
      const T* offset_ptr = offset.data<T>();

      if (offset_grad) {
        T* offset_grad_ptr = offset_grad->data<T>();
        // get grad of offset
        DeformableCol2imCoord(
            dev_ctx, col_buffer_ptr, input_ptr + i * im2col_step * input_dim,
            offset_ptr + i * im2col_step * input_offset_dim, input_shape_vec,
            col_buffer_shape_vec, filter_shape_vec, paddings, strides,
            dilations, deformable_groups,
            offset_grad_ptr + i * im2col_step * input_offset_dim);
      }
      if (input_grad) {
        T* input_grad_ptr = input_grad->data<T>();
        // get grad of input
        DeformableCol2im(dev_ctx, col_buffer_ptr,
                         offset_ptr + i * im2col_step * input_offset_dim,
                         input_shape_vec, col_buffer_shape_vec,
                         filter_shape_vec, paddings, strides, dilations,
                         deformable_groups,
                         input_grad_ptr + i * im2col_step * input_dim);
        input_grad->Resize(input->dims());
      }

      DeformableIm2col(dev_ctx, input_ptr + i * im2col_step * input_dim,
                       offset_ptr + i * im2col_step * input_offset_dim,
                       input_shape_vec, col_buffer_shape_vec, filter_shape_vec,
                       paddings, strides, dilations, deformable_groups,
                       col_buffer_ptr);

      col_buffer_3d.Resize(col_buffer_3d_shape);

      if (filter_grad) {
        Tensor dweight_3d;
        dweight_3d = ctx.AllocateTmpTensor<T, CUDADeviceContext>(
            filter_grad_shape, dev_ctx);
        for (int g = 0; g < groups; ++g) {
          Tensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
              phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
          Tensor col_buffer_3d_slice =
              col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
                  col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
          Tensor dweight_3d_slice = dweight_3d.Slice(g, g + 1).Resize(
              phi::slice_ddim(dweight_3d.dims(), 1, dweight_3d.dims().size()));

          blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
                      T(1.0), &dweight_3d_slice, T(0.0));
        }
        FilterGradAddupCUDAKernel<T><<<NumBlock(dweight_3d.numel()),
                                       kNumCUDAThread, 0, dev_ctx.stream()>>>(
            dweight_3d.numel(), groups, K, M, dweight_3d.data<T>(),
            filter_grad->data<T>());
      }
    }
    if (filter_grad) {
      filter_grad->Resize(filter.dims());
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_CUDA_KERNEL(deformable_conv_v1,
                        ops::DeformableConvV1CUDAKernel<float>,
                        ops::DeformableConvV1CUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(deformable_conv_v1_grad,
                        ops::DeformableConvV1GradCUDAKernel<float>,
                        ops::DeformableConvV1GradCUDAKernel<double>);
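Both the forward and backward paths above index the Offset input with the same layout: per sample and per deformable group there are 2 * kernel_h * kernel_w offset maps of size height_col x width_col, stored as (h-offset, w-offset) channel pairs per kernel position. A small helper that restates the index arithmetic used for data_offset_h_ptr and data_offset_w_ptr (the function name is ours, purely for illustration):

// Flat index of the h- or w-offset for kernel position (i, j) at output
// location (h_col, w_col), inside one (sample, deformable group) block of
// the Offset tensor. is_w selects the w-offset channel.
inline int OffsetIndex(int i, int j, int kernel_w, int h_col, int w_col,
                       int height_col, int width_col, bool is_w) {
  const int channel = 2 * (i * kernel_w + j) + (is_w ? 1 : 0);
  return (channel * height_col + h_col) * width_col + w_col;
}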
paddle/fluid/operators/deformable_conv_v1_op.h
deleted (100644 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Part of the following code in this file refs to
// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu
//
// Copyright (c) 2017 Microsoft
// Licensed under The Apache-2.0 License [see LICENSE for details]
// \file deformable_psroi_pooling.cu
// \brief
// \author Yi Li, Guodong Zhang, Jifeng Dai
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/deformable_conv_func.h"
#include "paddle/fluid/operators/deformable_conv_op.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using CPUDeviceContext = platform::CPUDeviceContext;

template <typename T>
void DeformableCol2imCPUKernel(
    const int num_kernels, const T* data_col, const T* data_offset,
    const int channels, const int height, const int width, const int kernel_h,
    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int deformable_group, const int height_col, const int width_col,
    T* grad_im) {
  for (int thread = 0; thread < num_kernels; thread++) {
    const int j = (thread / width_col / height_col / batch_size) % kernel_w;
    const int i =
        (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
    const int c =
        thread / width_col / height_col / batch_size / kernel_w / kernel_h;

    const int deformable_group_index = c / channel_per_deformable_group;

    int w_out = thread % width_col;
    int h_out = (thread / width_col) % height_col;
    int b = (thread / width_col / height_col) % batch_size;

    int w_in = w_out * stride_w - pad_w;
    int h_in = h_out * stride_h - pad_h;

    const T* data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const int data_offset_h_ptr =
        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
    const int data_offset_w_ptr =
        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
    const T offset_h = data_offset_ptr[data_offset_h_ptr];
    const T offset_w = data_offset_ptr[data_offset_w_ptr];
    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;

    const T cur_top_grad = data_col[thread];
    const int cur_h = static_cast<int>(cur_inv_h_data);
    const int cur_w = static_cast<int>(cur_inv_w_data);
    for (int dy = -2; dy <= 2; dy++) {
      for (int dx = -2; dx <= 2; dx++) {
        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
          int cur_bottom_grad_pos =
              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
          T weight =
              DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy,
                                    cur_w + dx, height, width);
          *(grad_im + cur_bottom_grad_pos) =
              *(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad;
        }
      }
    }
  }
}
template <typename T>
inline void DeformableCol2imCPU(const platform::CPUDeviceContext& ctx,
                                const T* data_col, const T* data_offset,
                                const std::vector<int64_t> im_shape,
                                const std::vector<int64_t> col_shape,
                                const std::vector<int64_t> kernel_shape,
                                const std::vector<int> pad,
                                const std::vector<int> stride,
                                const std::vector<int> dilation,
                                const int deformable_group, T* grad_im) {
  int channel_per_deformable_group = im_shape[0] / deformable_group;
  int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];

  DeformableCol2imCPUKernel(
      num_kernels, data_col, data_offset, im_shape[0], im_shape[1], im_shape[2],
      kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0], stride[1],
      dilation[0], dilation[1], channel_per_deformable_group, col_shape[1],
      deformable_group, col_shape[2], col_shape[3], grad_im);
}
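DmcnGetGradientWeight is likewise provided by deformable_conv_func.h; the scatter loop above only needs the bilinear weight that an integer pixel (h, w) contributed to the fractional sample at (argmax_h, argmax_w). Assuming the standard bilinear formulation, a compact sketch (the name is ours):

#include <cmath>

// Sketch: weight of integer pixel (h, w) in the bilinear sample taken at the
// fractional location (argmax_h, argmax_w); zero when the sample is outside
// the image or (h, w) is not one of its four neighbours.
template <typename T>
T BilinearGradWeight(T argmax_h, T argmax_w, int h, int w, int height, int width) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) {
    return T(0);
  }
  const T dh = T(1) - std::abs(argmax_h - h);
  const T dw = T(1) - std::abs(argmax_w - w);
  return (dh > 0 && dw > 0) ? dh * dw : T(0);
}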
template <typename T>
void DeformableCol2imCoordCPUKernel(
    const int num_kernels, const T* data_col, const T* data_im,
    const T* data_offset, const int channels, const int height,
    const int width, const int kernel_h, const int kernel_w, const int pad_h,
    const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int offset_channels, const int deformable_group,
    const int height_col, const int width_col, T* grad_offset) {
  for (int i = 0; i < num_kernels; i++) {
    T val = 0, mval = 0;
    const int w = i % width_col;
    const int h = (i / width_col) % height_col;
    const int c = (i / width_col / height_col) % offset_channels;
    const int b = (i / width_col / height_col) / offset_channels;

    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    const int col_step = kernel_h * kernel_w;
    int cnt = 0;
    const T* data_col_ptr = data_col + deformable_group_index *
                                           channel_per_deformable_group *
                                           batch_size * width_col * height_col;
    const T* data_im_ptr =
        data_im + (b * deformable_group + deformable_group_index) *
                      channel_per_deformable_group / kernel_h / kernel_w *
                      height * width;
    const T* data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;

    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
         col_c += col_step) {
      const int col_pos =
          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      const int bp_dir = offset_c % 2;

      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i =
          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr =
          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr =
          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
           w_out);
      const T offset_h = data_offset_ptr[data_offset_h_ptr];
      const T offset_w = data_offset_ptr[data_offset_w_ptr];
      T inv_h = h_in + i * dilation_h + offset_h;
      T inv_w = w_in + j * dilation_w + offset_w;
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
        inv_h = inv_w = -2;
      } else {
        mval += data_col_ptr[col_pos] *
                DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width,
                                   height, width, inv_h, inv_w);
      }
      const T weight =
          DmcnGetCoordinateWeight(inv_h, inv_w, height, width,
                                  data_im_ptr + cnt * height * width, width,
                                  bp_dir);
      val += weight * data_col_ptr[col_pos];
      cnt += 1;
    }
    grad_offset[i] = val;
  }
}
template <typename T>
inline void DeformableCol2imCoordCPU(
    const platform::CPUDeviceContext& ctx, const T* data_col, const T* data_im,
    const T* data_offset, const std::vector<int64_t> im_shape,
    const std::vector<int64_t> col_shape,
    const std::vector<int64_t> kernel_shape, const std::vector<int> paddings,
    const std::vector<int> strides, const std::vector<int> dilations,
    const int deformable_groups, T* grad_offset) {
  int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
                    col_shape[2] * col_shape[3] * deformable_groups;
  int channel_per_deformable_group = col_shape[0] / deformable_groups;

  DeformableCol2imCoordCPUKernel(
      num_kernels, data_col, data_im, data_offset, im_shape[0], im_shape[1],
      im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0], paddings[1],
      strides[0], strides[1], dilations[0], dilations[1],
      channel_per_deformable_group, col_shape[1],
      2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
      deformable_groups, col_shape[2], col_shape[3], grad_offset);
}
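DmcnGetCoordinateWeight supplies the derivative of the bilinear sample with respect to the sampling coordinate itself (bp_dir selecting d/dh versus d/dw), which is what turns column gradients into Offset gradients in the kernel above. A rough sketch under that assumption, with a name of our own choosing:

#include <cmath>

// Sketch: d(bilinear_sample)/d(coordinate) at fractional location (h, w).
// bp_dir == 0 differentiates w.r.t. h, bp_dir == 1 w.r.t. w.
template <typename T>
T CoordinateWeight(T h, T w, int height, int width, const T* im,
                   int data_width, int bp_dir) {
  if (h <= -1 || h >= height || w <= -1 || w >= width) return T(0);

  int h_low = static_cast<int>(std::floor(h));
  int w_low = static_cast<int>(std::floor(w));
  int h_high = h_low + 1, w_high = w_low + 1;
  T lh = h - h_low, lw = w - w_low;

  auto at = [&](int y, int x) -> T {
    return (y >= 0 && y < height && x >= 0 && x < width) ? im[y * data_width + x]
                                                         : T(0);
  };

  if (bp_dir == 0) {  // derivative in h: difference of the two w-interpolated rows
    return (1 - lw) * (at(h_high, w_low) - at(h_low, w_low)) +
           lw * (at(h_high, w_high) - at(h_low, w_high));
  }
  // derivative in w: difference of the two h-interpolated columns
  return (1 - lh) * (at(h_low, w_high) - at(h_low, w_low)) +
         lh * (at(h_high, w_high) - at(h_high, w_low));
}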
template <typename T>
void DeformableIm2colCPUKernel(
    const int num_kernels, const T* data_im, const T* data_offset,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col,
    const int width_col, T* data_col) {
  for (int i = 0; i < num_kernels; i++) {
    const int w_col = i % width_col;
    const int h_col = (i / width_col) % height_col;
    const int b_col = (i / width_col) / height_col % batch_size;
    const int c_im = (i / width_col / height_col) / batch_size;
    const int c_col = c_im * kernel_h * kernel_w;

    const int deformable_group_index = c_im / channel_per_deformable_group;

    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    T* data_col_ptr =
        data_col +
        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
    const T* data_im_ptr =
        data_im + (b_col * num_channels + c_im) * height * width;
    const T* data_offset_ptr =
        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;

    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        const int data_offset_h_ptr =
            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
        const int data_offset_w_ptr =
            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
            w_col;

        const T offset_h = data_offset_ptr[data_offset_h_ptr];
        const T offset_w = data_offset_ptr[data_offset_w_ptr];
        T val = static_cast<T>(0);
        const T h_im = h_in + i * dilation_h + offset_h;
        const T w_im = w_in + j * dilation_w + offset_w;
        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
          val = DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
        }
        *data_col_ptr = val;
        data_col_ptr += batch_size * height_col * width_col;
      }
    }
  }
}
template <typename T>
inline void DeformableIm2colCPU(const platform::CPUDeviceContext& ctx,
                                const T* data_im, const T* data_offset,
                                const std::vector<int64_t> im_shape,
                                const std::vector<int64_t> col_shape,
                                const std::vector<int64_t> filter_shape,
                                const std::vector<int> paddings,
                                const std::vector<int> strides,
                                const std::vector<int> dilations,
                                const int deformable_groups, T* data_col) {
  int channel_per_deformable_group = im_shape[0] / deformable_groups;
  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];

  // get outputs of im2col with offset by bilinear interpolation
  DeformableIm2colCPUKernel(
      num_kernels, data_im, data_offset, im_shape[1], im_shape[2],
      filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
      strides[1], dilations[0], dilations[1], channel_per_deformable_group,
      col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
      data_col);
}
template <typename T>
class DeformableConvV1CPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<Tensor>("Input");
    auto* offset = ctx.Input<Tensor>("Offset");
    Tensor filter = *ctx.Input<Tensor>("Filter");
    Tensor* output = ctx.Output<Tensor>("Output");
    output->mutable_data<T>(ctx.GetPlace());

    auto& dev_ctx = ctx.template device_context<CPUDeviceContext>();

    const int groups = ctx.Attr<int>("groups");
    const int deformable_groups = ctx.Attr<int>("deformable_groups");
    const int im2col_step = ctx.Attr<int>("im2col_step");
    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");

    const int batch_size = static_cast<int>(input->dims()[0]);

    std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
    std::vector<int64_t> output_shape_vec(phi::vectorize(output->dims()));

    // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w}
    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
    col_buffer_shape_vec[0] =
        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
    col_buffer_shape_vec[1] = im2col_step;
    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
    }
    framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec));
    std::vector<int64_t> output_buffer_shape_vec(1);
    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
                                 output_shape_vec[2] * output_shape_vec[3];
    framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec));
    Tensor col_buffer;
    Tensor output_buffer;
    col_buffer = ctx.AllocateTmpTensor<T, CPUDeviceContext>(col_shape, dev_ctx);
    output_buffer =
        ctx.AllocateTmpTensor<T, CPUDeviceContext>(output_shape, dev_ctx);

    int64_t M = output_shape_vec[1] / groups;
    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
    int64_t K =
        input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;

    Tensor weight_3d;
    weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K}));
    Tensor col_buffer_3d;
    col_buffer_3d.ShareDataWith(col_buffer)
        .Resize(phi::make_ddim({groups, K, N}));
    Tensor output_4d;
    output_4d.ShareDataWith(output_buffer)
        .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N}));
    output_4d.mutable_data<T>(ctx.GetPlace());
    framework::DDim input_shape =
        phi::slice_ddim(input->dims(), 1, input->dims().size());
    std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);

    int input_dim = input->numel() / input->dims()[0];
    int input_offset_dim = offset->numel() / offset->dims()[0];

    auto blas = phi::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);

    const T* input_ptr = input->data<T>();
    const T* offset_ptr = offset->data<T>();
    col_buffer.mutable_data<T>(ctx.GetPlace());
    T* col_buffer_ptr = col_buffer.data<T>();

    for (int i = 0; i < batch_size / im2col_step; ++i) {
      DeformableIm2colCPU(dev_ctx, input_ptr + i * im2col_step * input_dim,
                          offset_ptr + i * im2col_step * input_offset_dim,
                          input_shape_vec, col_buffer_shape_vec,
                          filter_shape_vec, paddings, strides, dilations,
                          deformable_groups, col_buffer_ptr);

      Tensor output_3d = output_4d.Slice(i, i + 1).Resize(
          phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
      // get the product of pixel and weight
      for (int g = 0; g < groups; ++g) {
        Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
            phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
        Tensor col_buffer_3d_slice =
            col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
        Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize(
            phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size()));

        blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0),
                    &output_3d_slice, T(0.0));
      }
    }
    output->ShareDataWith(output_buffer)
        .Resize(phi::make_ddim(output_shape_vec));
  }
};
template <typename T>
class DeformableConvV1GradCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Tensor* output_grad =
        ctx.Input<Tensor>(framework::GradVarName("Output"));
    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
    Tensor* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Offset"));

    const Tensor* input = ctx.Input<Tensor>("Input");
    Tensor offset = *ctx.Input<Tensor>("Offset");
    Tensor filter = *ctx.Input<Tensor>("Filter");
    if (!input_grad && !filter_grad && !offset_grad) return;

    int groups = ctx.Attr<int>("groups");
    int deformable_groups = ctx.Attr<int>("deformable_groups");
    int im2col_step = ctx.Attr<int>("im2col_step");
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");

    auto& dev_ctx = ctx.template device_context<CPUDeviceContext>();
    const int batch_size = static_cast<int>(input->dims()[0]);

    framework::DDim input_shape =
        phi::slice_ddim(input->dims(), 1, input->dims().size());
    std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
    std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
    std::vector<int64_t> output_shape_vec(phi::vectorize(output_grad->dims()));

    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
    col_buffer_shape_vec[0] =
        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
    col_buffer_shape_vec[1] = im2col_step;
    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
    }
    framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec));
    std::vector<int64_t> output_buffer_shape_vec(1);
    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
                                 output_shape_vec[2] * output_shape_vec[3];
    framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec));
    Tensor col_buffer;
    Tensor output_buffer;
    col_buffer = ctx.AllocateTmpTensor<T, CPUDeviceContext>(col_shape, dev_ctx);
    output_buffer =
        ctx.AllocateTmpTensor<T, CPUDeviceContext>(output_shape, dev_ctx);

    output_buffer.ShareDataWith(*output_grad);

    int64_t M =
        input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
    int64_t K = output_shape_vec[1] / groups;

    framework::DDim weight_3d_shape = {groups, K, M};
    framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K, N};
    framework::DDim col_buffer_3d_shape = {groups, M, N};
    framework::DDim filter_grad_shape = {groups, K, M};

    Tensor weight_3d;
    weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
    Tensor out_grad_4d;
    out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
    Tensor col_buffer_3d;
    col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);

    phi::funcs::SetConstant<CPUDeviceContext, T> set_zero;
    auto blas = phi::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);

    col_buffer.mutable_data<T>(ctx.GetPlace());
    col_buffer_3d.mutable_data<T>(ctx.GetPlace());
    out_grad_4d.mutable_data<T>(ctx.GetPlace());

    int input_dim = input->numel() / input->dims()[0];
    int input_offset_dim = offset.numel() / offset.dims()[0];

    if (filter_grad) {
      filter_grad->mutable_data<T>(ctx.GetPlace());
      filter_grad->Resize(filter_grad_shape);
      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
    }

    if (input_grad) {
      input_grad->mutable_data<T>(ctx.GetPlace());
      set_zero(dev_ctx, input_grad, static_cast<T>(0));
    }

    if (offset_grad) {
      offset_grad->mutable_data<T>(ctx.GetPlace());
      set_zero(dev_ctx, offset_grad, static_cast<T>(0));
    }

    for (int i = 0; i < batch_size / im2col_step; ++i) {
      Tensor out_grad_3d = out_grad_4d.Slice(i, i + 1).Resize(
          phi::slice_ddim(out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
      for (int g = 0; g < groups; ++g) {
        Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
            phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
        Tensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
            phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
        Tensor col_buffer_3d_slice =
            col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));

        blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0),
                    &col_buffer_3d_slice, T(0.0));
      }
      col_buffer.Resize(col_shape);

      T* col_buffer_ptr = col_buffer.data<T>();
      const T* input_ptr = input->data<T>();
      const T* offset_ptr = offset.data<T>();

      if (offset_grad) {
        T* offset_grad_ptr = offset_grad->data<T>();
        // get grad of offset
        DeformableCol2imCoordCPU(
            dev_ctx, col_buffer_ptr, input_ptr + i * im2col_step * input_dim,
            offset_ptr + i * im2col_step * input_offset_dim, input_shape_vec,
            col_buffer_shape_vec, filter_shape_vec, paddings, strides,
            dilations, deformable_groups,
            offset_grad_ptr + i * im2col_step * input_offset_dim);
      }
      if (input_grad) {
        T* input_grad_ptr = input_grad->data<T>();
        // get grad of input
        DeformableCol2imCPU(dev_ctx, col_buffer_ptr,
                            offset_ptr + i * im2col_step * input_offset_dim,
                            input_shape_vec, col_buffer_shape_vec,
                            filter_shape_vec, paddings, strides, dilations,
                            deformable_groups,
                            input_grad_ptr + i * im2col_step * input_dim);
        input_grad->Resize(input->dims());
      }

      DeformableIm2colCPU(dev_ctx, input_ptr + i * im2col_step * input_dim,
                          offset_ptr + i * im2col_step * input_offset_dim,
                          input_shape_vec, col_buffer_shape_vec,
                          filter_shape_vec, paddings, strides, dilations,
                          deformable_groups, col_buffer_ptr);

      col_buffer_3d.Resize(col_buffer_3d_shape);

      if (filter_grad) {
        Tensor dweight_3d;
        dweight_3d = ctx.AllocateTmpTensor<T, CPUDeviceContext>(
            filter_grad_shape, dev_ctx);
        for (int g = 0; g < groups; ++g) {
          Tensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
              phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
          Tensor col_buffer_3d_slice =
              col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
                  col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
          Tensor dweight_3d_slice = dweight_3d.Slice(g, g + 1).Resize(
              phi::slice_ddim(dweight_3d.dims(), 1, dweight_3d.dims().size()));

          blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
                      T(1.0), &dweight_3d_slice, T(0.0));
        }
        // update grad of weights
        FilterGradAddupCPUKernel(dweight_3d.numel(), groups, K, M,
                                 dweight_3d.data<T>(), filter_grad->data<T>());
      }
    }
    if (filter_grad) {
      filter_grad->Resize(filter.dims());
    }
  }
};

}  // namespace operators
}  // namespace paddle
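FilterGradAddupCPUKernel (and the CUDA twin used earlier in this diff) is defined in deformable_conv_op.h; the call sites only rely on it accumulating the freshly computed dweight_3d into the running filter gradient element by element. A minimal sketch, assuming exactly that behaviour (the extra shape arguments are kept only to mirror the call site):

// Sketch: accumulate the per-step weight gradient into filter_grad.
// groups/K/M mirror the arguments passed above, but only the element count
// matters for the accumulation itself.
template <typename T>
void FilterGradAddupSketch(const int nthreads, const int groups, const int K,
                           const int M, const T* dweight_3d, T* filter_grad) {
  for (int i = 0; i < nthreads; ++i) {
    filter_grad[i] = filter_grad[i] + dweight_3d[i];
  }
}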
paddle/fluid/operators/flatten_op.cc
@@ -17,7 +17,10 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"

 namespace paddle {
 namespace operators {
@@ -270,70 +273,24 @@ class Flatten2GradOp : public framework::OperatorWithKernel {
 class FlattenContiguousRangeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

   void InferShape(framework::InferShapeContext *ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FlattenContiguousRange");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
                    "FlattenContiguousRange");
     const auto &start_axis = ctx->Attrs().Get<int>("start_axis");
     const auto &stop_axis = ctx->Attrs().Get<int>("stop_axis");
-    const auto &in_dims = ctx->GetInputDim("X");
-    int in_dims_size = in_dims.size();
-    int real_start_axis = start_axis, real_stop_axis = stop_axis;
-    if (start_axis < 0) {
-      real_start_axis = start_axis + in_dims_size;
-    }
-    if (stop_axis < 0) {
-      real_stop_axis = stop_axis + in_dims_size;
-    }
-    PADDLE_ENFORCE_GE(real_stop_axis, real_start_axis,
-                      platform::errors::InvalidArgument(
-                          "The stop_axis should be greater"
-                          "than or equal to start_axis."));
-    const auto &out_dims =
-        GetOutputShape(real_start_axis, real_stop_axis, in_dims);
-    ctx->SetOutputDim("Out", phi::make_ddim(out_dims));
-    if (in_dims[0] == out_dims[0]) {
-      // Only pass LoD when the first dimension of output and Input(X)
-      // are the same.
-      ctx->ShareLoD("X", "Out");
-    }
-    if (!ctx->HasOutput("XShape")) return;
-    // OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape", "Flatten2");
-    std::vector<int64_t> xshape_dims(in_dims.size() + 1);
-    xshape_dims[0] = 0;
-    for (int i = 0; i < in_dims.size(); ++i) {
-      xshape_dims[i + 1] = in_dims[i];
-    }
-    ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims));
-    ctx->ShareLoD("X", "XShape");
-  }
-
-  static std::vector<int32_t> GetOutputShape(const int start_axis,
-                                             const int stop_axis,
-                                             const framework::DDim &in_dims) {
-    int64_t outer = 1;
-    std::vector<int32_t> out_shape;
-    int in_dims_size = in_dims.size();
-    out_shape.reserve(in_dims_size - stop_axis + start_axis);
-    for (int i = 0; i < start_axis; ++i) {
-      out_shape.push_back(in_dims[i]);
-    }
-    for (int i = start_axis; i <= stop_axis; i++) {
-      if (in_dims[i] == -1 || outer == -1) {
-        outer = -1;
-      } else {
-        outer *= in_dims[i];
-      }
-    }
-    out_shape.push_back(outer);
-    for (int i = stop_axis + 1; i < in_dims_size; i++) {
-      out_shape.push_back(in_dims[i]);
-    }
-    return out_shape;
+    // Construct MetaTensor for InferMeta Func
+    using CompatMetaTensor = framework::CompatMetaTensor;
+    CompatMetaTensor x(ctx->GetInputVarPtrs("X")[0], ctx->IsRuntime());
+    CompatMetaTensor out(ctx->GetOutputVarPtrs("Out")[0], ctx->IsRuntime());
+    std::unique_ptr<CompatMetaTensor> xshape(nullptr);
+    if (ctx->HasOutput("XShape")) {
+      xshape = std::move(std::unique_ptr<CompatMetaTensor>(new CompatMetaTensor(
+          ctx->GetOutputVarPtrs("XShape")[0], ctx->IsRuntime())));
+    }
+    phi::FlattenWithXShapeInferMeta(x, start_axis, stop_axis, &out,
+                                    xshape.get());
   }
 };
@@ -487,30 +444,3 @@ REGISTER_OP_CPU_KERNEL(
     ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int>,
     ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
     ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    flatten_contiguous_range,
-    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    flatten_contiguous_range_grad,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
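The removed GetOutputShape helper is now subsumed by phi::FlattenWithXShapeInferMeta, but the shape rule itself is unchanged: dimensions in [start_axis, stop_axis] collapse into a single dimension (propagating -1 for unknown extents) and the remaining dimensions pass through. Restated as a standalone sketch for reference (this is our re-statement of the removed logic, not the phi implementation):

#include <cstdint>
#include <vector>

// Fold dims [start_axis, stop_axis] of in_dims into one dimension; negative
// axes are assumed to have been normalized already.
std::vector<int64_t> FlattenedShape(const std::vector<int64_t>& in_dims,
                                    int start_axis, int stop_axis) {
  std::vector<int64_t> out;
  int64_t outer = 1;
  for (int i = 0; i < start_axis; ++i) out.push_back(in_dims[i]);
  for (int i = start_axis; i <= stop_axis; ++i) {
    outer = (in_dims[i] == -1 || outer == -1) ? -1 : outer * in_dims[i];
  }
  out.push_back(outer);
  for (int i = stop_axis + 1; i < static_cast<int>(in_dims.size()); ++i) {
    out.push_back(in_dims[i]);
  }
  return out;
}
// e.g. FlattenedShape({2, 3, 4, 5}, 1, 2) -> {2, 12, 5}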
paddle/fluid/operators/flatten_op.cu.cc
@@ -47,34 +47,3 @@ REGISTER_OP_CUDA_KERNEL(
     ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int>,
     ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
     ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    flatten_contiguous_range,
-    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    flatten_contiguous_range_grad,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
paddle/fluid/operators/flatten_op.h
@@ -119,46 +119,5 @@ class Flatten2GradKernel : public framework::OpKernel<T> {
   }
 };
-
-template <typename DeviceContext, typename T>
-class FlattenContiguousRangeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *in = context.Input<framework::LoDTensor>("X");
-    auto *out = context.Output<framework::LoDTensor>("Out");
-    out->mutable_data(context.GetPlace(), in->type());
-    auto &start_axis = context.Attr<int>("start_axis");
-    auto &stop_axis = context.Attr<int>("stop_axis");
-    auto &dev_ctx = context.device_context<DeviceContext>();
-
-    // call new kernel
-    phi::FlattenKernel<T, typename paddle::framework::ConvertToPhiContext<
-                              DeviceContext>::TYPE>(
-        static_cast<const typename paddle::framework::ConvertToPhiContext<
-            DeviceContext>::TYPE &>(dev_ctx),
-        *in, start_axis, stop_axis, out);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class FlattenContiguousRangeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *d_x = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto *d_out =
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-    auto *xshape = ctx.Input<framework::LoDTensor>("XShape");
-
-    d_x->mutable_data(ctx.GetPlace(), d_out->type());
-    auto &dev_ctx = ctx.device_context<DeviceContext>();
-
-    // call new kernel
-    phi::FlattenGradKernel<T, typename paddle::framework::ConvertToPhiContext<
-                                  DeviceContext>::TYPE>(
-        static_cast<const typename paddle::framework::ConvertToPhiContext<
-            DeviceContext>::TYPE &>(dev_ctx),
-        *d_out, *xshape, d_x);
-  }
-};
 }  // namespace operators
 }  // namespace paddle
paddle/fluid/operators/flatten_op_xpu.cc
@@ -41,27 +41,4 @@ REGISTER_OP_XPU_KERNEL(
     ops::Flatten2GradKernel<paddle::platform::XPUDeviceContext, int>,
     ops::Flatten2GradKernel<paddle::platform::XPUDeviceContext, int8_t>,
     ops::Flatten2GradKernel<paddle::platform::XPUDeviceContext, int64_t>);
-REGISTER_OP_XPU_KERNEL(
-    flatten_contiguous_range,
-    ops::FlattenContiguousRangeKernel<paddle::platform::XPUDeviceContext, float>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::XPUDeviceContext, plat::float16>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::XPUDeviceContext, int>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::XPUDeviceContext, int8_t>,
-    ops::FlattenContiguousRangeKernel<paddle::platform::XPUDeviceContext, int64_t>);
-REGISTER_OP_XPU_KERNEL(
-    flatten_contiguous_range_grad,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::XPUDeviceContext, float>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::XPUDeviceContext, plat::float16>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::XPUDeviceContext, int>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::XPUDeviceContext, int8_t>,
-    ops::FlattenContiguousRangeGradKernel<paddle::platform::XPUDeviceContext, int64_t>);
 #endif
浏览文件 @
31363c3f
...
@@ -868,16 +868,22 @@ static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args,
...
@@ -868,16 +868,22 @@ static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args,
int64_t
hook_id
;
int64_t
hook_id
;
if
(
egr
::
egr_utils_api
::
IsLeafTensor
(
self
->
tensor
))
{
if
(
egr
::
egr_utils_api
::
IsLeafTensor
(
self
->
tensor
))
{
VLOG
(
6
)
<<
"Register hook for leaf tensor: "
<<
self
->
tensor
.
name
();
VLOG
(
6
)
<<
"Register hook for leaf tensor: "
<<
self
->
tensor
.
name
();
auto
autograd_meta
=
egr
::
EagerUtils
::
unsafe_autograd_meta
(
self
->
tensor
);
if
(
autograd_meta
&&
!
autograd_meta
->
StopGradient
())
{
if
(
!
autograd_meta
->
GetMutableGradNode
())
{
VLOG
(
6
)
<<
"Detected NULL grad_node, Leaf tensor should have had "
"grad_node with type: GradNodeAccumulation."
;
autograd_meta
->
SetGradNode
(
std
::
make_shared
<
egr
::
GradNodeAccumulation
>
(
autograd_meta
));
}
}
std
::
shared_ptr
<
egr
::
GradNodeBase
>
grad_node
=
std
::
shared_ptr
<
egr
::
GradNodeBase
>
grad_node
=
egr
::
EagerUtils
::
grad_node
(
self
->
tensor
);
egr
::
EagerUtils
::
grad_node
(
self
->
tensor
);
PADDLE_ENFORCE
(
grad_node
.
get
()
!=
nullptr
,
paddle
::
platform
::
errors
::
Fatal
(
"Detected NULL grad_node,"
"Leaf tensor should have had grad_node "
"with type: GradNodeAccumulation."
));
auto
rank_info
=
auto
rank_info
=
egr
::
EagerUtils
::
unsafe_autograd_meta
(
self
->
tensor
)
->
OutRankInfo
();
egr
::
EagerUtils
::
unsafe_autograd_meta
(
self
->
tensor
)
->
OutRankInfo
();
PyObject
*
hook_func
=
PyTuple_GET_ITEM
(
args
,
0
);
PyObject
*
hook_func
=
PyTuple_GET_ITEM
(
args
,
0
);
auto
accumulation_grad_node
=
auto
accumulation_grad_node
=
...
@@ -948,8 +954,8 @@ static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args,
...
@@ -948,8 +954,8 @@ static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL
EAGER_CATCH_AND_THROW_RETURN_NULL
}
}
static
PyObject
*
set_grad_type
(
TensorObject
*
self
,
PyObject
*
args
,
static
PyObject
*
tensor__
set_grad_type
(
TensorObject
*
self
,
PyObject
*
args
,
PyObject
*
kwargs
)
{
PyObject
*
kwargs
)
{
EAGER_TRY
EAGER_TRY
auto
var_type
=
pybind
::
CastPyArg2ProtoType
(
PyTuple_GET_ITEM
(
args
,
0
),
0
);
auto
var_type
=
pybind
::
CastPyArg2ProtoType
(
PyTuple_GET_ITEM
(
args
,
0
),
0
);
auto
grad_tensor
=
auto
grad_tensor
=
...
@@ -963,6 +969,42 @@ static PyObject* set_grad_type(TensorObject* self, PyObject* args,
...
@@ -963,6 +969,42 @@ static PyObject* set_grad_type(TensorObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL
EAGER_CATCH_AND_THROW_RETURN_NULL
}
}
static
PyObject
*
tensor__clear
(
TensorObject
*
self
,
PyObject
*
args
,
PyObject
*
kwargs
)
{
EAGER_TRY
self
->
tensor
.
reset
();
return
Py_None
;
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static
PyObject
*
tensor__copy_gradient_from
(
TensorObject
*
self
,
PyObject
*
args
,
PyObject
*
kwargs
)
{
EAGER_TRY
auto
src
=
CastPyArg2Tensor
(
PyTuple_GET_ITEM
(
args
,
0
),
0
);
if
(
self
->
tensor
.
is_initialized
())
{
PADDLE_ENFORCE_EQ
(
self
->
tensor
.
dtype
(),
src
.
dtype
(),
platform
::
errors
::
PreconditionNotMet
(
"Tensor %s has different data type with Tensor %s"
,
self
->
tensor
.
name
(),
src
.
name
()));
PADDLE_ENFORCE_EQ
(
self
->
tensor
.
impl
()
->
type_info
().
id
(),
src
.
impl
()
->
type_info
().
id
(),
platform
::
errors
::
PreconditionNotMet
(
"Tensor %s has different type with Tensor %s, Tensor "
"ShareGradientDataWith cannot be performed!"
,
self
->
tensor
.
name
(),
src
.
name
()));
}
VLOG
(
6
)
<<
"Tensor copy gradient from: "
<<
src
.
name
();
auto
*
p_grad
=
egr
::
EagerUtils
::
mutable_grad
(
self
->
tensor
);
if
(
p_grad
)
{
PADDLE_ENFORCE_EQ
(
src
.
initialized
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Tensor %s has not been initialized"
,
src
.
name
()));
p_grad
->
set_impl
(
src
.
impl
());
}
Py_INCREF
(
Py_None
);
return
Py_None
;
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static
PyObject
*
tensor_method_get_non_zero_indices
(
TensorObject
*
self
,
static
PyObject
*
tensor_method_get_non_zero_indices
(
TensorObject
*
self
,
PyObject
*
args
,
PyObject
*
args
,
PyObject
*
kwargs
)
{
PyObject
*
kwargs
)
{
...
@@ -1117,7 +1159,12 @@ PyMethodDef variable_methods[] = {
...
@@ -1117,7 +1159,12 @@ PyMethodDef variable_methods[] = {
{
"_register_backward_hook"
,
{
"_register_backward_hook"
,
(
PyCFunction
)(
void
(
*
)(
void
))
tensor_register_reduce_hook
,
(
PyCFunction
)(
void
(
*
)(
void
))
tensor_register_reduce_hook
,
METH_VARARGS
|
METH_KEYWORDS
,
NULL
},
METH_VARARGS
|
METH_KEYWORDS
,
NULL
},
{
"_set_grad_type"
,
(
PyCFunction
)(
void
(
*
)(
void
))
set_grad_type
,
{
"_set_grad_type"
,
(
PyCFunction
)(
void
(
*
)(
void
))
tensor__set_grad_type
,
METH_VARARGS
|
METH_KEYWORDS
,
NULL
},
{
"_clear"
,
(
PyCFunction
)(
void
(
*
)(
void
))
tensor__clear
,
METH_VARARGS
|
METH_KEYWORDS
,
NULL
},
{
"_copy_gradient_from"
,
(
PyCFunction
)(
void
(
*
)(
void
))
tensor__copy_gradient_from
,
METH_VARARGS
|
METH_KEYWORDS
,
NULL
},
METH_VARARGS
|
METH_KEYWORDS
,
NULL
},
/***the method of sparse tensor****/
/***the method of sparse tensor****/
{
"non_zero_indices"
,
{
"non_zero_indices"
,
...
...
paddle/fluid/pybind/imperative.cc
@@ -655,6 +655,7 @@ void BindImperative(py::module *m_ptr) {
            } else {
              act_name = name.cast<std::string>();
            }
+           VLOG(4) << "Init VarBase :" << act_name;
            new (&self) imperative::VarBase(act_name);
            self.SetPersistable(persistable);
            self.SetType(type);
...
paddle/fluid/pybind/pybind.cc
@@ -829,6 +829,8 @@ PYBIND11_MODULE(core_noavx, m) {
            [](const framework::Tensor &self) {
              return reinterpret_cast<uintptr_t>(self.data());
            })
+      .def("_slice", &framework::Tensor::Slice)
+      .def("_numel", &framework::Tensor::numel)
       .def("_is_initialized",
            [](const framework::Tensor &self) { return self.IsInitialized(); })
       .def("_get_dims",
...
paddle/phi/api/include/tensor.h
@@ -427,9 +427,7 @@ class PADDLE_API Tensor final {
    * @param blocking, Should we copy this in sync way.
    * @return void
    */
   void copy_(const Tensor& src,
-             const phi::Place& target_place, bool blocking);
+             const phi::Place& target_place, const bool blocking);

   /**
    * @brief Cast datatype from one to another
    *
...
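With the signature above, callers pass the target place and a const bool blocking flag alongside the source tensor. A minimal illustrative call site (the wrapper function and tensor names are ours; the tensor type is assumed to be paddle::experimental::Tensor as declared in this header):

#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/common/place.h"

// Illustrative only: synchronous copy of `src` into `dst` on CPU.
void CopyExample(paddle::experimental::Tensor* dst,
                 const paddle::experimental::Tensor& src) {
  dst->copy_(src, phi::CPUPlace(), /*blocking=*/true);
}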
paddle/phi/api/lib/tensor_method.cc
@@ -84,26 +84,26 @@ void Tensor::copy_(const Tensor &src,
   if (is_initialized()) {
     PADDLE_ENFORCE_EQ(dtype(),
                       src.dtype(),
-                      platform::errors::PreconditionNotMet(
+                      phi::errors::PreconditionNotMet(
                           "Tensor %s has different data type with Tensor %s, "
                           "Tensor Copy cannot be performed!",
                           name(),
                           src.name()));
     PADDLE_ENFORCE_EQ(impl()->type_info().id(),
                       src.impl()->type_info().id(),
-                      platform::errors::PreconditionNotMet(
+                      phi::errors::PreconditionNotMet(
                           "Tensor %s has different type with Tensor %s, Tensor "
                           "Copy cannot be performed!",
                           name(),
                           src.name()));
     PADDLE_ENFORCE_EQ(target_place,
                       inner_place(),
-                      platform::errors::PreconditionNotMet(
+                      phi::errors::PreconditionNotMet(
                           "Place is different of dst tensor and args %s, which "
                           "current tensor holds %s "
                           "Copy cannot be performed!",
-                          target_place.DebugString(),
-                          inner_place().DebugString()));
+                          target_place,
+                          inner_place()));
     kernel_key_set.backend_set =
         kernel_key_set.backend_set |
         BackendSet(phi::TransToPhiBackend(inner_place()));
@@ -177,7 +177,7 @@ void Tensor::copy_(const Tensor &src,
         blocking,
         static_cast<phi::SelectedRows *>(impl_.get()));
   } else {
-    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+    PADDLE_THROW(phi::errors::InvalidArgument(
         "We currently only support dense tensor copy for now and if u need to "
         "copy selected rows please raise a issue."));
   }
...
paddle/phi/infermeta/multiary.cc
View file @ 31363c3f
...
@@ -516,6 +516,215 @@ void ConcatInferMeta(const std::vector<MetaTensor*>& x,
   out->share_lod(*x.at(0));
 }
+
+inline int ConvOutputSize(
+    int input_size, int filter_size, int dilation, int padding, int stride) {
+  const int dkernel = dilation * (filter_size - 1) + 1;
+  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
+  PADDLE_ENFORCE_GT(
+      output_size,
+      0,
+      phi::errors::InvalidArgument(
+          "The output's size is expected to be greater than 0. "
+          "But recieved: output's size is %d. The output's size is computed by "
+          "((input_size + 2 * padding - (dilation * (filter_size - 1) + 1)) / "
+          "stride + 1), where input_size is %d, padding is %d, "
+          "filter_size is %d, dilation is %d, stride is %d.",
+          output_size, input_size, padding, filter_size, dilation, stride));
+  return output_size;
+}
+
+void DeformableConvInferMeta(const MetaTensor& x,
+                             const MetaTensor& offset,
+                             const MetaTensor& filter,
+                             paddle::optional<const MetaTensor&> mask,
+                             const std::vector<int>& strides,
+                             const std::vector<int>& paddings,
+                             const std::vector<int>& dilations,
+                             int deformable_groups,
+                             int groups,
+                             int im2col_step,
+                             MetaTensor* out,
+                             MetaConfig config) {
+  auto in_dims = x.dims();
+  auto offset_dims = offset.dims();
+  auto filter_dims = filter.dims();
+
+  PADDLE_ENFORCE_EQ(in_dims.size(), 4,
+                    phi::errors::InvalidArgument(
+                        "Conv input should be 4-D tensor, get %u",
+                        in_dims.size()));
+  PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
+                    phi::errors::InvalidArgument(
+                        "Conv input dimension and filter dimension should be "
+                        "the same. The difference is [%d]: [%d]",
+                        in_dims.size(), filter_dims.size()));
+  PADDLE_ENFORCE_EQ(in_dims.size() - strides.size(), 2U,
+                    phi::errors::InvalidArgument(
+                        "Conv input dimension and strides "
+                        "dimension should be consistent. But received input "
+                        "dimension:[%d], strides dimension:[%d]",
+                        in_dims.size(), strides.size()));
+  PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
+                    phi::errors::InvalidArgument(
+                        "Conv paddings dimension and Conv strides dimension "
+                        "should be the same. The difference is [%d]: [%d]",
+                        paddings.size(), strides.size()));
+  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups,
+                    phi::errors::InvalidArgument(
+                        "The number of input channels should be equal to filter "
+                        "channels * groups. The difference is [%d]: [%d]",
+                        in_dims[1], filter_dims[1] * groups));
+  PADDLE_ENFORCE_EQ(filter_dims[0] % groups, 0,
+                    phi::errors::InvalidArgument(
+                        "The number of output channels should be divided by groups. But "
+                        "received output channels:[%d], groups:[%d]",
+                        filter_dims[0], groups));
+  PADDLE_ENFORCE_EQ(filter_dims[0] % deformable_groups, 0,
+                    phi::errors::InvalidArgument(
+                        "The number of output channels should be "
+                        "divided by deformable groups. The difference is [%d]: [%d]",
+                        filter_dims[0] % groups, 0));
+
+  if (in_dims[0] > im2col_step) {
+    PADDLE_ENFORCE_EQ(in_dims[0] % im2col_step, 0U,
+                      phi::errors::InvalidArgument(
+                          "Input batchsize must be smaller than or divide im2col_step. But "
+                          "received Input batchsize:[%d], im2col_step:[%d]",
+                          in_dims[0], im2col_step));
+  }
+
+  for (size_t i = 0; i < strides.size(); ++i) {
+    PADDLE_ENFORCE_GT(strides[i], 0U,
+                      phi::errors::InvalidArgument("stride %d size incorrect", i));
+  }
+  for (size_t i = 0; i < dilations.size(); ++i) {
+    PADDLE_ENFORCE_GT(dilations[i], 0U,
+                      phi::errors::InvalidArgument("dilation %d size incorrect", i));
+  }
+
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  for (size_t i = 0; i < strides.size(); ++i) {
+    if (!config.is_runtime &&
+        (in_dims[i + 2] <= 0 || filter_dims[i + 2] <= 0)) {
+      output_shape.push_back(-1);
+    } else {
+      output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                            dilations[i], paddings[i],
+                                            strides[i]));
+    }
+  }
+
+  PADDLE_ENFORCE_EQ(output_shape[1] % deformable_groups, 0U,
+                    phi::errors::InvalidArgument(
+                        "output num_filter must divide deformable group size. But received "
+                        "output num_filter:[%d], deformable group size:[%d]",
+                        output_shape[1], deformable_groups));
+
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_EQ(output_shape[2], offset_dims[2],
+                      phi::errors::InvalidArgument(
+                          "output height must equal to offset map height. "
+                          "The difference is [%d]: [%d]",
+                          output_shape[2], offset_dims[2]));
+    PADDLE_ENFORCE_EQ(output_shape[3], offset_dims[3],
+                      phi::errors::InvalidArgument(
+                          "output width must equal to offset map width. The "
+                          "difference is [%d]: [%d]",
+                          output_shape[3], offset_dims[3]));
+    PADDLE_ENFORCE_EQ(offset_dims[1] % (filter_dims[2] * filter_dims[3]), 0U,
+                      phi::errors::InvalidArgument(
+                          "offset filter must divide deformable group size. "
+                          "But received [%d]: [%d]",
+                          offset_dims[1], filter_dims[2] * filter_dims[3]));
+    PADDLE_ENFORCE_EQ(offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
+                      deformable_groups,
+                      phi::errors::InvalidArgument(
+                          "offset filter must divide deformable group size. But received "
+                          "[%d]: [%d]",
+                          offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
+                          deformable_groups));
+
+    if (mask) {
+      auto mask_dims = mask->dims();
+      PADDLE_ENFORCE_EQ(output_shape[2], mask_dims[2],
+                        phi::errors::InvalidArgument(
+                            "output height must equal to mask map height. The "
+                            "difference is [%d] vs [%d]",
+                            output_shape[2], mask_dims[2]));
+      PADDLE_ENFORCE_EQ(output_shape[3], mask_dims[3],
+                        phi::errors::InvalidArgument(
+                            "output width must equal to mask map width. The "
+                            "difference is [%d] vs [%d]",
+                            output_shape[3], mask_dims[3]));
+      PADDLE_ENFORCE_EQ(mask_dims[1] % (filter_dims[2] * filter_dims[3]), 0U,
+                        phi::errors::InvalidArgument(
+                            "mask filter must divide deformable group size. "
+                            "But received [%d]: [%d]",
+                            mask_dims[1], filter_dims[2] * filter_dims[3]));
+      PADDLE_ENFORCE_EQ(mask_dims[1] / (filter_dims[2] * filter_dims[3]),
+                        deformable_groups,
+                        phi::errors::InvalidArgument(
+                            "mask filter must divide deformable group size. "
+                            "But received [%d]: [%d]",
+                            mask_dims[1] / (filter_dims[2] * filter_dims[3]),
+                            deformable_groups));
+    }
+  }
+
+  out->set_dims(phi::make_ddim(output_shape));
+  out->set_dtype(x.dtype());
+}
+
 void HierarchicalSigmoidInferMeta(const MetaTensor& x,
                                   const MetaTensor& w,
                                   const MetaTensor& label,
...
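For reference, the output-size formula used by the ConvOutputSize helper above can be checked in isolation. The standalone sketch below is not part of the patch; the sample sizes are made up purely for illustration.

#include <cassert>

// Same arithmetic as phi's ConvOutputSize: effective kernel extent with
// dilation, then a strided sliding-window count over the padded input.
int conv_output_size(int input_size, int filter_size, int dilation,
                     int padding, int stride) {
  const int dkernel = dilation * (filter_size - 1) + 1;
  return (input_size + 2 * padding - dkernel) / stride + 1;
}

int main() {
  // 32x32 input, 3x3 filter, dilation 1, padding 1, stride 1 -> 32 ("same" size).
  assert(conv_output_size(32, 3, 1, 1, 1) == 32);
  // Stride 2 roughly halves the spatial size: (32 + 2 - 3) / 2 + 1 = 16.
  assert(conv_output_size(32, 3, 1, 1, 2) == 16);
  return 0;
}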
paddle/phi/infermeta/multiary.h
View file @ 31363c3f
...
@@ -120,6 +120,19 @@ void ConcatInferMeta(const std::vector<MetaTensor*>& x,
                      MetaTensor* out,
                      MetaConfig config = MetaConfig());
+
+void DeformableConvInferMeta(const MetaTensor& x,
+                             const MetaTensor& offset,
+                             const MetaTensor& filter,
+                             paddle::optional<const MetaTensor&> mask,
+                             const std::vector<int>& strides,
+                             const std::vector<int>& paddings,
+                             const std::vector<int>& dilations,
+                             int deformable_groups,
+                             int groups,
+                             int im2col_step,
+                             MetaTensor* out,
+                             MetaConfig config = MetaConfig());
+
 void HierarchicalSigmoidInferMeta(const MetaTensor& x,
                                   const MetaTensor& w,
                                   const MetaTensor& label,
...
paddle/phi/infermeta/unary.cc
View file @ 31363c3f
...
@@ -352,6 +352,14 @@ void FlattenInferMeta(const MetaTensor& x,
                       int start_axis,
                       int stop_axis,
                       MetaTensor* out) {
+  FlattenWithXShapeInferMeta(x, start_axis, stop_axis, out, nullptr);
+}
+
+void FlattenWithXShapeInferMeta(const MetaTensor& x,
+                                int start_axis,
+                                int stop_axis,
+                                MetaTensor* out,
+                                MetaTensor* xshape) {
   auto x_dims = x.dims();
   int in_dims_size = x_dims.size();
   if (start_axis < 0) {
...
@@ -394,6 +402,14 @@ void FlattenInferMeta(const MetaTensor& x,
     // are the same.
     out->share_lod(x);
   }
+  if (xshape == nullptr) return;
+  std::vector<int64_t> xshape_dims(x_dims.size() + 1);
+  xshape_dims[0] = 0;
+  for (int i = 0; i < x_dims.size(); ++i) {
+    xshape_dims[i + 1] = x_dims[i];
+  }
+  xshape->set_dims(phi::make_ddim(xshape_dims));
+  xshape->share_lod(x);
 }

 void GumbelSoftmaxInferMeta(const MetaTensor& x,
...
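As the new FlattenWithXShapeInferMeta shows, xshape just records a leading 0 followed by the original dims so the gradient kernel can recover the input shape later. A minimal sketch of that bookkeeping (the sample shape is hypothetical):

#include <cassert>
#include <cstdint>
#include <vector>

// Mirror of the xshape construction in FlattenWithXShapeInferMeta:
// xshape = [0, d0, d1, ..., d(n-1)] for an input of rank n.
std::vector<int64_t> make_xshape(const std::vector<int64_t>& x_dims) {
  std::vector<int64_t> xshape_dims(x_dims.size() + 1);
  xshape_dims[0] = 0;
  for (size_t i = 0; i < x_dims.size(); ++i) {
    xshape_dims[i + 1] = x_dims[i];
  }
  return xshape_dims;
}

int main() {
  // A [2, 3, 4] input is recorded as [0, 2, 3, 4].
  assert(make_xshape({2, 3, 4}) == (std::vector<int64_t>{0, 2, 3, 4}));
  return 0;
}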
paddle/phi/infermeta/unary.h
View file @ 31363c3f
...
@@ -86,6 +86,12 @@ void FlattenInferMeta(const MetaTensor& x,
                       int stop_axis,
                       MetaTensor* out);
+
+void FlattenWithXShapeInferMeta(const MetaTensor& x,
+                                int start_axis,
+                                int stop_axis,
+                                MetaTensor* out,
+                                MetaTensor* xshape);
+
 void GumbelSoftmaxInferMeta(const MetaTensor& x,
                             float temperature,
                             bool hard,
...
paddle/phi/kernels/CMakeLists.txt
View file @ 31363c3f
...
@@ -27,12 +27,14 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel)
 # Some kernels depend on some targets that are not commonly used.
 # These targets are not suitable for common dependencies.
 # In this case, you need to manually generate them here.
-set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel
+set(MANUAL_BUILD_KERNELS deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel
   hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel
   matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel
   put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel
   softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel
   triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel)
+kernel_library(deformable_conv_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor)
+kernel_library(deformable_conv_grad_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor)
 kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function)
 kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code)
 kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code)
...
paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
0 → 100644
View file @ 31363c3f
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/deformable_conv_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h"

namespace phi {

template <typename T>
inline void ModulatedDeformableCol2imCPUKernel(
    const int num_kernels, const T* data_col, const T* data_offset,
    const T* data_mask, const int channels, const int height, const int width,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int channel_per_deformable_group,
    const int batch_size, const int deformable_group, const int height_col,
    const int width_col, T* grad_im) {
  for (int thread = 0; thread < num_kernels; thread++) {
    const int j = (thread / width_col / height_col / batch_size) % kernel_w;
    const int i =
        (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
    const int c =
        thread / width_col / height_col / batch_size / kernel_w / kernel_h;
    const int deformable_group_index = c / channel_per_deformable_group;

    int w_out = thread % width_col;
    int h_out = (thread / width_col) % height_col;
    int b = (thread / width_col / height_col) % batch_size;
    int w_in = w_out * stride_w - pad_w;
    int h_in = h_out * stride_h - pad_h;

    const T* data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const int data_offset_h_ptr =
        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
    const int data_offset_w_ptr =
        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
    const int data_mask_hw_ptr =
        ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
    const T offset_h = data_offset_ptr[data_offset_h_ptr];
    const T offset_w = data_offset_ptr[data_offset_w_ptr];
    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;

    T cur_top_grad = data_col[thread];
    if (data_mask) {
      const T* data_mask_ptr =
          data_mask + (b * deformable_group + deformable_group_index) *
                          kernel_h * kernel_w * height_col * width_col;
      const T mask = data_mask_ptr[data_mask_hw_ptr];
      cur_top_grad *= mask;
    }

    const int cur_h = static_cast<int>(cur_inv_h_data);
    const int cur_w = static_cast<int>(cur_inv_w_data);
    for (int dy = -2; dy <= 2; dy++) {
      for (int dx = -2; dx <= 2; dx++) {
        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
          int cur_bottom_grad_pos =
              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
          T weight = DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data,
                                           cur_h + dy, cur_w + dx, height,
                                           width);
          *(grad_im + cur_bottom_grad_pos) =
              *(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad;
        }
      }
    }
  }
}

template <typename T, typename Context>
void ModulatedDeformableCol2im(const Context& dev_ctx, const T* data_col,
                               const T* data_offset, const T* data_mask,
                               const std::vector<int64_t>& im_shape,
                               const std::vector<int64_t>& col_shape,
                               const std::vector<int64_t>& kernel_shape,
                               const std::vector<int>& pad,
                               const std::vector<int>& stride,
                               const std::vector<int>& dilation,
                               const int deformable_group, T* grad_im) {
  int channel_per_deformable_group = im_shape[0] / deformable_group;
  int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];

  ModulatedDeformableCol2imCPUKernel(
      num_kernels, data_col, data_offset, data_mask, im_shape[0], im_shape[1],
      im_shape[2], kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0],
      stride[1], dilation[0], dilation[1], channel_per_deformable_group,
      col_shape[1], deformable_group, col_shape[2], col_shape[3], grad_im);
}

template <typename T>
void ModulatedDeformableCol2imCoordCPUKernel(
    const int num_kernels, const T* data_col, const T* data_im,
    const T* data_offset, const T* data_mask, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int offset_channels, const int deformable_group,
    const int height_col, const int width_col, T* grad_offset, T* grad_mask) {
  for (int i = 0; i < num_kernels; i++) {
    T val = 0, mval = 0;
    const int w = i % width_col;
    const int h = (i / width_col) % height_col;
    const int c = (i / width_col / height_col) % offset_channels;
    const int b = (i / width_col / height_col) / offset_channels;

    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    const int col_step = kernel_h * kernel_w;
    int cnt = 0;
    const T* data_col_ptr =
        data_col + deformable_group_index * channel_per_deformable_group *
                       batch_size * width_col * height_col;
    const T* data_im_ptr =
        data_im + (b * deformable_group + deformable_group_index) *
                      channel_per_deformable_group / kernel_h / kernel_w *
                      height * width;
    const T* data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T* data_mask_ptr =
        data_mask
            ? data_mask + (b * deformable_group + deformable_group_index) *
                              kernel_h * kernel_w * height_col * width_col
            : nullptr;

    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
         col_c += col_step) {
      const int col_pos =
          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      const int bp_dir = offset_c % 2;

      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i =
          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr =
          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr =
          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
           w_out);
      const T offset_h = data_offset_ptr[data_offset_h_ptr];
      const T offset_w = data_offset_ptr[data_offset_w_ptr];
      T inv_h = h_in + i * dilation_h + offset_h;
      T inv_w = w_in + j * dilation_w + offset_w;
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
        inv_h = inv_w = -2;
      } else {
        mval += data_col_ptr[col_pos] *
                funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width,
                                          width, height, width, inv_h, inv_w);
      }
      const T weight = DmcnGetCoordinateWeight(
          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
          width, bp_dir);
      if (data_mask_ptr) {
        const int data_mask_hw_ptr =
            (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
        const T mask = data_mask_ptr[data_mask_hw_ptr];
        val += weight * data_col_ptr[col_pos] * mask;
      } else {
        val += weight * data_col_ptr[col_pos];
      }
      cnt += 1;
    }
    grad_offset[i] = val;
    if (grad_mask && offset_c % 2 == 0)
      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
                      kernel_w +
                  offset_c / 2) *
                     height_col +
                 h) *
                    width_col +
                w] = mval;
  }
}

template <typename T, typename Context>
void ModulatedDeformableCol2imCoord(
    const Context& dev_ctx, const T* data_col, const T* data_im,
    const T* data_offset, const T* data_mask,
    const std::vector<int64_t>& im_shape,
    const std::vector<int64_t>& col_shape,
    const std::vector<int64_t>& kernel_shape, const std::vector<int>& paddings,
    const std::vector<int>& strides, const std::vector<int>& dilations,
    const int deformable_groups, T* grad_offset, T* grad_mask) {
  int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
                    col_shape[2] * col_shape[3] * deformable_groups;
  int channel_per_deformable_group = col_shape[0] / deformable_groups;

  ModulatedDeformableCol2imCoordCPUKernel(
      num_kernels, data_col, data_im, data_offset, data_mask, im_shape[0],
      im_shape[1], im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0],
      paddings[1], strides[0], strides[1], dilations[0], dilations[1],
      channel_per_deformable_group, col_shape[1],
      2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
      deformable_groups, col_shape[2], col_shape[3], grad_offset, grad_mask);
}

template <typename T, typename Context>
void FilterGradAddup(const Context& dev_ctx, const int nthreads, const int n,
                     const int height, const int width, const T* dweight_3d,
                     T* filter_grad) {
  for (int i = 0; i < nthreads; i++) {
    filter_grad[i] = filter_grad[i] + dweight_3d[i];
  }
}

}  // namespace phi

PD_REGISTER_KERNEL(deformable_conv_grad,
                   CPU,
                   ALL_LAYOUT,
                   phi::DeformableConvGradKernel,
                   float,
                   double) {}
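The scatter loop in ModulatedDeformableCol2imCPUKernel first recovers (kernel position, channel, batch, output location) from a flat column index before accumulating into grad_im. The standalone sketch below, with made-up sizes, only checks that this decomposition inverts the column layout used when the im2col buffer was built; it is not part of the patch.

#include <cassert>

int main() {
  // Hypothetical sizes: 2x2 kernel, batch of 2, 4x5 output columns.
  const int kernel_h = 2, kernel_w = 2;
  const int batch_size = 2, height_col = 4, width_col = 5;

  // Flat layout of the col buffer:
  // thread = ((((c * kernel_h + i) * kernel_w + j) * batch_size + b)
  //           * height_col + h_out) * width_col + w_out
  const int c = 1, i = 1, j = 0, b = 1, h_out = 2, w_out = 3;
  const int thread =
      ((((c * kernel_h + i) * kernel_w + j) * batch_size + b) * height_col +
       h_out) * width_col + w_out;

  // Same decomposition as the CPU col2im kernel above.
  assert(thread % width_col == w_out);
  assert((thread / width_col) % height_col == h_out);
  assert((thread / width_col / height_col) % batch_size == b);
  assert((thread / width_col / height_col / batch_size) % kernel_w == j);
  assert((thread / width_col / height_col / batch_size / kernel_w) % kernel_h == i);
  assert(thread / width_col / height_col / batch_size / kernel_w / kernel_h == c);
  return 0;
}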
paddle/phi/kernels/cpu/deformable_conv_kernel.cc
View file @ 31363c3f
...
@@ -18,126 +18,6 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h"
-
-namespace phi {
-
-template <typename T>
-inline void ModulatedDeformableIm2colCPUKernel(
-    const int num_kernels, const T* data_im, const T* data_offset,
-    const T* data_mask, const int height, const int width, const int kernel_h,
-    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, const int dilation_h, const int dilation_w,
-    const int channel_per_deformable_group, const int batch_size,
-    const int num_channels, const int deformable_group, const int height_col,
-    const int width_col, T* data_col) {
-  for (int i = 0; i < num_kernels; i++) {
-    const int w_col = i % width_col;
-    const int h_col = (i / width_col) % height_col;
-    const int b_col = (i / width_col) / height_col % batch_size;
-    const int c_im = (i / width_col / height_col) / batch_size;
-    const int c_col = c_im * kernel_h * kernel_w;
-    const int deformable_group_index = c_im / channel_per_deformable_group;
-    const int h_in = h_col * stride_h - pad_h;
-    const int w_in = w_col * stride_w - pad_w;
-
-    T* data_col_ptr =
-        data_col +
-        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
-    const T* data_im_ptr =
-        data_im + (b_col * num_channels + c_im) * height * width;
-    const T* data_offset_ptr =
-        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
-                          kernel_h * kernel_w * height_col * width_col;
-    const T* data_mask_ptr =
-        data_mask + (b_col * deformable_group + deformable_group_index) *
-                        kernel_h * kernel_w * height_col * width_col;
-
-    for (int i = 0; i < kernel_h; ++i) {
-      for (int j = 0; j < kernel_w; ++j) {
-        const int data_offset_h_ptr =
-            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
-        const int data_offset_w_ptr =
-            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
-            w_col;
-        const int data_mask_hw_ptr =
-            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
-
-        const T offset_h = data_offset_ptr[data_offset_h_ptr];
-        const T offset_w = data_offset_ptr[data_offset_w_ptr];
-        const T mask = data_mask_ptr[data_mask_hw_ptr];
-        T val = static_cast<T>(0);
-        const T h_im = h_in + i * dilation_h + offset_h;
-        const T w_im = w_in + j * dilation_w + offset_w;
-        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
-          val = DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
-        }
-        *data_col_ptr = val * mask;
-        data_col_ptr += batch_size * height_col * width_col;
-      }
-    }
-  }
-}
-
-template <typename T, typename Context>
-void ModulatedDeformableIm2col(const Context& dev_ctx, const T* data_im,
-                               const T* data_offset, const T* data_mask,
-                               const std::vector<int64_t>& im_shape,
-                               const std::vector<int64_t>& col_shape,
-                               const std::vector<int64_t>& filter_shape,
-                               const std::vector<int>& paddings,
-                               const std::vector<int>& strides,
-                               const std::vector<int>& dilations,
-                               const int deformable_groups, T* data_col) {
-  int channel_per_deformable_group = im_shape[0] / deformable_groups;
-  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
-
-  // get outputs of im2col with offset by bilinear interpolation
-  ModulatedDeformableIm2colCPUKernel(
-      num_kernels, data_im, data_offset, data_mask, im_shape[1], im_shape[2],
-      filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
-      strides[1], dilations[0], dilations[1], channel_per_deformable_group,
-      col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
-      data_col);
-}
-
-}  // namespace phi
-
 PD_REGISTER_KERNEL(deformable_conv,
                    CPU,
                    ALL_LAYOUT,
...
paddle/phi/kernels/deformable_conv_grad_kernel.h
0 → 100644
View file @ 31363c3f
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/core/dense_tensor.h"

namespace phi {

template <typename T, typename Context>
void DeformableConvGradKernel(const Context& dev_ctx,
                              const DenseTensor& x,
                              const DenseTensor& offset,
                              const DenseTensor& filter,
                              paddle::optional<const DenseTensor&> mask,
                              const DenseTensor& out_grad,
                              const std::vector<int>& strides,
                              const std::vector<int>& paddings,
                              const std::vector<int>& dilations,
                              int deformable_groups,
                              int groups,
                              int im2col_step,
                              DenseTensor* dx,
                              DenseTensor* offset_grad,
                              DenseTensor* filter_grad,
                              DenseTensor* mask_grad);

}  // namespace phi
paddle/phi/kernels/deformable_conv_kernel.h
View file @ 31363c3f
...
@@ -15,6 +15,7 @@
 #pragma once

 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/utils/optional.h"

 namespace phi {
...
@@ -23,7 +24,7 @@ void DeformableConvKernel(const Context& dev_ctx,
                           const DenseTensor& x,
                           const DenseTensor& offset,
                           const DenseTensor& filter,
-                          const DenseTensor& mask,
+                          paddle::optional<const DenseTensor&> mask,
                           const std::vector<int>& strides,
                           const std::vector<int>& paddings,
                           const std::vector<int>& dilations,
...
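With this change the mask becomes an optional input of the kernel, and the device helpers receive a null data_mask pointer when no mask is passed (deformable_conv v1 versus the modulated v2 path). A rough standalone illustration of that pattern, written with std::optional purely as a sketch rather than Paddle's own paddle::optional:

#include <iostream>
#include <optional>
#include <vector>

// Sketch only: a stand-in for "apply the mask when it is present".
// In the real kernels this shows up as `data_mask ? ... : nullptr` checks.
float modulate(float value, const float* mask_ptr, int idx) {
  return mask_ptr ? value * mask_ptr[idx] : value;
}

int main() {
  std::vector<float> col = {1.0f, 2.0f, 3.0f};
  std::optional<std::vector<float>> mask = std::vector<float>{0.5f, 1.0f, 0.0f};

  // When mask is std::nullopt, mask_ptr is null and values pass through unchanged.
  const float* mask_ptr = mask ? mask->data() : nullptr;
  for (int i = 0; i < 3; ++i) {
    std::cout << modulate(col[i], mask_ptr, i) << " ";  // prints: 0.5 2 0
  }
  std::cout << "\n";
  return 0;
}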
paddle/phi/kernels/flatten_grad_kernel.cc
View file @ 31363c3f
...
@@ -25,6 +25,7 @@ void FlattenGradKernel(const Context& dev_ctx,
                        const DenseTensor& xshape,
                        DenseTensor* x_grad) {
   auto xshape_dims = xshape.dims();
+  dev_ctx.Alloc(x_grad, out_grad.dtype());
   auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
   phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
   x_grad->Resize(x_dims);
...
paddle/phi/kernels/flatten_kernel.cc
View file @ 31363c3f
...
@@ -27,6 +27,7 @@ void FlattenKernel(const Context& dev_ctx,
                    int start_axis,
                    int stop_axis,
                    DenseTensor* out) {
+  dev_ctx.Alloc(out, x.dtype());
   auto out_dims = out->dims();
   phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
   out->Resize(out_dims);
...
@@ -43,7 +44,6 @@ void FlattenWithXShape(const Context& dev_ctx,
                        DenseTensor* out,
                        DenseTensor* xshape) {
   FlattenKernel<T, Context>(dev_ctx, x, start_axis, stop_axis, out);
-  funcs::SetXShape(x, xshape);
 }

 }  // namespace phi
...
paddle/phi/kernels/funcs/CMakeLists.txt
View file @ 31363c3f
...
@@ -3,6 +3,7 @@ add_subdirectory(blas)
 add_subdirectory(lapack)
 add_subdirectory(detail)

+math_library(deformable_conv_functor DEPS dense_tensor)
 math_library(concat_and_split_functor DEPS dense_tensor)
 math_library(gru_compute DEPS activation_functions math_function)
 math_library(lstm_compute DEPS activation_functions)
...
paddle/phi/kernels/funcs/deformable_conv_functor.cc
0 → 100644
View file @ 31363c3f
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
#include "paddle/phi/backends/cpu/cpu_context.h"

namespace phi {
namespace funcs {

template <typename T>
inline void ModulatedDeformableIm2colCPUKernel(
    const int num_kernels, const T* data_im, const T* data_offset,
    const T* data_mask, const int height, const int width, const int kernel_h,
    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col,
    const int width_col, T* data_col) {
  for (int i = 0; i < num_kernels; i++) {
    const int w_col = i % width_col;
    const int h_col = (i / width_col) % height_col;
    const int b_col = (i / width_col) / height_col % batch_size;
    const int c_im = (i / width_col / height_col) / batch_size;
    const int c_col = c_im * kernel_h * kernel_w;
    const int deformable_group_index = c_im / channel_per_deformable_group;
    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    T* data_col_ptr =
        data_col +
        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
    const T* data_im_ptr =
        data_im + (b_col * num_channels + c_im) * height * width;
    const T* data_offset_ptr =
        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T* data_mask_ptr =
        data_mask
            ? data_mask + (b_col * deformable_group + deformable_group_index) *
                              kernel_h * kernel_w * height_col * width_col
            : nullptr;

    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        const int data_offset_h_ptr =
            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
        const int data_offset_w_ptr =
            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
            w_col;
        const T offset_h = data_offset_ptr[data_offset_h_ptr];
        const T offset_w = data_offset_ptr[data_offset_w_ptr];
        T val = static_cast<T>(0);
        const T h_im = h_in + i * dilation_h + offset_h;
        const T w_im = w_in + j * dilation_w + offset_w;
        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
          val = DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
        }
        *data_col_ptr = val;
        if (data_mask_ptr) {
          const int data_mask_hw_ptr =
              ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
          const T mask = data_mask_ptr[data_mask_hw_ptr];
          *data_col_ptr *= mask;
        }
        data_col_ptr += batch_size * height_col * width_col;
      }
    }
  }
}

template <typename T, typename Context>
void ModulatedDeformableIm2col(const Context& dev_ctx, const T* data_im,
                               const T* data_offset, const T* data_mask,
                               const std::vector<int64_t>& im_shape,
                               const std::vector<int64_t>& col_shape,
                               const std::vector<int64_t>& filter_shape,
                               const std::vector<int>& paddings,
                               const std::vector<int>& strides,
                               const std::vector<int>& dilations,
                               const int deformable_groups, T* data_col) {
  int channel_per_deformable_group = im_shape[0] / deformable_groups;
  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];

  // get outputs of im2col with offset by bilinear interpolation
  ModulatedDeformableIm2colCPUKernel(
      num_kernels, data_im, data_offset, data_mask, im_shape[1], im_shape[2],
      filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
      strides[1], dilations[0], dilations[1], channel_per_deformable_group,
      col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
      data_col);
}

template void ModulatedDeformableIm2col(
    const phi::CPUContext& dev_ctx, const float* data_im,
    const float* data_offset, const float* data_mask,
    const std::vector<int64_t>& im_shape, const std::vector<int64_t>& col_shape,
    const std::vector<int64_t>& filter_shape, const std::vector<int>& paddings,
    const std::vector<int>& strides, const std::vector<int>& dilations,
    const int deformable_groups, float* data_col);

template void ModulatedDeformableIm2col(
    const phi::CPUContext& dev_ctx, const double* data_im,
    const double* data_offset, const double* data_mask,
    const std::vector<int64_t>& im_shape, const std::vector<int64_t>& col_shape,
    const std::vector<int64_t>& filter_shape, const std::vector<int>& paddings,
    const std::vector<int>& strides, const std::vector<int>& dilations,
    const int deformable_groups, double* data_col);

}  // namespace funcs
}  // namespace phi
paddle/phi/kernels/funcs/deformable_conv_functor.cu
0 → 100644
View file @ 31363c3f
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

namespace phi {
namespace funcs {

static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;

static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaximumNumBlocks);
}

template <typename T>
__global__ void ModulatedDeformableIm2colGpuKernel(
    const int nthreads, const T* data_im, const T* data_offset,
    const T* data_mask, const int height, const int width, const int kernel_h,
    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col,
    const int width_col, T* data_col) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (size_t i = index; i < nthreads; i += offset) {
    const int w_col = i % width_col;
    const int h_col = (i / width_col) % height_col;
    const int b_col = (i / width_col) / height_col % batch_size;
    const int c_im = (i / width_col / height_col) / batch_size;
    const int c_col = c_im * kernel_h * kernel_w;
    const int deformable_group_index = c_im / channel_per_deformable_group;
    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    T* data_col_ptr =
        data_col +
        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
    const T* data_im_ptr =
        data_im + (b_col * num_channels + c_im) * height * width;
    const T* data_offset_ptr =
        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T* data_mask_ptr =
        data_mask
            ? data_mask + (b_col * deformable_group + deformable_group_index) *
                              kernel_h * kernel_w * height_col * width_col
            : nullptr;

    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        const int data_offset_h_ptr =
            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
        const int data_offset_w_ptr =
            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
            w_col;
        const T offset_h = data_offset_ptr[data_offset_h_ptr];
        const T offset_w = data_offset_ptr[data_offset_w_ptr];
        T val = static_cast<T>(0);
        const T h_im = h_in + i * dilation_h + offset_h;
        const T w_im = w_in + j * dilation_w + offset_w;
        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
          val = DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
        }
        *data_col_ptr = val;
        if (data_mask_ptr) {
          const int data_mask_hw_ptr =
              ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
          const T mask = data_mask_ptr[data_mask_hw_ptr];
          *data_col_ptr *= mask;
        }
        data_col_ptr += batch_size * height_col * width_col;
      }
    }
  }
}

template <typename T, typename Context>
void ModulatedDeformableIm2col(const Context& dev_ctx, const T* data_im,
                               const T* data_offset, const T* data_mask,
                               const std::vector<int64_t>& im_shape,
                               const std::vector<int64_t>& col_shape,
                               const std::vector<int64_t>& filter_shape,
                               const std::vector<int>& paddings,
                               const std::vector<int>& strides,
                               const std::vector<int>& dilations,
                               const int deformable_groups, T* data_col) {
  int channel_per_deformable_group = im_shape[0] / deformable_groups;
  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];

  int blocks = NumBlocks(num_kernels);
  int threads = kNumCUDAThreads;
  ModulatedDeformableIm2colGpuKernel<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
      num_kernels, data_im, data_offset, data_mask, im_shape[1], im_shape[2],
      filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
      strides[1], dilations[0], dilations[1], channel_per_deformable_group,
      col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
      data_col);
}

template void ModulatedDeformableIm2col(
    const phi::GPUContext& dev_ctx, const float* data_im,
    const float* data_offset, const float* data_mask,
    const std::vector<int64_t>& im_shape, const std::vector<int64_t>& col_shape,
    const std::vector<int64_t>& filter_shape, const std::vector<int>& paddings,
    const std::vector<int>& strides, const std::vector<int>& dilations,
    const int deformable_groups, float* data_col);

template void ModulatedDeformableIm2col(
    const phi::GPUContext& dev_ctx, const double* data_im,
    const double* data_offset, const double* data_mask,
    const std::vector<int64_t>& im_shape, const std::vector<int64_t>& col_shape,
    const std::vector<int64_t>& filter_shape, const std::vector<int>& paddings,
    const std::vector<int>& strides, const std::vector<int>& dilations,
    const int deformable_groups, double* data_col);

}  // namespace funcs
}  // namespace phi
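The launch configuration above follows the usual grid-stride pattern: NumBlocks rounds the element count up to a multiple of 512 threads and caps the grid at 4096 blocks, while the kernels loop with stride blockDim.x * gridDim.x so a capped grid still covers every element. A quick host-side check of that block arithmetic (the sample counts are illustrative only):

#include <algorithm>
#include <cassert>

static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;

// Same block-count computation as the CUDA functor above.
static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaximumNumBlocks);
}

int main() {
  assert(NumBlocks(1) == 1);        // at least one block for any non-zero work
  assert(NumBlocks(512) == 1);      // exactly one full block
  assert(NumBlocks(513) == 2);      // round up to cover the remainder
  assert(NumBlocks(10000000) == kNumMaximumNumBlocks);  // capped grid
  return 0;
}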
paddle/phi/kernels/funcs/deformable_conv_functor.h
0 → 100644
View file @ 31363c3f
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/core/dense_tensor.h"

namespace phi {
namespace funcs {

template <typename T>
HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, const int data_width,
                                const int height, const int width, T h, T w) {
  int h_low = floor(h);
  int w_low = floor(w);
  int h_high = h_low + 1;
  int w_high = w_low + 1;

  T lh = h - h_low;
  T lw = w - w_low;
  T hh = 1 - lh;
  T hw = 1 - lw;

  T v1 = (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0;
  T v2 = (h_low >= 0 && w_high <= width - 1)
             ? bottom_data[h_low * data_width + w_high]
             : 0;
  T v3 = (h_high <= height - 1 && w_low >= 0)
             ? bottom_data[h_high * data_width + w_low]
             : 0;
  T v4 = (h_high <= height - 1 && w_high <= width - 1)
             ? bottom_data[h_high * data_width + w_high]
             : 0;

  T w1 = hh * hw;
  T w2 = hh * lw;
  T w3 = lh * hw;
  T w4 = lh * lw;

  return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
}

template <typename T, typename Context>
void ModulatedDeformableIm2col(const Context& dev_ctx, const T* data_im,
                               const T* data_offset, const T* data_mask,
                               const std::vector<int64_t>& im_shape,
                               const std::vector<int64_t>& col_shape,
                               const std::vector<int64_t>& filter_shape,
                               const std::vector<int>& paddings,
                               const std::vector<int>& strides,
                               const std::vector<int>& dilations,
                               const int deformable_groups, T* data_col);

}  // namespace funcs
}  // namespace phi
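DmcnIm2colBilinear above is plain bilinear interpolation over the four integer neighbours of a sampling point, with out-of-range corners treated as zero. The small host-side sketch below evaluates the same weighting on a hypothetical 2x2 patch; it only restates the arithmetic and is not part of the patch.

#include <cassert>
#include <cmath>

// Host-side copy of the bilinear weighting used by DmcnIm2colBilinear,
// without the out-of-bounds handling (the sample point here is interior).
float bilinear(const float* data, int data_width, float h, float w) {
  const int h_low = static_cast<int>(std::floor(h));
  const int w_low = static_cast<int>(std::floor(w));
  const float lh = h - h_low, lw = w - w_low;
  const float hh = 1 - lh, hw = 1 - lw;
  const float v1 = data[h_low * data_width + w_low];
  const float v2 = data[h_low * data_width + w_low + 1];
  const float v3 = data[(h_low + 1) * data_width + w_low];
  const float v4 = data[(h_low + 1) * data_width + w_low + 1];
  return hh * hw * v1 + hh * lw * v2 + lh * hw * v3 + lh * lw * v4;
}

int main() {
  // 2x2 patch [[0, 1], [2, 3]]; sampling the centre averages all four values.
  const float patch[4] = {0.0f, 1.0f, 2.0f, 3.0f};
  assert(std::fabs(bilinear(patch, 2, 0.5f, 0.5f) - 1.5f) < 1e-6f);
  return 0;
}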
paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
0 → 100644
View file @ 31363c3f
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/deformable_conv_grad_kernel.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h"

namespace phi {

static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;

static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaximumNumBlocks);
}

template <typename T>
__global__ void ModulatedDeformableCol2imGpuKernel(
    const int nthreads, const T* data_col, const T* data_offset,
    const T* data_mask, const int channels, const int height, const int width,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int channel_per_deformable_group,
    const int batch_size, const int deformable_group, const int height_col,
    const int width_col, T* grad_im) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (size_t thread = index; thread < nthreads; thread += offset) {
    const int j = (thread / width_col / height_col / batch_size) % kernel_w;
    const int i =
        (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
    const int c =
        thread / width_col / height_col / batch_size / kernel_w / kernel_h;
    const int deformable_group_index = c / channel_per_deformable_group;

    int w_out = thread % width_col;
    int h_out = (thread / width_col) % height_col;
    int b = (thread / width_col / height_col) % batch_size;
    int w_in = w_out * stride_w - pad_w;
    int h_in = h_out * stride_h - pad_h;

    const T* data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const int data_offset_h_ptr =
        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
    const int data_offset_w_ptr =
        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
    const int data_mask_hw_ptr =
        ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
    const T offset_h = data_offset_ptr[data_offset_h_ptr];
    const T offset_w = data_offset_ptr[data_offset_w_ptr];
    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;

    T cur_top_grad = data_col[thread];
    if (data_mask) {
      const T* data_mask_ptr =
          data_mask + (b * deformable_group + deformable_group_index) *
                          kernel_h * kernel_w * height_col * width_col;
      const T mask = data_mask_ptr[data_mask_hw_ptr];
      cur_top_grad *= mask;
    }

    const int cur_h = static_cast<int>(cur_inv_h_data);
    const int cur_w = static_cast<int>(cur_inv_w_data);
    for (int dy = -2; dy <= 2; dy++) {
      for (int dx = -2; dx <= 2; dx++) {
        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
          int cur_bottom_grad_pos =
              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
          T weight = DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data,
                                           cur_h + dy, cur_w + dx, height,
                                           width);
          paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos,
                                          weight * cur_top_grad);
        }
      }
    }
  }
}

template <typename T, typename Context>
void ModulatedDeformableCol2im(const Context& dev_ctx, const T* data_col,
                               const T* data_offset, const T* data_mask,
                               const std::vector<int64_t>& im_shape,
                               const std::vector<int64_t>& col_shape,
                               const std::vector<int64_t>& kernel_shape,
                               const std::vector<int>& pad,
                               const std::vector<int>& stride,
                               const std::vector<int>& dilation,
                               const int deformable_group, T* grad_im) {
  int channel_per_deformable_group = im_shape[0] / deformable_group;
  int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
  int blocks = NumBlocks(num_kernels);
  int threads = kNumCUDAThreads;

  ModulatedDeformableCol2imGpuKernel<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
      num_kernels, data_col, data_offset, data_mask, im_shape[0], im_shape[1],
      im_shape[2], kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0],
      stride[1], dilation[0], dilation[1], channel_per_deformable_group,
      col_shape[1], deformable_group, col_shape[2], col_shape[3], grad_im);
}

template <typename T>
__global__ void ModulatedDeformableCol2imCoordGpuKernel(
    const int nthreads, const T* data_col, const T* data_im,
    const T* data_offset, const T* data_mask, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int offset_channels, const int deformable_group,
    const int height_col, const int width_col, T* grad_offset, T* grad_mask) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (size_t i = index; i < nthreads; i += offset) {
    T val = 0, mval = 0;
    const int w = i % width_col;
    const int h = (i / width_col) % height_col;
    const int c = (i / width_col / height_col) % offset_channels;
    const int b = (i / width_col / height_col) / offset_channels;

    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    const int col_step = kernel_h * kernel_w;
    int cnt = 0;
    const T* data_col_ptr =
        data_col + deformable_group_index * channel_per_deformable_group *
                       batch_size * width_col * height_col;
    const T* data_im_ptr =
        data_im + (b * deformable_group + deformable_group_index) *
                      channel_per_deformable_group / kernel_h / kernel_w *
                      height * width;
    const T* data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const T* data_mask_ptr =
        data_mask
            ? data_mask + (b * deformable_group + deformable_group_index) *
                              kernel_h * kernel_w * height_col * width_col
            : nullptr;

    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
         col_c += col_step) {
      const int col_pos =
          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      const int bp_dir = offset_c % 2;

      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i =
          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr =
          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr =
          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
           w_out);
      const T offset_h = data_offset_ptr[data_offset_h_ptr];
      const T offset_w = data_offset_ptr[data_offset_w_ptr];
      T inv_h = h_in + i * dilation_h + offset_h;
      T inv_w = w_in + j * dilation_w + offset_w;
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
        inv_h = inv_w = -2;
      } else {
        mval += data_col_ptr[col_pos] *
                funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width,
                                          width, height, width, inv_h, inv_w);
      }
      const T weight = DmcnGetCoordinateWeight(
          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
          width, bp_dir);
      if (data_mask_ptr) {
        const int data_mask_hw_ptr =
            (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
        const T mask = data_mask_ptr[data_mask_hw_ptr];
        val += weight * data_col_ptr[col_pos] * mask;
      } else {
        val += weight * data_col_ptr[col_pos];
      }
      cnt += 1;
    }
    grad_offset[i] = val;
    if (grad_mask && offset_c % 2 == 0)
      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
                      kernel_w +
                  offset_c / 2) *
                     height_col +
                 h) *
                    width_col +
                w] = mval;
  }
}

template <typename T, typename Context>
void ModulatedDeformableCol2imCoord(
    const Context& dev_ctx, const T* data_col, const T* data_im,
    const T* data_offset, const T* data_mask,
    const std::vector<int64_t>& im_shape,
    const std::vector<int64_t>& col_shape,
    const std::vector<int64_t>& kernel_shape, const std::vector<int>& paddings,
    const std::vector<int>& strides, const std::vector<int>& dilations,
    const int deformable_groups, T* grad_offset, T* grad_mask) {
  int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
                    col_shape[2] * col_shape[3] * deformable_groups;
  int channel_per_deformable_group = col_shape[0] / deformable_groups;
  int blocks = NumBlocks(num_kernels);
  int threads = kNumCUDAThreads;

  ModulatedDeformableCol2imCoordGpuKernel<T><<<blocks, threads, 0,
                                               dev_ctx.stream()>>>(
      num_kernels, data_col, data_im, data_offset, data_mask, im_shape[0],
      im_shape[1], im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0],
      paddings[1], strides[0], strides[1], dilations[0], dilations[1],
      channel_per_deformable_group, col_shape[1],
      2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
      deformable_groups, col_shape[2], col_shape[3], grad_offset, grad_mask);
}

template <typename T>
__global__ void FilterGradAddupGpuKernel(const int nthreads, const int n,
                                         const int height, const int width,
                                         const T* dweight_3d, T* filter_grad) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (size_t i = index; i < nthreads; i += offset) {
    filter_grad[i] = filter_grad[i] + dweight_3d[i];
  }
}

template <typename T, typename Context>
void FilterGradAddup(const Context& dev_ctx, const int nthreads, const int n,
                     const int height, const int width, const T* dweight_3d,
                     T* filter_grad) {
  FilterGradAddupGpuKernel<T><<<NumBlocks(nthreads), kNumCUDAThreads, 0,
                                dev_ctx.stream()>>>(
      nthreads, n, height, width, dweight_3d, filter_grad);
}

}  // namespace phi

PD_REGISTER_KERNEL(deformable_conv_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::DeformableConvGradKernel,
                   float,
                   double) {}
paddle/phi/kernels/gpu/deformable_conv_kernel.cu
浏览文件 @
31363c3f
@@ -16,142 +16,8 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h"
-
-namespace phi {
-
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaximumNumBlocks = 4096;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaximumNumBlocks);
-}
-
-template <typename T>
-__global__ void ModulatedDeformableIm2colGpuKernel(
-    const int nthreads, const T* data_im, const T* data_offset,
-    const T* data_mask, const int height, const int width, const int kernel_h,
-    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, const int dilation_h, const int dilation_w,
-    const int channel_per_deformable_group, const int batch_size,
-    const int num_channels, const int deformable_group, const int height_col,
-    const int width_col, T* data_col) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
-    const int w_col = i % width_col;
-    const int h_col = (i / width_col) % height_col;
-    const int b_col = (i / width_col) / height_col % batch_size;
-    const int c_im = (i / width_col / height_col) / batch_size;
-    const int c_col = c_im * kernel_h * kernel_w;
-
-    const int deformable_group_index = c_im / channel_per_deformable_group;
-
-    const int h_in = h_col * stride_h - pad_h;
-    const int w_in = w_col * stride_w - pad_w;
-
-    T* data_col_ptr =
-        data_col +
-        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
-    const T* data_im_ptr =
-        data_im + (b_col * num_channels + c_im) * height * width;
-    const T* data_offset_ptr =
-        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
-                          kernel_h * kernel_w * height_col * width_col;
-    const T* data_mask_ptr =
-        data_mask + (b_col * deformable_group + deformable_group_index) *
-                        kernel_h * kernel_w * height_col * width_col;
-
-    for (int i = 0; i < kernel_h; ++i) {
-      for (int j = 0; j < kernel_w; ++j) {
-        const int data_offset_h_ptr =
-            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
-        const int data_offset_w_ptr =
-            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
-            w_col;
-        const int data_mask_hw_ptr =
-            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
-        const T offset_h = data_offset_ptr[data_offset_h_ptr];
-        const T offset_w = data_offset_ptr[data_offset_w_ptr];
-        const T mask = data_mask_ptr[data_mask_hw_ptr];
-        T val = static_cast<T>(0);
-        const T h_im = h_in + i * dilation_h + offset_h;
-        const T w_im = w_in + j * dilation_w + offset_w;
-        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
-          val =
-              DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
-        }
-        *data_col_ptr = val * mask;
-        data_col_ptr += batch_size * height_col * width_col;
-      }
-    }
-  }
-}
-
-template <typename T, typename Context>
-void ModulatedDeformableIm2col(const Context& dev_ctx, const T* data_im,
-                               const T* data_offset, const T* data_mask,
-                               const std::vector<int64_t>& im_shape,
-                               const std::vector<int64_t>& col_shape,
-                               const std::vector<int64_t>& filter_shape,
-                               const std::vector<int>& paddings,
-                               const std::vector<int>& strides,
-                               const std::vector<int>& dilations,
-                               const int deformable_groups, T* data_col) {
-  int channel_per_deformable_group = im_shape[0] / deformable_groups;
-  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
-
-  int blocks = NumBlocks(num_kernels);
-  int threads = kNumCUDAThreads;
-
-  ModulatedDeformableIm2colGpuKernel<T>
-      <<<blocks, threads, 0, dev_ctx.stream()>>>(
-          num_kernels, data_im, data_offset, data_mask, im_shape[1],
-          im_shape[2], filter_shape[2], filter_shape[3], paddings[0],
-          paddings[1], strides[0], strides[1], dilations[0], dilations[1],
-          channel_per_deformable_group, col_shape[1], im_shape[0],
-          deformable_groups, col_shape[2], col_shape[3], data_col);
-}
-
-}  // namespace phi
-
 PD_REGISTER_KERNEL(deformable_conv,
                    GPU,
                    ALL_LAYOUT,
...
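Each iteration of the removed ModulatedDeformableIm2colGpuKernel (now provided by funcs::ModulatedDeformableIm2col) samples the input at a learned, fractionally offset position with bilinear interpolation and scales the result by the modulation mask. A small NumPy sketch of that per-tap computation; the helper name bilinear and all concrete numbers are illustrative assumptions, not the kernel's exact indexing:

import numpy as np

def bilinear(im, h, w):
    # bilinearly sample im at fractional (h, w), treating out-of-image taps as zero
    H, W = im.shape
    h0, w0 = int(np.floor(h)), int(np.floor(w))
    val = 0.0
    for hh, ww in ((h0, w0), (h0, w0 + 1), (h0 + 1, w0), (h0 + 1, w0 + 1)):
        if 0 <= hh < H and 0 <= ww < W:
            val += (1 - abs(h - hh)) * (1 - abs(w - ww)) * im[hh, ww]
    return val

im = np.arange(25, dtype=np.float32).reshape(5, 5)
h_in, w_in = 2, 2                 # top-left corner of the receptive field
offset_h, offset_w = 0.3, -0.7    # learned offsets for this kernel tap
mask = 0.9                        # learned modulation scalar for this tap
col_value = bilinear(im, h_in + offset_h, w_in + offset_w) * mask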
paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
namespace phi {

template <typename T>
HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, T argmax_w, const int h,
                                   const int w, const int height,
                                   const int width) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    return 0;
  }

  int argmax_h_low = floor(argmax_h);
  int argmax_w_low = floor(argmax_w);
  int argmax_h_high = argmax_h_low + 1;
  int argmax_w_high = argmax_w_low + 1;

  T weight = 0;
  weight = (h == argmax_h_low && w == argmax_w_low)
               ? (h + 1 - argmax_h) * (w + 1 - argmax_w)
               : weight;
  weight = (h == argmax_h_low && w == argmax_w_high)
               ? (h + 1 - argmax_h) * (argmax_w + 1 - w)
               : weight;
  weight = (h == argmax_h_high && w == argmax_w_low)
               ? (argmax_h + 1 - h) * (w + 1 - argmax_w)
               : weight;
  weight = (h == argmax_h_high && w == argmax_w_high)
               ? (argmax_h + 1 - h) * (argmax_w + 1 - w)
               : weight;
  return weight;
}

template <typename T>
HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h, T argmax_w, const int height,
                                     const int width, const T* im_data,
                                     const int data_width, const int bp_dir) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    return 0;
  }

  int argmax_h_low = floor(argmax_h);
  int argmax_w_low = floor(argmax_w);
  int argmax_h_high = argmax_h_low + 1;
  int argmax_w_high = argmax_w_low + 1;

  T weight = 0;
  if (bp_dir == 0) {
    weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
                  ? -1 * (argmax_w_low + 1 - argmax_w) *
                        im_data[argmax_h_low * data_width + argmax_w_low]
                  : 0;
    weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
                  ? -1 * (argmax_w - argmax_w_low) *
                        im_data[argmax_h_low * data_width + argmax_w_high]
                  : 0;
    weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
                  ? (argmax_w_low + 1 - argmax_w) *
                        im_data[argmax_h_high * data_width + argmax_w_low]
                  : 0;
    weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
                  ? (argmax_w - argmax_w_low) *
                        im_data[argmax_h_high * data_width + argmax_w_high]
                  : 0;
  } else if (bp_dir == 1) {
    weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
                  ? -1 * (argmax_h_low + 1 - argmax_h) *
                        im_data[argmax_h_low * data_width + argmax_w_low]
                  : 0;
    weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
                  ? (argmax_h_low + 1 - argmax_h) *
                        im_data[argmax_h_low * data_width + argmax_w_high]
                  : 0;
    weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
                  ? -1 * (argmax_h - argmax_h_low) *
                        im_data[argmax_h_high * data_width + argmax_w_low]
                  : 0;
    weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
                  ? (argmax_h - argmax_h_low) *
                        im_data[argmax_h_high * data_width + argmax_w_high]
                  : 0;
  }
  return weight;
}

template <typename T, typename Context>
void ModulatedDeformableCol2imCoord(
    const Context& dev_ctx, const T* data_col, const T* data_im,
    const T* data_offset, const T* data_mask,
    const std::vector<int64_t>& im_shape, const std::vector<int64_t>& col_shape,
    const std::vector<int64_t>& kernel_shape, const std::vector<int>& paddings,
    const std::vector<int>& strides, const std::vector<int>& dilations,
    const int deformable_groups, T* grad_offset, T* grad_mask);

template <typename T, typename Context>
void ModulatedDeformableCol2im(
    const Context& dev_ctx, const T* data_col, const T* data_offset,
    const T* data_mask, const std::vector<int64_t>& im_shape,
    const std::vector<int64_t>& col_shape,
    const std::vector<int64_t>& kernel_shape, const std::vector<int>& pad,
    const std::vector<int>& stride, const std::vector<int>& dilation,
    const int deformable_group, T* grad_im);

template <typename T, typename Context>
void FilterGradAddup(const Context& dev_ctx, const int nthreads, const int n,
                     const int height, const int width, const T* dweight_3d,
                     T* filter_grad);

template <typename T, typename Context>
void DeformableConvGradKernel(const Context& dev_ctx,
                              const DenseTensor& x,
                              const DenseTensor& offset,
                              const DenseTensor& filter,
                              paddle::optional<const DenseTensor&> mask,
                              const DenseTensor& out_grad,
                              const std::vector<int>& strides,
                              const std::vector<int>& paddings,
                              const std::vector<int>& dilations,
                              int deformable_groups,
                              int groups,
                              int im2col_step,
                              DenseTensor* dx,
                              DenseTensor* offset_grad,
                              DenseTensor* filter_grad,
                              DenseTensor* mask_grad) {
  const int batch_size = static_cast<int>(x.dims()[0]);

  DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size());
  std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
  std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
  std::vector<int64_t> output_shape_vec(phi::vectorize(out_grad.dims()));

  std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
  col_buffer_shape_vec[0] = x.dims()[1] * filter.dims()[2] * filter.dims()[3];
  col_buffer_shape_vec[1] = im2col_step;
  for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
    col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
  }

  std::vector<int64_t> output_buffer_shape_vec(1);
  output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
                               output_shape_vec[2] * output_shape_vec[3];

  DenseTensor col_buffer = Empty<T>(dev_ctx, col_buffer_shape_vec);
  DenseTensor output_buffer;
  output_buffer.ShareDataWith(out_grad).Resize(
      make_ddim(output_buffer_shape_vec));

  int64_t M =
      input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
  int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
  int64_t K = output_shape_vec[1] / groups;

  DDim weight_3d_shape = {groups, K, M};
  DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K, N};
  DDim col_buffer_3d_shape = {groups, M, N};
  DDim filter_grad_shape = {groups, K, M};

  DenseTensor weight_3d;
  weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
  DenseTensor out_grad_4d;
  out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
  DenseTensor col_buffer_3d;
  col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);

  phi::funcs::SetConstant<Context, T> set_zero;
  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);

  int input_dim = x.numel() / x.dims()[0];
  int input_offset_dim = offset.numel() / offset.dims()[0];
  int input_mask_dim = mask ? mask->numel() / mask->dims()[0] : 0;

  if (filter_grad) {
    Full<T>(dev_ctx, {filter_grad_shape.Get(), filter_grad_shape.size()}, 0,
            filter_grad);
  }

  if (dx) {
    dev_ctx.template Alloc<T>(dx);
    set_zero(dev_ctx, dx, static_cast<T>(0));
  }

  if (offset_grad) {
    dev_ctx.template Alloc<T>(offset_grad);
    set_zero(dev_ctx, offset_grad, static_cast<T>(0));

    if (mask_grad) {
      dev_ctx.template Alloc<T>(mask_grad);
      set_zero(dev_ctx, mask_grad, static_cast<T>(0));
    }
  }

  for (int i = 0; i < batch_size / im2col_step; ++i) {
    DenseTensor out_grad_3d = out_grad_4d.Slice(i, i + 1).Resize(
        phi::slice_ddim(out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
    for (int g = 0; g < groups; ++g) {
      DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
          phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
      DenseTensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
          phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
      DenseTensor col_buffer_3d_slice = col_buffer_3d.Slice(g, g + 1).Resize(
          phi::slice_ddim(col_buffer_3d.dims(), 1,
                          col_buffer_3d.dims().size()));
      blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0),
                  &col_buffer_3d_slice, T(0.0));
    }
    col_buffer.Resize(make_ddim(col_buffer_shape_vec));

    T* col_buffer_ptr = col_buffer.data<T>();
    const T* input_ptr = x.data<T>();
    const T* offset_ptr = offset.data<T>();
    const T* mask_data_ptr =
        mask ? mask->data<T>() + i * im2col_step * input_mask_dim : nullptr;

    if (offset_grad) {
      T* offset_grad_ptr = offset_grad->data<T>();
      T* mask_grad_data_ptr =
          mask_grad ? mask_grad->data<T>() + i * im2col_step * input_mask_dim
                    : nullptr;
      // get grad of offset and mask
      ModulatedDeformableCol2imCoord(
          dev_ctx, col_buffer_ptr, input_ptr + i * im2col_step * input_dim,
          offset_ptr + i * im2col_step * input_offset_dim, mask_data_ptr,
          input_shape_vec, col_buffer_shape_vec, filter_shape_vec, paddings,
          strides, dilations, deformable_groups,
          offset_grad_ptr + i * im2col_step * input_offset_dim,
          mask_grad_data_ptr);
    }
    if (dx) {
      T* dx_ptr = dx->data<T>();
      // get grad of input
      ModulatedDeformableCol2im(
          dev_ctx, col_buffer_ptr,
          offset_ptr + i * im2col_step * input_offset_dim, mask_data_ptr,
          input_shape_vec, col_buffer_shape_vec, filter_shape_vec, paddings,
          strides, dilations, deformable_groups,
          dx_ptr + i * im2col_step * input_dim);
      dx->Resize(x.dims());
    }

    funcs::ModulatedDeformableIm2col(
        dev_ctx, input_ptr + i * im2col_step * input_dim,
        offset_ptr + i * im2col_step * input_offset_dim, mask_data_ptr,
        input_shape_vec, col_buffer_shape_vec, filter_shape_vec, paddings,
        strides, dilations, deformable_groups, col_buffer_ptr);

    col_buffer_3d.Resize(col_buffer_3d_shape);

    if (filter_grad) {
      DenseTensor dweight_3d = Empty<T>(
          dev_ctx, {filter_grad_shape.Get(), filter_grad_shape.size()});
      for (int g = 0; g < groups; ++g) {
        DenseTensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
            phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
        DenseTensor col_buffer_3d_slice = col_buffer_3d.Slice(g, g + 1).Resize(
            phi::slice_ddim(col_buffer_3d.dims(), 1,
                            col_buffer_3d.dims().size()));
        DenseTensor dweight_3d_slice = dweight_3d.Slice(g, g + 1).Resize(
            phi::slice_ddim(dweight_3d.dims(), 1, dweight_3d.dims().size()));

        blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
                    T(1.0), &dweight_3d_slice, T(0.0));
      }

      // update grad of weights
      FilterGradAddup<T>(dev_ctx, dweight_3d.numel(), groups, K, M,
                         dweight_3d.data<T>(), filter_grad->data<T>());
    }
  }
  if (filter_grad) {
    filter_grad->Resize(filter.dims());
  }
}

}  // namespace phi
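In the notation of the shapes computed above, the two blas.MatMul calls inside DeformableConvGradKernel reduce, per group g and per im2col step, to the following (a summary of the code, with \tilde{C}_g denoting the columns recomputed by funcs::ModulatedDeformableIm2col):

\[
C_g = W_g^{\top}\,\partial Y_g, \qquad \partial W_g = \partial Y_g\,\tilde{C}_g^{\top},
\qquad W_g \in \mathbb{R}^{K \times M},\; C_g,\tilde{C}_g \in \mathbb{R}^{M \times N},\; \partial Y_g \in \mathbb{R}^{K \times N}.
\]

C_g, the gradient with respect to the columns, is scattered back to the input, offset and mask gradients by ModulatedDeformableCol2im and ModulatedDeformableCol2imCoord, while FilterGradAddup accumulates \partial W_g into filter_grad.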
paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
@@ -18,66 +18,17 @@
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
+#include "paddle/utils/optional.h"

 namespace phi {

-template <typename T>
-HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, const int data_width,
-                                const int height, const int width, T h, T w) {
-  int h_low = floor(h);
-  int w_low = floor(w);
-  int h_high = h_low + 1;
-  int w_high = w_low + 1;
-
-  T lh = h - h_low;
-  T lw = w - w_low;
-  T hh = 1 - lh;
-  T hw = 1 - lw;
-
-  T v1 =
-      (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0;
-  T v2 = (h_low >= 0 && w_high <= width - 1)
-             ? bottom_data[h_low * data_width + w_high]
-             : 0;
-  T v3 = (h_high <= height - 1 && w_low >= 0)
-             ? bottom_data[h_high * data_width + w_low]
-             : 0;
-  T v4 = (h_high <= height - 1 && w_high <= width - 1)
-             ? bottom_data[h_high * data_width + w_high]
-             : 0;
-
-  T w1 = hh * hw;
-  T w2 = hh * lw;
-  T w3 = lh * hw;
-  T w4 = lh * lw;
-
-  return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
-}
-
-template <typename T, typename Context>
-void ModulatedDeformableIm2col(const Context& dev_ctx, const T* data_im,
-                               const T* data_offset, const T* data_mask,
-                               const std::vector<int64_t>& im_shape,
-                               const std::vector<int64_t>& col_shape,
-                               const std::vector<int64_t>& filter_shape,
-                               const std::vector<int>& paddings,
-                               const std::vector<int>& strides,
-                               const std::vector<int>& dilations,
-                               const int deformable_groups, T* data_col);
-
 template <typename T, typename Context>
 void DeformableConvKernel(const Context& dev_ctx,
                           const DenseTensor& x,
                           const DenseTensor& offset,
                           const DenseTensor& filter,
-                          const DenseTensor& mask,
+                          paddle::optional<const DenseTensor&> mask,
                           const std::vector<int>& strides,
                           const std::vector<int>& paddings,
                           const std::vector<int>& dilations,
...
@@ -125,28 +76,31 @@ void DeformableConvKernel(const Context& dev_ctx,
   int input_dim = x.numel() / x.dims()[0];
   int input_offset_dim = offset.numel() / offset.dims()[0];
-  int input_mask_dim = mask.numel() / mask.dims()[0];
+  int input_mask_dim = mask ? mask->numel() / mask->dims()[0] : 0;
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
   const T* input_ptr = x.data<T>();
   const T* offset_ptr = offset.data<T>();
-  const T* mask_ptr = mask.data<T>();
+  const T* mask_ptr = mask ? mask->data<T>() : nullptr;
   T* col_buffer_ptr = col_buffer.data<T>();
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
   for (int i = 0; i < batch_size / im2col_step; ++i) {
-    ModulatedDeformableIm2col(
-        dev_ctx, input_ptr + i * im2col_step * input_dim,
-        offset_ptr + i * im2col_step * input_offset_dim,
-        mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
-        col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations,
-        deformable_groups, col_buffer_ptr);
+    const T* temp_mask_ptr =
+        mask_ptr ? mask_ptr + i * im2col_step * input_mask_dim : nullptr;
+    funcs::ModulatedDeformableIm2col(
+        dev_ctx, input_ptr + i * im2col_step * input_dim,
+        offset_ptr + i * im2col_step * input_offset_dim, temp_mask_ptr,
+        input_shape_vec, col_buffer_shape_vec, filter_shape_vec, paddings,
+        strides, dilations, deformable_groups, col_buffer_ptr);
     DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize(
         phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
     // get the product of pixel and weight
...
paddle/phi/ops/compat/deformable_conv_sig.cc
@@ -29,6 +29,34 @@ KernelSignature DeformableConvOpArgumentMapping(
                          {"Output"});
 }

+KernelSignature DeformableConvGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "deformable_conv_grad",
+      {"Input", "Offset", "Filter", "Mask", GradVarName("Output")},
+      {"strides",
+       "paddings",
+       "dilations",
+       "deformable_groups",
+       "groups",
+       "im2col_step"},
+      {GradVarName("Input"),
+       GradVarName("Offset"),
+       GradVarName("Filter"),
+       GradVarName("Mask")});
+}
+
 }  // namespace phi

+PD_REGISTER_BASE_KERNEL_NAME(deformable_conv_v1, deformable_conv);
+PD_REGISTER_BASE_KERNEL_NAME(deformable_conv_v1_grad, deformable_conv_grad);
+
 PD_REGISTER_ARG_MAPPING_FN(deformable_conv,
                            phi::DeformableConvOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(deformable_conv_grad,
+                           phi::DeformableConvGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(deformable_conv_v1,
+                           phi::DeformableConvOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(deformable_conv_v1_grad,
+                           phi::DeformableConvGradOpArgumentMapping);
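Both op names above now map onto the same phi kernels; at the Python level the split between deformable_conv and deformable_conv_v1 is driven by whether a modulation mask is supplied. A hedged usage sketch, assuming the public paddle.vision.ops.deform_conv2d API and purely illustrative shapes:

import paddle
from paddle.vision.ops import deform_conv2d

x = paddle.rand([2, 8, 16, 16])
kh, kw = 3, 3
weight = paddle.rand([16, 8, kh, kw])
offset = paddle.rand([2, 2 * kh * kw, 16, 16])   # (dy, dx) per kernel tap
mask = paddle.rand([2, kh * kw, 16, 16])         # one scalar per tap

y_v2 = deform_conv2d(x, offset, weight, padding=1, mask=mask)   # deformable_conv
y_v1 = deform_conv2d(x, offset, weight, padding=1, mask=None)   # deformable_conv_v1
print(y_v2.shape, y_v1.shape)   # both [2, 16, 16, 16]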
python/paddle/distributed/auto_parallel/dist_loader.py
@@ -15,6 +15,7 @@
 import abc
 import numpy as np
 import paddle
+from .utils import to_list
 from paddle.io import DataLoader, DistributedBatchSampler
...
@@ -51,10 +52,11 @@ class NonIterableGeneratorLoader(DistributedDataLoader):
                  places,
                  batch_size=1,
                  epochs=1,
-                 steps_per_epoch=1000,
+                 steps_per_epoch=None,
                  data_parallel_world_size=None,
                  data_parallel_rank=None,
-                 drop_last=False):
+                 drop_last=False,
+                 inputs=[]):
         self.feed_list = feed_list
         self.places = places
         self.steps_per_epoch = steps_per_epoch
...
@@ -62,6 +64,8 @@ class NonIterableGeneratorLoader(DistributedDataLoader):
             dataset, batch_size, epochs, data_parallel_world_size,
             data_parallel_rank, drop_last)
         self._inner_dataloader = self._create_inner_dataloader()
+        self._steps = self._infer_steps()
+        self._inputs = inputs

     def __iter__(self):
         self._cur_step = 0
...
@@ -69,22 +73,38 @@ class NonIterableGeneratorLoader(DistributedDataLoader):
         return self

     def __next__(self):
-        if self._cur_step < self.steps_per_epoch:
+        if self._cur_step < self._steps:
             self._cur_step += 1
         else:
             self._inner_dataloader.reset()
             raise StopIteration

+    def _infer_steps(self):
+        if self.steps_per_epoch is not None:
+            return self.steps_per_epoch
+        try:
+            steps_per_epoch = len(self.dataset) // self.batch_size
+        except:
+            raise ValueError(
+                "Pleace set `steps_per_epoch` or implement `__len__` methond in dataset class."
+            )
+        return steps_per_epoch
+
     def _create_inner_dataloader(self):
         def data_generator():
             batch_data = None
             for step, data in enumerate(self.dataset):
+                if not isinstance(data, list):
+                    data = to_list(data)
+
                 if batch_data is None:
                     batch_data = [[] for i in range(len(data))]
-                for idx, data_item in enumerate(data):
-                    batch_data[idx].append(np.array(data_item))
+
+                for idx in range(len(data)):
+                    batch_data[idx].append(data[idx])
+
                 if (step + 1) % self.batch_size == 0:
-                    yield batch_data[0], batch_data[1]
+                    yield batch_data
                     batch_data = None

         dataloader = paddle.fluid.io.DataLoader.from_generator(
...
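With steps_per_epoch now defaulting to None, _infer_steps above falls back to len(dataset) // batch_size, so the dataset has to implement __len__. A minimal sketch of a compatible dataset; the class name and sizes are illustrative assumptions:

import numpy as np
from paddle.io import Dataset

class RandomDataset(Dataset):
    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        image = np.random.rand(32).astype("float32")
        label = np.array(idx % 2, dtype="int64")
        return image, label

    def __len__(self):
        # lets the loader infer steps_per_epoch = len(dataset) // batch_size
        return self.num_samples

dataset = RandomDataset(100)   # with batch_size=4 this yields 25 inferred steps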
python/paddle/distributed/auto_parallel/dist_saver.py
0 → 100644
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
import re
import os
import errno
import pickle
import warnings
import logging
import numpy as np
import paddle

from paddle import fluid
from paddle.fluid import core
from paddle.fluid.framework import static_only
from .utils import get_dist_attr
from .converter import Converter
from .process_group import _g_process_group_map
from ..utils import get_logger


def check_filename(re_exp, filename):
    if re.search(re_exp, filename):
        return True
    else:
        return False


def _process_path(path):
    filename = os.path.basename(path)
    if filename == "":
        raise ValueError(
            "path should be of 'dirname/filename' format, but received filename is empty string"
        )
    try:
        dirname = os.path.dirname(path)
        os.makedirs(dirname)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    return dirname, filename


class DistributedSaver:
    def __init__(self):
        self._logger = get_logger(logging.INFO)

    def save(self, path, serial_program, dist_main_program, dist_context):

        dirname, filename = _process_path(path)

        rank_id = paddle.distributed.get_rank()
        # save serial program when rank id is 0
        if rank_id == 0:
            self._save_rank_mapping(dirname)
            serial_model_filename = filename + "_serial.pdmodel"
            serial_model_path = os.path.join(dirname, serial_model_filename)
            with open(serial_model_path, "wb") as f:
                f.write(serial_program.desc.serialize_to_string())

        # save distributed main program
        dist_model_filename = filename + "_dist" + str(rank_id) + ".pdmodel"
        dist_model_path = os.path.join(dirname, dist_model_filename)
        with open(dist_model_path, "wb") as f:
            f.write(dist_main_program.desc.serialize_to_string())

        # save distributed params
        dist_param_filename = filename + "_dist" + str(rank_id) + ".pdparams"
        dist_param_path = os.path.join(dirname, dist_param_filename)
        dist_param = {
            k: np.array(v)
            for k, v in dist_main_program.state_dict().items()
        }
        with open(dist_param_path, "wb") as f:
            pickle.dump(dist_param, f)

        # save distributed attribute
        dist_attr_filename = filename + "_dist" + str(rank_id) + ".pdattr"
        dist_attr_path = os.path.join(dirname, dist_attr_filename)
        dist_attrs = get_dist_attr(dist_main_program, dist_context)
        with open(dist_attr_path, "wb") as f:
            pickle.dump(dist_attrs, f)

        # TODO:save cluster.json

    def load(self, path, program, dist_context, strict=True,
             load_optimizer=True):
        # TODO: if `program` is None, load `path.pdmodel`.
        filename = os.path.basename(path)
        if filename == "":
            raise ValueError(
                "path should be of 'dirname/filename' format, but received filename is empty string"
            )
        dirname = os.path.dirname(path)

        # load path.pdparam
        param_file_list = []
        for param_file in os.listdir(dirname):
            if check_filename('{}(.*)_dist(.*).pdparams'.format(filename),
                              param_file):
                param_file_list.append(os.path.join(dirname, param_file))
        param_file_list.sort()
        self._logger.info(
            "Load distributed attribute file: {}".format(param_file_list))
        param_dict = {}
        for param_file in param_file_list:
            with open(param_file, 'rb') as f:
                state_dict_info = pickle.load(f, encoding='latin1')
                for name, value in state_dict_info.items():
                    if name in param_dict:
                        param_dict[name].append(np.array(value))
                    else:
                        param_dict[name] = [np.array(value)]

        # load path.pdattr
        dist_attr_file_list = []
        for dist_attr_file in os.listdir(dirname):
            if check_filename('{}(.*)_dist(.*).pdattr'.format(filename),
                              dist_attr_file):
                dist_attr_file_list.append(
                    os.path.join(dirname, dist_attr_file))
        dist_attr_file_list.sort()
        self._logger.info(
            "Load distributed attribute file: {}".format(dist_attr_file_list))
        pre_dist_attr = {}
        for dist_attr_file in dist_attr_file_list:
            with open(dist_attr_file, 'rb') as f:
                dist_attr = pickle.load(f, encoding='latin1')
                for name, attr in dist_attr.items():
                    if name not in pre_dist_attr:
                        pre_dist_attr[name] = attr

        # get current dist_attr
        cur_dist_attr = get_dist_attr(program, dist_context)

        # param convert
        converter = Converter(param_dict, pre_dist_attr, cur_dist_attr)
        param_dict = converter.convert(strict=strict)
        program.set_state_dict(param_dict)

    def save_inference_model(self, path, feed_vars, fetch_vars, exe, **kwargs):

        dirname, filename = _process_path(path)

        # save distributed inference program
        rank_id = paddle.distributed.get_rank()
        if rank_id == 0:
            self._save_rank_mapping(dirname)
        op_role_key = core.op_proto_and_checker_maker.kOpRoleAttrName()
        op_role_forward = int(core.op_proto_and_checker_maker.OpRole.Forward)

        dist_main_prog = kwargs.get('program', None)
        if not dist_main_prog:
            dist_main_prog = fluid.default_main_program()
        global_block = dist_main_prog.global_block()
        ops = global_block.ops

        feed_vars_names = list(map(lambda x: x.name, feed_vars))
        fetch_vars_names = list(map(lambda x: x.name, fetch_vars))

        last_idx = -1
        for idx, op in enumerate(ops):
            if op.attr(op_role_key) != op_role_forward:
                continue
            if op.type == "read" or op.type == "feed" or op.type == 'recv_v2':
                feed_vars_names += op.output("Out")
            if op.type == "send_v2":
                fetch_vars_names += op.input("X")
                last_idx = max(idx, last_idx)
            for out_name in op.output_arg_names:
                if out_name in fetch_vars_names:
                    last_idx = max(idx, last_idx)

        used_inputs = []
        used_outputs = []
        for idx, op in enumerate(ops):
            if idx > last_idx:
                break
            used_inputs += op.input_arg_names
            used_outputs += op.output_arg_names

        dist_feed_vars_names = list(set(feed_vars_names) & set(used_inputs))
        dist_fetch_vars_names = list(set(fetch_vars_names) & set(used_outputs))

        dist_feed_vars = [
            global_block.vars[name] for name in dist_feed_vars_names
        ]
        dist_fetch_vars = [
            global_block.vars[name] for name in dist_fetch_vars_names
        ]

        # NOTE: `paddle.static.save_inference_model` does not support subblock.
        dist_filename = filename + "_dist" + str(rank_id)
        dist_path = os.path.join(dirname, dist_filename)
        paddle.static.save_inference_model(
            dist_path,
            dist_feed_vars,
            dist_fetch_vars,
            exe,
            program=dist_main_prog)

    def _save_rank_mapping(self, dirname):
        path = os.path.join(dirname, 'rank_mapping.csv')
        f = open(path, 'w')
        f.write('[ring_id -> ranks]\n')
        for process_group in _g_process_group_map.values():
            ring_id = process_group._group_id
            ranks = [str(rank) for rank in process_group._ranks]
            id_to_rank = str(ring_id) + "," + ",".join(ranks) + '\n'
            f.write(id_to_rank)
            id_to_rank = ""
        f.write('[rank -> ring_ids]\n')
        rank_to_id_dict = {}
        for process_group in _g_process_group_map.values():
            ring_id = process_group._group_id
            for rank in process_group._ranks:
                if rank in rank_to_id_dict:
                    rank_to_id_dict[rank].append(str(ring_id))
                else:
                    rank_to_id_dict[rank] = [str(ring_id)]
        rank_to_id = ""
        for item, val in rank_to_id_dict.items():
            rank_to_id += str(item) + ","
            rank_to_id += ",".join(val) + "\n"
            f.write(rank_to_id)
            rank_to_id = ""
        f.close()
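Given the naming scheme used by save and _save_rank_mapping above, a two-rank job saving to "./output/model" would be expected to leave the files listed below; the path and rank ids are illustrative assumptions:

import os

path = "./output/model"
dirname, filename = os.path.dirname(path), os.path.basename(path)
expected = ["rank_mapping.csv", filename + "_serial.pdmodel"]   # written by rank 0
for rank_id in (0, 1):
    for suffix in (".pdmodel", ".pdparams", ".pdattr"):
        expected.append("{}_dist{}{}".format(filename, rank_id, suffix))
print(sorted(expected))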
python/paddle/distributed/auto_parallel/engine.py
...
@@ -19,138 +19,158 @@ from collections import defaultdict
...
@@ -19,138 +19,158 @@ from collections import defaultdict
import
paddle
import
paddle
from
paddle
import
fluid
from
paddle
import
fluid
from
paddle.io
import
Dataset
from
paddle.io
import
Dataset
from
paddle.fluid.backward
import
append_backward
from
paddle.metric
import
Metric
import
paddle.fluid.core
as
core
from
paddle.static
import
InputSpec
from
paddle.static
import
InputSpec
from
paddle.fluid
import
core
from
paddle.fluid
import
program_guard
from
paddle.fluid
import
program_guard
from
paddle.fluid.backward
import
append_backward
from
paddle.fluid.framework
import
Operator
from
paddle.fluid.framework
import
Operator
from
paddle.fluid.framework
import
_current_expected_place
as
_get_device
from
paddle.fluid.framework
import
_current_expected_place
as
_get_device
from
paddle.fluid.dygraph.parallel
import
ParallelEnv
from
paddle.fluid.dygraph.parallel
import
ParallelEnv
from
paddle.distributed.passes
import
new_pass
,
PassContext
from
paddle.distributed.passes
import
new_pass
,
PassContext
from
paddle.distributed.utils
import
get_logger
from
paddle.distributed.utils
import
get_logger
from
.dist_loader
import
NonIterableGeneratorLoader
from
.dist_op
import
DistributedOperator
from
.dist_tensor
import
DistributedTensor
from
.dist_context
import
DistributedContext
from
.dist_context
import
get_default_distributed_context
from
.dist_context
import
set_default_distributed_context
from
.process_group
import
get_all_process_groups
from
.process_group
import
get_process_group
from
.process_group
import
get_world_process_group
from
.process_group
import
_g_process_group_map
,
ProcessGroup
from
.completion
import
Completer
from
.partitioner
import
Partitioner
from
.reshard
import
reshard
,
HAS_SENT
,
HAS_RECV
,
HAS_ALLGATHER
from
.cluster
import
Cluster
from
.mapper
import
mapping
from
.mapper
import
mapping
from
.cluster
import
Cluster
from
.reshard
import
reshard
from
.planner
import
Planner
from
.planner
import
Planner
from
.utils
import
make_data_unshard
from
.completion
import
Completer
from
.utils
import
set_grad_var_shape
from
.partitioner
import
Partitioner
from
.utils
import
print_program_with_dist_attr
from
.dist_op
import
DistributedOperator
from
.utils
import
SerialProgramInfo
from
.dist_saver
import
DistributedSaver
from
.dist_loader
import
NonIterableGeneratorLoader
from
.utils
import
make_data_unshard
,
set_grad_var_shape
from
.utils
import
print_program_with_dist_attr
,
to_list
from
.process_group
import
get_all_process_groups
,
get_world_process_group
from
.dist_context
import
DistributedContext
,
get_default_distributed_context
paddle
.
enable_static
()
paddle
.
enable_static
()
def
to_list
(
value
):
if
value
is
None
:
return
value
if
isinstance
(
value
,
(
list
,
tuple
)):
return
list
(
value
)
return
[
value
]
class
Engine
:
class
Engine
:
def
__init__
(
self
,
model
=
None
,
data_spec
=
None
,
cluster
=
None
,
strategy
=
None
):
def
__init__
(
self
,
model
=
None
,
inputs_spec
=
None
,
labels_spec
=
None
,
cluster
=
None
,
strategy
=
None
):
self
.
model
=
model
self
.
model
=
model
self
.
data_spec
=
data_spec
self
.
inputs_spec
=
self
.
_validate_spec
(
inputs_spec
)
self
.
labels_spec
=
self
.
_validate_spec
(
labels_spec
)
self
.
cluster
=
cluster
self
.
cluster
=
cluster
self
.
strategy
=
strategy
self
.
strategy
=
strategy
self
.
_executor
=
None
self
.
_executor
=
None
self
.
_orig_main_prog
=
fluid
.
default_main_program
()
self
.
_orig_main_prog
=
fluid
.
default_main_program
()
self
.
_orig_startup_prog
=
fluid
.
default_startup_program
()
self
.
_orig_startup_prog
=
fluid
.
default_startup_program
()
self
.
_orig_dist_context
=
get_default_distributed_context
()
self
.
_serial_main_progs
=
{}
self
.
_serial_main_progs
=
{}
self
.
_serial_startup_progs
=
{}
self
.
_serial_startup_progs
=
{}
self
.
_dist_main_progs
=
defaultdict
(
dict
)
self
.
_dist_main_progs
=
defaultdict
(
dict
)
# dist main programs
self
.
_dist_startup_progs
=
defaultdict
(
dict
)
self
.
_dist_startup_progs
=
defaultdict
(
dict
)
# dist startup programs
self
.
_orig_dist_context
=
get_default_distributed_context
()
self
.
_dist_contexts
=
{}
self
.
_dist_contexts
=
{}
self
.
_pass_contexts
=
{}
self
.
_pass_contexts
=
{}
self
.
_cur_rank
=
paddle
.
distributed
.
get_rank
()
self
.
_cur_rank
=
paddle
.
distributed
.
get_rank
()
self
.
_logger
=
get_logger
(
logging
.
INFO
)
self
.
_logger
=
get_logger
(
logging
.
INFO
)
self
.
_saver
=
DistributedSaver
()
self
.
_feed_vars
=
{}
self
.
_fetch_vars
=
{}
def
prepare
(
self
,
def
prepare
(
self
,
optimizer
=
None
,
optimizer
=
None
,
loss
=
None
,
loss
=
None
,
metrics
=
None
,
metrics
=
None
,
mode
=
"train"
,
mode
=
'train'
,
all_ranks
=
False
):
all_ranks
=
False
):
self
.
optimizer
=
optimizer
self
.
_optimizer
=
optimizer
self
.
loss
=
loss
# TODO: check loss type
self
.
metrics
=
metrics
self
.
_loss
=
loss
self
.
_metrics
=
to_list
(
metrics
)
for
m
in
[
'train'
,
'predict'
]:
self
.
mode
=
m
self
.
_build
(
m
)
# build forward program
self
.
_plan
(
m
)
# completion & planner
self
.
_parallel
(
m
,
all_ranks
)
# parallel
self
.
_initialize
(
m
)
# init comm and startup program
self
.
mode
=
mode
self
.
mode
=
mode
self
.
_build
()
self
.
_plan
()
if
not
all_ranks
:
self
.
_parallel
(
self
.
_cur_rank
)
else
:
world_process_group
=
get_world_process_group
()
all_ranks
=
world_process_group
.
ranks
for
rank
in
all_ranks
:
self
.
_parallel
(
rank
)
self
.
_place
=
_get_device
()
if
isinstance
(
self
.
_place
,
fluid
.
CUDAPlace
):
self
.
_place
=
fluid
.
CUDAPlace
(
ParallelEnv
().
dev_id
)
if
self
.
_executor
is
None
:
self
.
_executor
=
paddle
.
static
.
Executor
(
self
.
_place
)
def
_build
(
self
):
def
_build
(
self
,
mode
):
serial_main_prog
=
self
.
_serial_main_progs
.
get
(
self
.
mode
,
None
)
serial_main_prog
=
self
.
_serial_main_progs
.
get
(
mode
,
None
)
if
serial_main_prog
is
not
None
:
if
serial_main_prog
is
not
None
:
return
return
losses
=
[]
metrics
=
[]
serial_main_prog
=
self
.
_orig_main_prog
.
clone
()
serial_main_prog
=
self
.
_orig_main_prog
.
clone
()
serial_startup_prog
=
self
.
_orig_startup_prog
.
clone
()
serial_startup_prog
=
self
.
_orig_startup_prog
.
clone
()
with
fluid
.
program_guard
(
serial_main_prog
,
serial_startup_prog
):
with
fluid
.
program_guard
(
serial_main_prog
,
serial_startup_prog
):
inputs_spec
=
self
.
data_spec
[
0
]
inputs_spec
=
self
.
inputs_spec
labels_spec
=
self
.
data_spec
[
1
]
labels_spec
=
self
.
labels_spec
if
self
.
labels_spec
else
[]
inputs
=
[
s
.
_create_feed_layer
()
for
s
in
to_list
(
inputs_spec
)]
inputs
=
[
s
.
_create_feed_layer
()
for
s
in
inputs_spec
]
labels
=
[
s
.
_create_feed_layer
()
for
s
in
to_list
(
labels_spec
)]
labels
=
[
s
.
_create_feed_layer
()
for
s
in
labels_spec
]
self
.
_input_vars
=
inputs
self
.
_label_vars
=
labels
self
.
_feed_vars
=
self
.
_input_vars
+
self
.
_label_vars
outputs
=
to_list
(
self
.
model
(
*
inputs
))
outputs
=
to_list
(
self
.
model
(
*
inputs
))
if
self
.
mode
!=
"predict"
and
self
.
loss
:
if
mode
!=
"predict"
and
self
.
_loss
:
loss
=
self
.
loss
(
*
(
outputs
+
labels
))
losses
=
to_list
(
self
.
_loss
(
*
(
outputs
+
labels
)))
self
.
_loss_var
=
loss
self
.
_feed_vars
[
mode
]
=
{
"inputs"
:
inputs
,
"labels"
:
labels
}
self
.
_fetch_vars
=
{
"outputs"
:
outputs
,
"loss"
:
loss
}
self
.
_serial_main_progs
[
self
.
mode
]
=
serial_main_prog
self
.
_fetch_vars
[
mode
]
=
{
self
.
_serial_startup_progs
[
self
.
mode
]
=
serial_startup_prog
"outputs"
:
outputs
,
self
.
_dist_contexts
[
self
.
mode
]
=
DistributedContext
(
"loss"
:
losses
,
serial_main_prog
,
serial_startup_prog
,
"metrics"
:
metrics
self
.
_dist_main_progs
[
self
.
mode
],
}
self
.
_dist_startup_progs
[
self
.
mode
])
self
.
_pass_contexts
[
self
.
mode
]
=
PassContext
()
self
.
_serial_main_progs
[
mode
]
=
serial_main_prog
self
.
_serial_startup_progs
[
mode
]
=
serial_startup_prog
def
_plan
(
self
):
self
.
_dist_contexts
[
mode
]
=
DistributedContext
(
serial_main_prog
,
serial_startup_prog
,
self
.
_dist_main_progs
[
mode
],
self
.
_dist_startup_progs
[
mode
])
self
.
_pass_contexts
[
mode
]
=
PassContext
()
def
_plan
(
self
,
mode
):
# Complete the distributed annotation
# Complete the distributed annotation
serial_main_prog
=
self
.
_serial_main_progs
[
self
.
mode
]
serial_main_prog
=
self
.
_serial_main_progs
[
mode
]
self
.
_completer
=
Completer
(
self
.
_dist_contexts
[
self
.
mode
])
self
.
_completer
=
Completer
(
self
.
_dist_contexts
[
mode
])
self
.
_completer
.
complete_forward_annotation
(
serial_main_prog
)
self
.
_completer
.
complete_forward_annotation
(
serial_main_prog
)
# TODO: add auto planner process
# TODO: add auto planner process
# parse forward sub block
# parse forward sub block
self
.
_dist_contexts
[
self
.
mode
].
block_state
.
parse_forward_blocks
(
self
.
_dist_contexts
[
mode
].
block_state
.
parse_forward_blocks
(
serial_main_prog
)
serial_main_prog
)
def
_parallel
(
self
,
rank
):
def
_parallel
(
self
,
mode
,
all_ranks
=
False
):
serial_main_program
=
self
.
_serial_main_progs
[
self
.
mode
]
if
not
all_ranks
:
serial_startup_program
=
self
.
_serial_startup_progs
[
self
.
mode
]
self
.
_parallel_program
(
mode
,
self
.
_cur_rank
)
dist_context
=
self
.
_dist_contexts
[
self
.
mode
]
else
:
if
self
.
mode
!=
"predict"
and
self
.
loss
:
world_process_group
=
get_world_process_group
()
all_ranks
=
world_process_group
.
ranks
for
rank
in
all_ranks
:
self
.
_parallel_program
(
mode
,
rank
)
def
_initialize
(
self
,
mode
):
# Traverse different rank programs and traverse each op of them,
# instantiate communication by process_mapping.
all_process_groups
=
get_all_process_groups
()
for
process_group
in
all_process_groups
:
if
self
.
_cur_rank
not
in
process_group
.
ranks
:
continue
process_group
.
instantiate
()
# initialize
self
.
_place
=
_get_device
()
if
isinstance
(
self
.
_place
,
fluid
.
CUDAPlace
):
self
.
_place
=
fluid
.
CUDAPlace
(
ParallelEnv
().
dev_id
)
if
self
.
_executor
is
None
:
self
.
_executor
=
paddle
.
static
.
Executor
(
self
.
_place
)
dist_startup_prog
=
self
.
_dist_startup_progs
[
mode
][
self
.
_cur_rank
]
self
.
_executor
.
run
(
dist_startup_prog
)
def
_parallel_program
(
self
,
mode
,
rank
):
serial_main_program
=
self
.
_serial_main_progs
[
mode
]
serial_startup_program
=
self
.
_serial_startup_progs
[
mode
]
dist_context
=
self
.
_dist_contexts
[
mode
]
if
mode
==
"train"
and
self
.
_optimizer
:
# Generate backward
# Generate backward
serial_loss
=
self
.
_
loss_var
serial_loss
=
self
.
_
fetch_vars
[
mode
][
"loss"
][
0
]
params_grads
=
self
.
_generate_backward
(
params_grads
=
self
.
_generate_backward
(
serial_main_program
,
serial_startup_program
,
serial_loss
)
serial_main_program
,
serial_startup_program
,
serial_loss
)
# Apply pre optimization passes
# Apply pre optimization passes
...
@@ -172,8 +192,23 @@ class Engine:
...
@@ -172,8 +192,23 @@ class Engine:
# Apply post optimization passes
# Apply post optimization passes
self
.
_apply_post_optimization
(
dist_main_prog
,
dist_startup_prog
,
self
.
_apply_post_optimization
(
dist_main_prog
,
dist_startup_prog
,
rank
,
dist_params_grads
)
rank
,
dist_params_grads
)
self
.
_dist_main_progs
[
self
.
mode
][
rank
]
=
dist_main_prog
else
:
self
.
_dist_startup_progs
[
self
.
mode
][
rank
]
=
dist_startup_prog
# Do logical partition
partitioner
=
Partitioner
(
dist_context
,
rank
)
dist_main_prog
,
dist_startup_prog
,
dist_params_grads
=
partitioner
.
partition
(
serial_main_program
,
serial_startup_program
,
[])
# Do reshard process
make_data_unshard
(
dist_main_prog
,
dist_startup_prog
,
dist_context
)
reshard
(
dist_main_prog
,
dist_startup_prog
,
rank
,
dist_context
,
[],
1
)
# clone program for test
if
mode
!=
'train'
:
dist_main_prog
=
dist_main_prog
.
clone
(
for_test
=
True
)
dist_startup_prog
=
dist_startup_prog
.
clone
(
for_test
=
True
)
self
.
_dist_main_progs
[
mode
][
rank
]
=
dist_main_prog
self
.
_dist_startup_progs
[
mode
][
rank
]
=
dist_startup_prog
def
_generate_backward
(
self
,
main_program
,
startup_program
,
loss
):
def
_generate_backward
(
self
,
main_program
,
startup_program
,
loss
):
with
program_guard
(
main_program
,
startup_program
):
with
program_guard
(
main_program
,
startup_program
):
...
@@ -187,7 +222,7 @@ class Engine:
...
@@ -187,7 +222,7 @@ class Engine:
def
_generate_optimizer
(
self
,
main_program
,
startup_program
,
params_grads
):
def
_generate_optimizer
(
self
,
main_program
,
startup_program
,
params_grads
):
with
program_guard
(
main_program
,
startup_program
):
with
program_guard
(
main_program
,
startup_program
):
optimizer_ops
=
copy
.
deepcopy
(
self
.
optimizer
).
apply_gradients
(
optimizer_ops
=
copy
.
deepcopy
(
self
.
_
optimizer
).
apply_gradients
(
params_grads
)
params_grads
)
self
.
_completer
.
complete_update_annotation
(
main_program
)
self
.
_completer
.
complete_update_annotation
(
main_program
)
return
optimizer_ops
return
optimizer_ops
...
@@ -239,42 +274,87 @@ class Engine:
...
@@ -239,42 +274,87 @@ class Engine:
[
main_program
],
[
startup_program
],
[
main_program
],
[
startup_program
],
self
.
_pass_contexts
[
self
.
mode
])
self
.
_pass_contexts
[
self
.
mode
])
def
fit
(
self
,
train_data
,
batch_size
=
1
,
epochs
=
1
,
steps_per_epoch
=
1000
):
def
fit
(
self
,
train_data
,
batch_size
=
1
,
epochs
=
1
,
steps_per_epoch
=
None
):
# TODO: callbacks
# TODO: evaluate after training
self
.
mode
=
'train'
assert
isinstance
(
train_data
,
Dataset
)
assert
isinstance
(
train_data
,
Dataset
)
assert
steps_per_epoch
is
not
None
train_dataloader
=
self
.
_create_dataloader
(
train_data
,
batch_size
,
train_dataloader
=
self
.
_create_dataloader
(
train_data
,
batch_size
,
epochs
,
steps_per_epoch
)
epochs
,
steps_per_epoch
)
self
.
_init_communication
()
dist_startup_prog
=
self
.
_dist_startup_progs
[
"train"
][
self
.
_cur_rank
]
outputs
=
[]
self
.
_executor
.
run
(
dist_startup_prog
)
for
epoch
in
range
(
epochs
):
for
epoch
in
range
(
epochs
):
# train_dataloader.start()
# for step in range(steps_per_epoch):
# logs = self.train_step(None)
# self._logger.info(logs)
# train_dataloader.reset()
for
step
,
data
in
enumerate
(
train_dataloader
):
for
step
,
data
in
enumerate
(
train_dataloader
):
logs
=
self
.
_train_step
(
data
)
logs
,
loss
=
self
.
_train_step
(
data
)
outputs
.
append
(
loss
)
train_logs
=
{
train_logs
=
{
"train_"
+
name
:
val
"train_"
+
name
:
val
for
name
,
val
in
logs
.
items
()
for
name
,
val
in
logs
.
items
()
}
}
self
.
_logger
.
info
(
train_logs
)
self
.
_logger
.
info
(
train_logs
)
return
outputs
def
predict
(
self
,
test_data
,
batch_size
=
1
,
use_program_cache
=
False
,
return_numpy
=
True
):
self
.
mode
=
'predict'
# TODO: need check dataset
test_dataloader
=
self
.
_create_dataloader
(
test_data
,
batch_size
)
outputs
=
[]
for
step
,
data
in
enumerate
(
test_dataloader
):
logs
,
outs
=
self
.
_predict_step
(
data
,
use_program_cache
,
return_numpy
)
outputs
.
append
(
outs
)
predict_logs
=
{
"predict_"
+
name
:
val
for
name
,
val
in
logs
.
items
()
}
self
.
_logger
.
info
(
predict_logs
)
return
outputs
def
_train_step
(
self
,
data
):
def
_train_step
(
self
,
data
):
logs
=
{}
logs
=
{}
dist_main_prog
=
self
.
_dist_main_progs
[
"train"
][
self
.
_cur_rank
]
dist_main_prog
=
self
.
_dist_main_progs
[
self
.
mode
][
self
.
_cur_rank
]
if
self
.
_loss_var
.
name
not
in
dist_main_prog
.
global_block
().
vars
:
fetch_var
=
self
.
_fetch_vars
[
self
.
mode
][
"loss"
][
0
]
if
fetch_var
.
name
not
in
dist_main_prog
.
global_block
().
vars
:
loss
=
self
.
_executor
.
run
(
dist_main_prog
)
loss
=
self
.
_executor
.
run
(
dist_main_prog
)
logs
[
"loss"
]
=
None
logs
[
"loss"
]
=
None
else
:
else
:
fetch_list
=
self
.
_loss_var
loss
=
self
.
_executor
.
run
(
dist_main_prog
,
loss
=
self
.
_executor
.
run
(
dist_main_prog
,
fetch_list
=
fetch_list
)
fetch_list
=
to_list
(
fetch_var
)
)
logs
[
"loss"
]
=
loss
logs
[
"loss"
]
=
loss
return
logs
return
logs
,
loss
def
_predict_step
(
self
,
data
,
use_program_cache
=
False
,
return_numpy
=
True
):
logs
=
{}
dist_main_prog
=
self
.
_dist_main_progs
[
self
.
mode
][
self
.
_cur_rank
]
fetch_var
=
[]
for
var
in
self
.
_fetch_vars
[
self
.
mode
][
"outputs"
]:
if
var
.
name
in
dist_main_prog
.
global_block
().
vars
:
fetch_var
.
append
(
var
)
if
fetch_var
is
[]:
outs
=
self
.
_executor
.
run
(
dist_main_prog
,
use_program_cache
=
use_program_cache
)
logs
[
"pred"
]
=
outs
else
:
outs
=
self
.
_executor
.
run
(
dist_main_prog
,
fetch_list
=
fetch_var
,
use_program_cache
=
use_program_cache
,
return_numpy
=
return_numpy
)
logs
[
"pred"
]
=
outs
return
logs
,
outs
def
_create_dataloader
(
self
,
dataset
,
batch_size
,
epochs
,
steps_per_epoch
):
def
_create_dataloader
(
self
,
feed_list
=
self
.
_input_vars
+
self
.
_label_vars
dataset
,
batch_size
,
epochs
=
1
,
steps_per_epoch
=
None
):
feed_list
=
self
.
_feed_vars
[
self
.
mode
][
"inputs"
]
+
self
.
_feed_vars
[
self
.
mode
][
"labels"
]
dist_main_prog
=
self
.
_dist_main_progs
[
self
.
mode
][
self
.
_cur_rank
]
dist_main_prog
=
self
.
_dist_main_progs
[
self
.
mode
][
self
.
_cur_rank
]
dist_startup_prog
=
self
.
_dist_startup_progs
[
self
.
mode
][
self
.
_cur_rank
]
dist_startup_prog
=
self
.
_dist_startup_progs
[
self
.
mode
][
self
.
_cur_rank
]
dist_context
=
self
.
_dist_contexts
[
self
.
mode
]
dist_context
=
self
.
_dist_contexts
[
self
.
mode
]
...
@@ -284,8 +364,15 @@ class Engine:
...
@@ -284,8 +364,15 @@ class Engine:
op_size
=
len
(
dist_main_block
.
ops
)
op_size
=
len
(
dist_main_block
.
ops
)
places
=
paddle
.
static
.
cuda_places
()
places
=
paddle
.
static
.
cuda_places
()
with
fluid
.
program_guard
(
dist_main_prog
,
dist_startup_prog
):
with
fluid
.
program_guard
(
dist_main_prog
,
dist_startup_prog
):
inputs
=
self
.
_feed_vars
[
self
.
mode
][
"inputs"
]
dataloader
=
NonIterableGeneratorLoader
(
dataloader
=
NonIterableGeneratorLoader
(
dataset
,
feed_list
,
places
,
batch_size
,
epochs
,
steps_per_epoch
)
dataset
,
feed_list
,
places
,
batch_size
,
epochs
,
steps_per_epoch
,
inputs
=
inputs
)
new_op_size
=
len
(
dist_main_block
.
ops
)
new_op_size
=
len
(
dist_main_block
.
ops
)
for
_
in
range
(
new_op_size
-
1
,
op_size
-
1
,
-
1
):
for
_
in
range
(
new_op_size
-
1
,
op_size
-
1
,
-
1
):
op
=
dist_main_block
.
ops
[
new_op_size
-
1
]
op
=
dist_main_block
.
ops
[
new_op_size
-
1
]
...
@@ -312,17 +399,49 @@ class Engine:
...
@@ -312,17 +399,49 @@ class Engine:
dist_main_block
.
_sync_with_cpp
()
dist_main_block
.
_sync_with_cpp
()
return
dataloader
return
dataloader
def
_init_communication
(
self
):
def
_validate_spec
(
self
,
specs
):
# Traverse different rank programs and traverse each op of them,
specs
=
to_list
(
specs
)
# instantiate communication by process_mapping.
if
specs
is
not
None
:
all_process_groups
=
get_all_process_groups
()
for
i
,
spec
in
enumerate
(
specs
):
for
process_group
in
all_process_groups
:
assert
isinstance
(
spec
,
InputSpec
)
if
self
.
_cur_rank
not
in
process_group
.
ranks
:
if
spec
.
name
is
None
:
continue
raise
ValueError
(
process_group
.
instantiate
()
"Requires Input[{}].name != None, but receive `None` with {}."
.
format
(
i
,
spec
))
return
specs
def
save
(
self
,
path
,
training
=
True
,
mode
=
None
):
if
not
mode
:
mode
=
self
.
mode
if
training
:
assert
'train'
in
self
.
_serial_main_progs
,
"training model is not ready, please call `engine.prepare(mode='train')` first."
serial_program
=
self
.
_serial_main_progs
[
"train"
]
dist_main_prog
=
self
.
_dist_main_progs
[
"train"
][
self
.
_cur_rank
]
dist_context
=
self
.
_dist_contexts
[
"train"
]
self
.
_saver
.
save
(
path
,
serial_program
=
serial_program
,
dist_main_program
=
dist_main_prog
,
dist_context
=
dist_context
)
else
:
assert
mode
,
"Please set the 'mode' you want to save."
feed_vars
=
self
.
_feed_vars
[
mode
][
'inputs'
]
fetch_vars
=
self
.
_fetch_vars
[
mode
][
'outputs'
]
dist_main_prog
=
self
.
_dist_main_progs
[
mode
][
self
.
_cur_rank
]
self
.
_saver
.
save_inference_model
(
path
,
feed_vars
,
fetch_vars
,
self
.
_executor
,
program
=
dist_main_prog
)
# def save(self, path, training=True):
def
load
(
self
,
path
,
strict
=
True
,
load_optimizer
=
True
,
mode
=
None
):
# pass
if
not
mode
:
mode
=
self
.
mode
assert
mode
,
"Please set the 'mode' you want to load."
# def load(self, path, strict=True, load_optimizer=True):
dist_main_prog
=
self
.
_dist_main_progs
[
mode
][
self
.
_cur_rank
]
# pass
dist_context
=
self
.
_dist_contexts
[
mode
]
self
.
_saver
.
load
(
path
,
dist_main_prog
,
dist_context
,
strict
,
load_optimizer
)
python/paddle/distributed/auto_parallel/utils.py
...
@@ -1416,3 +1416,11 @@ def set_dist_op_desc_original_id(dist_op_desc, op_desc, dist_context):
     # Third, print error infomation if we cannot find the original id
     else:
         assert False, "Cannot find the original id in the distributed context"
+
+
+def to_list(value):
+    if value is None:
+        return value
+    if isinstance(value, (list, tuple)):
+        return list(value)
+    return [value]
python/paddle/fluid/dygraph/varbase_patch_methods.py
...
@@ -797,6 +797,34 @@ def monkey_patch_varbase():
...
@@ -797,6 +797,34 @@ def monkey_patch_varbase():
def
value
(
self
):
def
value
(
self
):
return
self
return
self
@
framework
.
dygraph_only
def
_slice
(
self
,
begin_idx
,
end_idx
):
return
core
.
eager
.
Tensor
(
self
.
get_tensor
().
_slice
(
begin_idx
,
end_idx
))
@
framework
.
dygraph_only
def
_numel
(
self
):
return
self
.
get_tensor
().
_numel
()
@
framework
.
dygraph_only
def
cpu
(
self
):
if
self
.
place
.
is_cpu_place
():
return
self
else
:
res
=
self
.
_copy_to
(
core
.
CPUPlace
(),
True
)
res
.
stop_gradient
=
self
.
stop_gradient
res
.
persistable
=
self
.
persistable
return
res
@
framework
.
dygraph_only
def
cuda
(
self
,
device_id
,
blocking
):
if
self
.
place
.
is_gpu_place
():
return
self
else
:
res
=
self
.
_copy_to
(
core
.
CUDAPlace
(
device_id
),
True
)
res
.
stop_gradient
=
self
.
stop_gradient
res
.
persistable
=
self
.
persistable
return
res
if
core
.
_in_eager_mode
()
and
not
hasattr
(
core
,
"eager"
):
if
core
.
_in_eager_mode
()
and
not
hasattr
(
core
,
"eager"
):
return
return
...
@@ -820,6 +848,10 @@ def monkey_patch_varbase():
...
@@ -820,6 +848,10 @@ def monkey_patch_varbase():
setattr
(
core
.
eager
.
Tensor
,
"_set_grad_ivar"
,
_set_grad_ivar
)
setattr
(
core
.
eager
.
Tensor
,
"_set_grad_ivar"
,
_set_grad_ivar
)
setattr
(
core
.
eager
.
Tensor
,
"clone"
,
clone
)
setattr
(
core
.
eager
.
Tensor
,
"clone"
,
clone
)
setattr
(
core
.
eager
.
Tensor
,
"value"
,
value
)
setattr
(
core
.
eager
.
Tensor
,
"value"
,
value
)
setattr
(
core
.
eager
.
Tensor
,
"cpu"
,
cpu
)
setattr
(
core
.
eager
.
Tensor
,
"cuda"
,
cuda
)
setattr
(
core
.
eager
.
Tensor
,
"_slice"
,
_slice
)
setattr
(
core
.
eager
.
Tensor
,
"_numel"
,
_numel
)
else
:
else
:
setattr
(
core
.
VarBase
,
"__name__"
,
"Tensor"
)
setattr
(
core
.
VarBase
,
"__name__"
,
"Tensor"
)
setattr
(
core
.
VarBase
,
"grad"
,
grad
)
setattr
(
core
.
VarBase
,
"grad"
,
grad
)
...
...
python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
...
@@ -108,10 +108,8 @@ def train():
...
@@ -108,10 +108,8 @@ def train():
grad_clip
=
None
)
grad_clip
=
None
)
dataset
=
MyDataset
(
batch_num
*
batch_size
)
dataset
=
MyDataset
(
batch_num
*
batch_size
)
data_spec
=
[
inputs_spec
=
InputSpec
([
batch_size
,
hidden_size
],
'float32'
,
'x'
)
InputSpec
([
batch_size
,
hidden_size
],
'float32'
,
'x'
),
labels_spec
=
InputSpec
([
batch_size
],
'int64'
,
'label'
)
InputSpec
([
batch_size
],
'int64'
,
'label'
)
]
dist_strategy
=
fleet
.
DistributedStrategy
()
dist_strategy
=
fleet
.
DistributedStrategy
()
dist_strategy
.
amp
=
False
dist_strategy
.
amp
=
False
...
@@ -121,11 +119,18 @@ def train():
...
@@ -121,11 +119,18 @@ def train():
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)

-    engine = Engine(mlp, data_spec, strategy=dist_strategy)
+    engine = Engine(mlp,
+                    inputs_spec=inputs_spec,
+                    labels_spec=labels_spec,
+                    strategy=dist_strategy)
    engine.prepare(optimizer, loss)
    engine.fit(dataset,
               batch_size=batch_size,
               steps_per_epoch=batch_num * batch_size)
+    engine.save('./mlp')
+    engine.load('./mlp')
+    engine.save('./mlp_inf', training=False, mode='predict')


if __name__ == "__main__":
...
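Taken together, the two hunks above switch the Engine test from a single data_spec list to separate inputs_spec/labels_spec keyword arguments and add save/load coverage. A condensed recap of the resulting call sequence (it reuses the names defined in the test above, so it is a sketch rather than a standalone script):

inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x')
labels_spec = InputSpec([batch_size], 'int64', 'label')

engine = Engine(mlp, inputs_spec=inputs_spec, labels_spec=labels_spec,
                strategy=dist_strategy)
engine.prepare(optimizer, loss)
engine.fit(dataset, batch_size=batch_size,
           steps_per_epoch=batch_num * batch_size)
engine.save('./mlp')                                      # training checkpoint
engine.load('./mlp')
engine.save('./mlp_inf', training=False, mode='predict')  # inference program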
python/paddle/fluid/tests/unittests/auto_parallel/engine_predict_api.py
0 → 100644
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import time
import paddle.fluid as fluid
import copy
import os
import numpy as np
import subprocess
import paddle
import paddle.nn as nn
import paddle.fluid as fluid
import paddle.static as static
import paddle.nn.functional as F
import paddle.utils as utils
from paddle.fluid import layers
from paddle.io import Dataset, IterableDataset, DataLoader
from paddle.static import InputSpec
from paddle.distributed import fleet
import paddle.distributed.auto_parallel as auto
from paddle.distributed.auto_parallel.engine import Engine

paddle.enable_static()
global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
batch_size = 1
batch_num = 10
hidden_size = 1024
image_size = hidden_size
paddle.seed(44)


class MyDataset(Dataset):
    def __init__(self, num_samples):
        super(MyDataset, self).__init__()
        self.num_samples = num_samples

    def __getitem__(self, index):
        input = np.random.uniform(size=image_size).astype("float32")
        return input

    def __len__(self):
        return self.num_samples


class MLPLayer(nn.Layer):
    def __init__(self,
                 hidden_size=1024,
                 intermediate_size=4 * 1024,
                 dropout_ratio=0.1,
                 initializer_range=0.02):
        super(MLPLayer, self).__init__()
        d_model = hidden_size
        dim_feedforward = intermediate_size
        weight_attr = paddle.ParamAttr(
            initializer=nn.initializer.Normal(
                mean=0.0, std=initializer_range))
        bias_attr = None

        self.linear0 = nn.Linear(
            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
        self.linear1 = nn.Linear(
            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
        self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr)
        self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
        self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train")

    def forward(self, input):
        out = self.norm(input)
        out = self.linear0(input)
        auto.shard_tensor(
            self.linear0.weight,
            dist_attr={
                "process_mesh": global_process_mesh,
                "dims_mapping": [-1, 0]
            })
        out = F.gelu(out, approximate=True)
        out = self.linear1(out)
        auto.shard_tensor(
            self.linear1.weight,
            dist_attr={
                "process_mesh": global_process_mesh,
                "dims_mapping": [0, -1]
            })
        out = self.dropout(out)
        out = self.linear2(out)
        return out


def train():
    mlp = MLPLayer(
        hidden_size=hidden_size,
        intermediate_size=4 * hidden_size,
        dropout_ratio=0.1,
        initializer_range=0.02)

    dataset = MyDataset(batch_num * batch_size)
    inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x')

    dist_strategy = fleet.DistributedStrategy()
    # init parallel optimizer
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)

    engine = Engine(mlp, inputs_spec=inputs_spec, strategy=dist_strategy)
    engine.prepare(mode='predict')
    engine.predict(dataset, batch_size=batch_size)


if __name__ == "__main__":
    train()
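Like engine_api.py, this new script assumes two ranks (process mesh [0, 1]) and is meant to run under the distributed launcher rather than directly; the companion test below spawns it on GPUs 0 and 1, which is roughly equivalent to running `python -m paddle.distributed.launch --gpus 0,1 engine_predict_api.py` (the exact launcher invocation is the test's, not a documented entry point).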
python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py
...
@@ -42,6 +42,34 @@ class TestEngineAPI(unittest.TestCase):
        log_path = os.path.join(file_dir, "log")
        if os.path.exists(log_path):
            shutil.rmtree(log_path)
+        files_path = [path for path in os.listdir('.') if '.pd' in path]
+        for path in files_path:
+            if os.path.exists(path):
+                os.remove(path)
+        if os.path.exists('rank_mapping.csv'):
+            os.remove('rank_mapping.csv')
+
+    def test_engine_predict(self):
+        file_dir = os.path.dirname(os.path.abspath(__file__))
+        launch_model_path = os.path.join(file_dir, "engine_predict_api.py")
+
+        if os.environ.get("WITH_COVERAGE", "OFF") == "ON":
+            coverage_args = ["-m", "coverage", "run", "--branch", "-p"]
+        else:
+            coverage_args = []
+
+        cmd = [sys.executable, "-u"] + coverage_args + [
+            "-m", "launch", "--gpus", "0,1", launch_model_path
+        ]
+        process = subprocess.Popen(cmd)
+        process.wait()
+        self.assertEqual(process.returncode, 0)
+
+        # Remove unnecessary files
+        log_path = os.path.join(file_dir, "log")
+        if os.path.exists(log_path):
+            shutil.rmtree(log_path)

if __name__ == "__main__":
...
python/paddle/fluid/tests/unittests/test_egr_python_api.py
...
@@ -634,20 +634,39 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
            if core.is_compiled_with_cuda():
                tensor3 = tensor2._copy_to(core.CUDAPlace(0), True)
                self.assertTrue(np.array_equal(tensor3.numpy(), arr2))
-                self.assertTrue(tensor3.persistable, True)
-                self.assertTrue(tensor3.stop_gradient, True)
+                self.assertEqual(tensor3.persistable, True)
+                self.assertEqual(tensor3.stop_gradient, True)
                self.assertTrue(tensor3.place.is_gpu_place())
-                tensor4 = paddle.to_tensor([1, 2, 3], place='gpu_pinned')
-                tensor5 = tensor4._copy_to(core.CUDAPlace(0), True)
+
+                tensor4 = tensor2.cuda(0, True)
+                self.assertTrue(np.array_equal(tensor4.numpy(), arr2))
+                self.assertEqual(tensor4.persistable, True)
+                self.assertEqual(tensor4.stop_gradient, False)
+                self.assertTrue(tensor4.place.is_gpu_place())
+
+                tensor5 = tensor4.cpu()
+                self.assertTrue(np.array_equal(tensor5.numpy(), arr2))
+                self.assertEqual(tensor5.persistable, True)
+                self.assertEqual(tensor5.stop_gradient, False)
+                self.assertTrue(tensor5.place.is_cpu_place())
+
+                tensor10 = paddle.to_tensor([1, 2, 3], place='gpu_pinned')
+                tensor11 = tensor10._copy_to(core.CUDAPlace(0), True)
                self.assertTrue(
-                    np.array_equal(tensor4.numpy(), tensor5.numpy()))
+                    np.array_equal(tensor10.numpy(), tensor11.numpy()))
            else:
                tensor3 = tensor2._copy_to(core.CPUPlace(), True)
                self.assertTrue(np.array_equal(tensor3.numpy(), arr2))
-                self.assertTrue(tensor3.persistable, True)
-                self.assertTrue(tensor3.stop_gradient, True)
+                self.assertEqual(tensor3.persistable, True)
+                self.assertEqual(tensor3.stop_gradient, True)
                self.assertTrue(tensor3.place.is_cpu_place())
+
+                tensor4 = tensor2.cpu()
+                self.assertTrue(np.array_equal(tensor4.numpy(), arr2))
+                self.assertEqual(tensor4.persistable, True)
+                self.assertEqual(tensor4.stop_gradient, False)
+                self.assertTrue(tensor4.place.is_cpu_place())

    def test_share_buffer_to(self):
        with _test_eager_guard():
            arr = np.ones([4, 16, 16, 32]).astype('float32')
...
@@ -784,6 +803,34 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
            self.assertEqual(egr_tensor.shape, [4, 16, 16, 32])
            self.assertTrue(np.array_equal(egr_tensor.numpy(), new_arr))

+    def test_sharding_related_api(self):
+        with _test_eager_guard():
+            arr0 = np.random.rand(4, 16, 16, 32).astype('float32')
+            egr_tensor1 = core.eager.Tensor(arr0, core.CPUPlace(), True, False,
+                                            "numpy_tensor1", False)
+            self.assertEqual(egr_tensor1._numel(), 32768)
+            self.assertEqual(egr_tensor1._slice(0, 2)._numel(), 16384)
+
+    def test_copy_gradient_from(self):
+        with _test_eager_guard():
+            np_x = np.random.random((2, 2))
+            np_y = np.random.random((2, 2))
+            x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
+            y = paddle.to_tensor(np_y, dtype="float64")
+            out = x + x
+            out.backward()
+            x._copy_gradient_from(y)
+            self.assertTrue(np.array_equal(x.grad.numpy(), np_y))
+
+    def test_clear(self):
+        with _test_eager_guard():
+            np_x = np.random.random((3, 8, 8))
+            x = paddle.to_tensor(np_x, dtype="float64")
+            self.assertTrue(x._is_initialized())
+            x._clear()
+            self.assertFalse(x._is_initialized())
+

class EagerParamBaseUsageTestCase(unittest.TestCase):
    def test_print(self):
...
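The new test_sharding_related_api case above pins down the behaviour of the freshly patched _numel() and _slice() helpers. A condensed sketch of the same checks (shapes mirror the test; here the tensor is built with paddle.to_tensor instead of the core.eager.Tensor constructor, which is an assumption of this sketch):

import numpy as np
import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    arr = np.random.rand(4, 16, 16, 32).astype('float32')
    t = paddle.to_tensor(arr)
    assert t._numel() == 4 * 16 * 16 * 32                 # 32768 elements in total
    assert t._slice(0, 2)._numel() == 2 * 16 * 16 * 32    # first two entries of dim 0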
python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py
...
@@ -170,6 +170,180 @@ class TestDygraphInplace(unittest.TestCase):
                grad_var_a = var_a.grad.numpy()

        self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a))
+    # inplace + hook
+    def test_backward_success_3(self):
+        # var_b is modified inplace before using it, the inplace operator doesn't result
+        # in incorrect gradient computation.
+        def double_hook(grad):
+            grad = grad * 2
+            return grad
+
+        grad_var_a, grad_var_a_inplace = 0, 1
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
+                var_a.stop_gradient = False
+                helper = var_a.register_hook(double_hook)
+
+                var_b = var_a**2
+                var_c = self.inplace_api_processing(
+                    var_b)  # var_b is modified inplace before using it
+
+                # Here, the gradient computation will use the value of var_b
+                var_d = var_c**2
+                loss = var_d.sum()
+                loss.backward()
+                grad_var_a_inplace = var_a.grad.numpy()
+
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
+                var_a.stop_gradient = False
+                helper = var_a.register_hook(double_hook)
+
+                var_b = var_a**2
+                var_c = self.non_inplace_api_processing(var_b)
+                var_d = var_c**2
+                loss = var_d.sum()
+                loss.backward()
+                grad_var_a = var_a.grad.numpy()
+
+        self.assertTrue(self.np_compare(grad_var_a_inplace, grad_var_a))
+
+    # inplace + hook
+    def test_backward_success_4(self):
+        # Although var_b is modified inplace after using it, it is not used in gradient computation.
+        # The inplace operator doesn't result in incorrect gradient computation.
+        def double_hook(grad):
+            grad = grad * 2
+            return grad
+
+        grad_var_a, grad_var_a_inplace = 0, 1
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
+                var_a.stop_gradient = False
+                var_a.register_hook(double_hook)
+
+                var_b = var_a**2
+                var_c = self.inplace_api_processing(
+                    var_b)  # var_b is modified inplace before using it
+
+                var_d = var_c + var_c  # Here, the grad op of sum doesn't use the value of var_b
+                loss = var_d.sum()
+                loss.backward()
+                grad_var_a_inplace = var_a.grad.numpy()
+
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
+                var_a.stop_gradient = False
+                var_a.register_hook(double_hook)
+
+                var_b = var_a**2
+                var_c = self.non_inplace_api_processing(
+                    var_b)  # var_b is modified inplace before using it
+
+                var_d = var_c + var_c  # Here, the grad op of sum doesn't use the value of var_b
+                loss = var_d.sum()
+                loss.backward()
+                grad_var_a = var_a.grad.numpy()
+
+        self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a))
+
+    # inplace + hook
+    def test_backward_success_5(self):
+        # var_b is modified inplace before using it, the inplace operator doesn't result
+        # in incorrect gradient computation.
+        def double_hook(grad):
+            grad = grad * 2
+            return grad
+
+        grad_var_a, grad_var_a_inplace = 0, 1
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
+                var_a.stop_gradient = False
+
+                var_b = var_a**2
+                var_b.register_hook(double_hook)
+                var_c = self.inplace_api_processing(
+                    var_b)  # var_b is modified inplace before using it
+
+                # Here, the gradient computation will use the value of var_b
+                var_d = var_c**2
+                loss = var_d.sum()
+                loss.backward()
+                grad_var_a_inplace = var_a.grad.numpy()
+
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
+                var_a.stop_gradient = False
+
+                var_b = var_a**2
+                var_b.register_hook(double_hook)
+                var_c = self.non_inplace_api_processing(var_b)
+                var_d = var_c**2
+                loss = var_d.sum()
+                loss.backward()
+                grad_var_a = var_a.grad.numpy()
+
+        self.assertTrue(self.np_compare(grad_var_a_inplace, grad_var_a))
+
+    # inplace + hook
+    def test_backward_success_6(self):
+        # Although var_b is modified inplace before using it, it is not used in gradient computation.
+        # The inplace operator doesn't result in incorrect gradient computation.
+        def double_hook(grad):
+            grad = grad * 2
+            return grad
+
+        grad_var_a, grad_var_a_inplace = 0, 1
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
+                var_a.stop_gradient = False
+
+                var_b = var_a**2
+                var_b.register_hook(double_hook)
+                var_c = self.inplace_api_processing(
+                    var_b)  # var_b is modified inplace before using it
+
+                var_d = var_c + var_c  # Here, the grad op of sum doesn't use the value of var_b
+                loss = var_d.sum()
+                loss.backward()
+                grad_var_a_inplace = var_a.grad.numpy()
+
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
+                var_a.stop_gradient = False
+
+                var_b = var_a**2
+                var_b.register_hook(double_hook)
+                var_c = self.non_inplace_api_processing(
+                    var_b)  # var_b is modified inplace before using it
+
+                var_d = var_c + var_c  # Here, the grad op of sum doesn't use the value of var_b
+                loss = var_d.sum()
+                loss.backward()
+                grad_var_a = var_a.grad.numpy()
+
+        self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a))
+
class TestDygraphInplaceUnsqueeze(TestDygraphInplace):
    def non_inplace_api_processing(self, var):
...
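All four new cases follow the same pattern: register a gradient hook, apply an in-place API to an intermediate tensor, and check that backward matches the out-of-place variant. A minimal standalone sketch of that pattern under eager mode (using paddle.unsqueeze_ as the in-place op, the API covered by the TestDygraphInplaceUnsqueeze subclass; input values are illustrative):

import numpy as np
import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    var_a = paddle.to_tensor(np.random.rand(2, 3).astype('float32'))
    var_a.stop_gradient = False
    var_a.register_hook(lambda grad: grad * 2)   # doubles the gradient reaching var_a

    var_b = var_a**2
    var_c = paddle.unsqueeze_(var_b, axis=0)     # in-place: var_c shares var_b's storage
    var_d = var_c**2

    var_d.sum().backward()
    print(var_a.grad.numpy())                    # 2 * 4 * var_a**3, thanks to the hook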