Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
b106c424
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
b106c424
编写于
9月 27, 2022
作者:
W
wanghuancoder
提交者:
GitHub
9月 27, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Eager] refine gil use (#46452)
* refine gil use
上级
a02eb143
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
487 addition
and
428 deletion
+487
-428
paddle/fluid/eager/pylayer/py_layer_node.cc
paddle/fluid/eager/pylayer/py_layer_node.cc
+5
-0
paddle/fluid/eager/pylayer/py_layer_node.h
paddle/fluid/eager/pylayer/py_layer_node.h
+1
-1
paddle/fluid/pybind/eager_functions.cc
paddle/fluid/pybind/eager_functions.cc
+420
-391
paddle/fluid/pybind/eager_method.cc
paddle/fluid/pybind/eager_method.cc
+61
-36
未找到文件。
paddle/fluid/eager/pylayer/py_layer_node.cc
浏览文件 @
b106c424
...
...
@@ -27,6 +27,11 @@
#include "pybind11/pytypes.h"
namespace
egr
{
GradNodePyLayer
::~
GradNodePyLayer
()
{
pybind11
::
gil_scoped_acquire
gil
;
Py_XDECREF
(
ctx_
);
}
paddle
::
small_vector
<
std
::
vector
<
paddle
::
experimental
::
Tensor
>
,
kSlotSmallVectorSize
>
GradNodePyLayer
::
operator
()(
...
...
paddle/fluid/eager/pylayer/py_layer_node.h
浏览文件 @
b106c424
...
...
@@ -34,7 +34,7 @@ class GradNodePyLayer : public GradNodeBase {
Py_INCREF
(
ctx_
);
}
~
GradNodePyLayer
()
override
{
Py_XDECREF
(
ctx_
);
}
;
~
GradNodePyLayer
()
override
;
virtual
paddle
::
small_vector
<
std
::
vector
<
paddle
::
experimental
::
Tensor
>
,
kSlotSmallVectorSize
>
...
...
paddle/fluid/pybind/eager_functions.cc
浏览文件 @
b106c424
...
...
@@ -107,12 +107,18 @@ static PyObject* eager_api_scale(PyObject* self,
PyObject
*
kwargs
)
{
EAGER_TRY
// TODO(jiabin): Sync Tensor and Variable here when we support
paddle
::
experimental
::
Tensor
ret
=
egr
::
scale
(
reinterpret_cast
<
TensorObject
*>
(
PyTuple_GET_ITEM
(
args
,
0
))
->
tensor
,
CastPyArg2AttrFloat
(
PyTuple_GET_ITEM
(
args
,
1
),
1
),
CastPyArg2AttrFloat
(
PyTuple_GET_ITEM
(
args
,
2
),
2
),
CastPyArg2AttrBoolean
(
PyTuple_GET_ITEM
(
args
,
3
),
3
),
CastPyArg2AttrBoolean
(
PyTuple_GET_ITEM
(
args
,
4
),
4
));
auto
&
tensor
=
reinterpret_cast
<
TensorObject
*>
(
PyTuple_GET_ITEM
(
args
,
0
))
->
tensor
;
float
scale
=
CastPyArg2AttrFloat
(
PyTuple_GET_ITEM
(
args
,
1
),
1
);
float
bias
=
CastPyArg2AttrFloat
(
PyTuple_GET_ITEM
(
args
,
2
),
2
);
bool
bias_after_scale
=
CastPyArg2AttrBoolean
(
PyTuple_GET_ITEM
(
args
,
3
),
3
);
bool
trace_backward
=
CastPyArg2AttrBoolean
(
PyTuple_GET_ITEM
(
args
,
4
),
4
);
paddle
::
experimental
::
Tensor
ret
;
{
eager_gil_scoped_release
guard
;
ret
=
egr
::
scale
(
tensor
,
scale
,
bias
,
bias_after_scale
,
trace_backward
);
}
return
ToPyObject
(
ret
);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
...
...
@@ -123,11 +129,10 @@ static PyObject* eager_api_run_backward(PyObject* self,
EAGER_TRY
auto
tensors
=
CastPyArg2VectorOfTensor
(
PyTuple_GET_ITEM
(
args
,
0
),
0
);
auto
grad_tensors
=
CastPyArg2VectorOfTensor
(
PyTuple_GET_ITEM
(
args
,
1
),
1
);
bool
retain_graph
=
CastPyArg2AttrBoolean
(
PyTuple_GET_ITEM
(
args
,
2
),
2
);
{
eager_gil_scoped_release
guard
;
egr
::
Backward
(
tensors
,
grad_tensors
,
CastPyArg2AttrBoolean
(
PyTuple_GET_ITEM
(
args
,
2
),
2
));
egr
::
Backward
(
tensors
,
grad_tensors
,
retain_graph
);
}
RETURN_PY_NONE
EAGER_CATCH_AND_THROW_RETURN_NULL
...
...
@@ -156,8 +161,8 @@ static PyObject* eager_api_run_partial_grad(PyObject* self,
only_inputs
,
allow_unused
,
no_grad_vars
);
VLOG
(
1
)
<<
" in eager_api_run_partial_grad, after runing egr::Grad"
;
}
VLOG
(
1
)
<<
" in eager_api_run_partial_grad, after runing egr::Grad"
;
return
ToPyObject
(
result
,
true
/* return_py_none_if_not_initialize */
);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
...
...
@@ -173,11 +178,14 @@ static PyObject* eager_api_tensor_copy(PyObject* self,
auto
place
=
CastPyArg2Place
(
PyTuple_GET_ITEM
(
args
,
2
),
2
);
bool
blocking
=
CastPyArg2AttrBoolean
(
PyTuple_GET_ITEM
(
args
,
3
),
3
);
dst
=
src
.
copy_to
(
place
,
blocking
);
egr
::
EagerUtils
::
autograd_meta
(
&
dst
)
->
SetStopGradient
(
egr
::
EagerUtils
::
autograd_meta
(
&
(
src
))
->
StopGradient
());
egr
::
EagerUtils
::
autograd_meta
(
&
dst
)
->
SetPersistable
(
egr
::
EagerUtils
::
autograd_meta
(
&
(
src
))
->
Persistable
());
{
eager_gil_scoped_release
guard
;
dst
=
src
.
copy_to
(
place
,
blocking
);
egr
::
EagerUtils
::
autograd_meta
(
&
dst
)
->
SetStopGradient
(
egr
::
EagerUtils
::
autograd_meta
(
&
(
src
))
->
StopGradient
());
egr
::
EagerUtils
::
autograd_meta
(
&
dst
)
->
SetPersistable
(
egr
::
EagerUtils
::
autograd_meta
(
&
(
src
))
->
Persistable
());
}
RETURN_PY_NONE
EAGER_CATCH_AND_THROW_RETURN_NULL
}
...
...
@@ -378,7 +386,11 @@ static PyObject* eager_api_jit_function_call(PyObject* self,
CastPyArg2JitFunction
(
PyTuple_GET_ITEM
(
args
,
0
),
0
);
std
::
vector
<
paddle
::
experimental
::
Tensor
>
ins
=
CastPyArg2VectorOfTensor
(
PyTuple_GET_ITEM
(
args
,
1
),
1
);
std
::
vector
<
paddle
::
experimental
::
Tensor
>
outs
=
(
*
function
)(
ins
);
std
::
vector
<
paddle
::
experimental
::
Tensor
>
outs
;
{
eager_gil_scoped_release
guard
;
outs
=
(
*
function
)(
ins
);
}
return
ToPyObject
(
outs
);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
...
...
@@ -391,116 +403,120 @@ static PyObject* eager_api_run_costum_op(PyObject* self,
CastPyArg2CustomOpKernelContext
(
PyTuple_GET_ITEM
(
args
,
0
),
0
);
std
::
string
op_type
=
CastPyArg2AttrString
(
PyTuple_GET_ITEM
(
args
,
1
),
1
);
bool
trace_backward
=
CastPyArg2AttrBoolean
(
PyTuple_GET_ITEM
(
args
,
2
),
2
);
VLOG
(
7
)
<<
"Get things for python for Custom Op: "
<<
op_type
<<
", trace_backward is: "
<<
trace_backward
;
auto
meta_info_map
=
egr
::
Controller
::
Instance
().
GetOpMetaInfoMap
();
PADDLE_ENFORCE_NE
(
meta_info_map
.
find
(
op_type
),
meta_info_map
.
end
(),
paddle
::
platform
::
errors
::
NotFound
(
"Can't find %s in Eager OpMetaInfoMap which should be "
"created by LoadOpMetaInfoAndRegisterOp, please make "
"sure you registered your op first and try again. "
,
op_type
));
VLOG
(
7
)
<<
"Run Kernel of Custom Op: "
<<
op_type
;
std
::
vector
<
paddle
::
any
>
res_attrs
=
CastAttrsToTragetType
(
ctx
.
Attrs
(),
paddle
::
framework
::
OpMetaInfoHelper
::
GetAttrs
(
meta_info_map
.
at
(
op_type
)[
0
]));
ctx
.
EmplaceBackAttrs
(
res_attrs
);
const
auto
&
vec_map
=
meta_info_map
.
at
(
op_type
);
(
*
paddle
::
framework
::
OpMetaInfoHelper
::
GetKernelFn
(
vec_map
[
0
]))(
&
ctx
);
{
eager_gil_scoped_release
guard
;
VLOG
(
7
)
<<
"Get things for python for Custom Op: "
<<
op_type
<<
", trace_backward is: "
<<
trace_backward
;
auto
meta_info_map
=
egr
::
Controller
::
Instance
().
GetOpMetaInfoMap
();
PADDLE_ENFORCE_NE
(
meta_info_map
.
find
(
op_type
),
meta_info_map
.
end
(),
paddle
::
platform
::
errors
::
NotFound
(
"Can't find %s in Eager OpMetaInfoMap which should be "
"created by LoadOpMetaInfoAndRegisterOp, please make "
"sure you registered your op first and try again. "
,
op_type
));
VLOG
(
7
)
<<
"Run Kernel of Custom Op: "
<<
op_type
;
std
::
vector
<
paddle
::
any
>
res_attrs
=
CastAttrsToTragetType
(
ctx
.
Attrs
(),
paddle
::
framework
::
OpMetaInfoHelper
::
GetAttrs
(
meta_info_map
.
at
(
op_type
)[
0
]));
ctx
.
EmplaceBackAttrs
(
res_attrs
);
const
auto
&
vec_map
=
meta_info_map
.
at
(
op_type
);
(
*
paddle
::
framework
::
OpMetaInfoHelper
::
GetKernelFn
(
vec_map
[
0
]))(
&
ctx
);
VLOG
(
7
)
<<
"Get AutogradMeta for inputs and outputs for Custom Op"
;
std
::
vector
<
std
::
vector
<
egr
::
AutogradMeta
*>>
ins_auto_grad_metas
;
std
::
vector
<
std
::
vector
<
egr
::
AutogradMeta
*>>
outs_auto_grad_metas
;
VLOG
(
7
)
<<
"We got slot num of ins is: "
<<
ctx
.
InputRange
().
size
();
ins_auto_grad_metas
.
resize
(
ctx
.
InputRange
().
size
());
VLOG
(
7
)
<<
"We got slot num of outs is: "
<<
ctx
.
OutputRange
().
size
();
outs_auto_grad_metas
.
resize
(
ctx
.
OutputRange
().
size
());
VLOG
(
7
)
<<
"Get AutogradMeta for inputs and outputs for Custom Op"
;
std
::
vector
<
std
::
vector
<
egr
::
AutogradMeta
*>>
ins_auto_grad_metas
;
std
::
vector
<
std
::
vector
<
egr
::
AutogradMeta
*>>
outs_auto_grad_metas
;
VLOG
(
7
)
<<
"We got slot num of ins is: "
<<
ctx
.
InputRange
().
size
();
ins_auto_grad_metas
.
resize
(
ctx
.
InputRange
().
size
());
VLOG
(
7
)
<<
"We got slot num of outs is: "
<<
ctx
.
OutputRange
().
size
();
outs_auto_grad_metas
.
resize
(
ctx
.
OutputRange
().
size
());
for
(
size_t
i
=
0
;
i
<
ctx
.
InputRange
().
size
();
i
++
)
{
ins_auto_grad_metas
[
i
]
=
egr
::
EagerUtils
::
nullable_autograd_meta
(
ctx
.
InputsBetween
(
ctx
.
InputRangeAt
(
i
).
first
,
ctx
.
InputRangeAt
(
i
).
second
));
}
for
(
size_t
i
=
0
;
i
<
ctx
.
OutputRange
().
size
();
i
++
)
{
outs_auto_grad_metas
[
i
]
=
egr
::
EagerUtils
::
unsafe_autograd_meta
(
ctx
.
OutputsBetweeen
(
ctx
.
OutputRangeAt
(
i
).
first
,
ctx
.
OutputRangeAt
(
i
).
second
));
}
bool
require_any_grad
=
false
;
for
(
size_t
i
=
0
;
i
<
ins_auto_grad_metas
.
size
();
i
++
)
{
require_any_grad
=
require_any_grad
||
egr
::
EagerUtils
::
ComputeRequireGrad
(
trace_backward
,
&
(
ins_auto_grad_metas
[
i
]));
}
if
(
require_any_grad
&&
(
vec_map
.
size
()
>
1
))
{
VLOG
(
6
)
<<
" Construct Grad for Custom Op: "
<<
op_type
;
ConstructFwdAndBwdMap
(
vec_map
,
op_type
);
for
(
size_t
i
=
0
;
i
<
outs_auto_grad_metas
.
size
();
i
++
)
{
egr
::
EagerUtils
::
PassStopGradient
(
false
,
&
(
outs_auto_grad_metas
[
i
]));
for
(
size_t
i
=
0
;
i
<
ctx
.
InputRange
().
size
();
i
++
)
{
ins_auto_grad_metas
[
i
]
=
egr
::
EagerUtils
::
nullable_autograd_meta
(
ctx
.
InputsBetween
(
ctx
.
InputRangeAt
(
i
).
first
,
ctx
.
InputRangeAt
(
i
).
second
));
}
for
(
size_t
i
=
0
;
i
<
ctx
.
OutputRange
().
size
();
i
++
)
{
outs_auto_grad_metas
[
i
]
=
egr
::
EagerUtils
::
unsafe_autograd_meta
(
ctx
.
OutputsBetweeen
(
ctx
.
OutputRangeAt
(
i
).
first
,
ctx
.
OutputRangeAt
(
i
).
second
));
}
auto
grad_node
=
std
::
make_shared
<
egr
::
RunCustomOpNode
>
(
outs_auto_grad_metas
.
size
(),
ins_auto_grad_metas
.
size
(),
op_type
);
auto
slot_map
=
egr
::
Controller
::
Instance
().
GetCustomEdgesSlotMap
().
at
(
op_type
);
// Prepare Grad outputs
size_t
no_grad_cnt
=
0
;
bool
require_any_grad
=
false
;
for
(
size_t
i
=
0
;
i
<
ins_auto_grad_metas
.
size
();
i
++
)
{
const
std
::
vector
<
paddle
::
experimental
::
Tensor
>&
in_tensors
=
ctx
.
InputsBetween
(
ctx
.
InputRangeAt
(
i
).
first
,
ctx
.
InputRangeAt
(
i
).
second
);
require_any_grad
=
require_any_grad
||
egr
::
EagerUtils
::
ComputeRequireGrad
(
trace_backward
,
&
(
ins_auto_grad_metas
[
i
]));
}
if
(
require_any_grad
&&
(
vec_map
.
size
()
>
1
))
{
VLOG
(
6
)
<<
" Construct Grad for Custom Op: "
<<
op_type
;
ConstructFwdAndBwdMap
(
vec_map
,
op_type
);
for
(
size_t
i
=
0
;
i
<
outs_auto_grad_metas
.
size
();
i
++
)
{
egr
::
EagerUtils
::
PassStopGradient
(
false
,
&
(
outs_auto_grad_metas
[
i
]));
}
auto
grad_node
=
std
::
make_shared
<
egr
::
RunCustomOpNode
>
(
outs_auto_grad_metas
.
size
(),
ins_auto_grad_metas
.
size
(),
op_type
);
auto
slot_map
=
egr
::
Controller
::
Instance
().
GetCustomEdgesSlotMap
().
at
(
op_type
);
// Prepare Grad outputs
size_t
no_grad_cnt
=
0
;
for
(
size_t
i
=
0
;
i
<
ins_auto_grad_metas
.
size
();
i
++
)
{
const
std
::
vector
<
paddle
::
experimental
::
Tensor
>&
in_tensors
=
ctx
.
InputsBetween
(
ctx
.
InputRangeAt
(
i
).
first
,
ctx
.
InputRangeAt
(
i
).
second
);
if
(
slot_map
[
0
][
0
].
find
(
i
)
!=
slot_map
[
0
][
0
].
end
())
{
grad_node
->
SetGradOutMeta
(
in_tensors
,
slot_map
[
0
][
0
][
i
]);
}
else
{
grad_node
->
SetGradOutMeta
(
in_tensors
,
ins_auto_grad_metas
.
size
()
-
1
-
no_grad_cnt
);
no_grad_cnt
++
;
if
(
slot_map
[
0
][
0
].
find
(
i
)
!=
slot_map
[
0
][
0
].
end
())
{
grad_node
->
SetGradOutMeta
(
in_tensors
,
slot_map
[
0
][
0
][
i
]);
}
else
{
grad_node
->
SetGradOutMeta
(
in_tensors
,
ins_auto_grad_metas
.
size
()
-
1
-
no_grad_cnt
);
no_grad_cnt
++
;
}
}
}
// Prepare Grad inputs with grad of fwd outputs
for
(
size_t
i
=
0
;
i
<
outs_auto_grad_metas
.
size
();
i
++
)
{
const
std
::
vector
<
paddle
::
experimental
::
Tensor
>&
out_tensors
=
ctx
.
OutputsBetweeen
(
ctx
.
OutputRangeAt
(
i
).
first
,
ctx
.
OutputRangeAt
(
i
).
second
);
// Prepare Grad inputs with grad of fwd outputs
for
(
size_t
i
=
0
;
i
<
outs_auto_grad_metas
.
size
();
i
++
)
{
const
std
::
vector
<
paddle
::
experimental
::
Tensor
>&
out_tensors
=
ctx
.
OutputsBetweeen
(
ctx
.
OutputRangeAt
(
i
).
first
,
ctx
.
OutputRangeAt
(
i
).
second
);
egr
::
EagerUtils
::
SetOutRankWithSlot
(
&
(
outs_auto_grad_metas
[
i
]),
i
);
egr
::
EagerUtils
::
SetHistory
(
&
(
outs_auto_grad_metas
[
i
]),
grad_node
);
grad_node
->
SetGradInMeta
(
out_tensors
,
i
);
egr
::
EagerUtils
::
CheckAndRetainGrad
(
out_tensors
);
}
egr
::
EagerUtils
::
SetOutRankWithSlot
(
&
(
outs_auto_grad_metas
[
i
]),
i
);
egr
::
EagerUtils
::
SetHistory
(
&
(
outs_auto_grad_metas
[
i
]),
grad_node
);
grad_node
->
SetGradInMeta
(
out_tensors
,
i
);
egr
::
EagerUtils
::
CheckAndRetainGrad
(
out_tensors
);
}
// Prepare Grad inputs with fwd outputs
for
(
auto
it
=
slot_map
[
0
][
2
].
begin
();
it
!=
slot_map
[
0
][
2
].
end
();
it
++
)
{
VLOG
(
7
)
<<
"Prepare fwd_outs: "
<<
it
->
first
<<
" to grad_inputs: "
<<
it
->
second
;
grad_node
->
fwd_outs
[
it
->
second
]
=
egr
::
RunCustomOpNode
::
ConstructTensorWrapper
(
ctx
.
OutputsBetweeen
(
ctx
.
OutputRangeAt
(
it
->
first
).
first
,
ctx
.
OutputRangeAt
(
it
->
first
).
second
));
}
// Prepare Grad inputs with fwd outputs
for
(
auto
it
=
slot_map
[
0
][
2
].
begin
();
it
!=
slot_map
[
0
][
2
].
end
();
it
++
)
{
VLOG
(
7
)
<<
"Prepare fwd_outs: "
<<
it
->
first
<<
" to grad_inputs: "
<<
it
->
second
;
grad_node
->
fwd_outs
[
it
->
second
]
=
egr
::
RunCustomOpNode
::
ConstructTensorWrapper
(
ctx
.
OutputsBetweeen
(
ctx
.
OutputRangeAt
(
it
->
first
).
first
,
ctx
.
OutputRangeAt
(
it
->
first
).
second
));
}
// Prepare Grad inputs with fwd inputs
for
(
auto
it
=
slot_map
[
0
][
3
].
begin
();
it
!=
slot_map
[
0
][
3
].
end
();
it
++
)
{
VLOG
(
7
)
<<
"Prepare fwd_ins: "
<<
it
->
first
<<
" to grad_inputs: "
<<
it
->
second
;
grad_node
->
fwd_ins
[
it
->
second
]
=
egr
::
RunCustomOpNode
::
ConstructTensorWrapper
(
ctx
.
InputsBetween
(
ctx
.
InputRangeAt
(
it
->
first
).
first
,
ctx
.
InputRangeAt
(
it
->
first
).
second
));
}
// Prepare Grad inputs with fwd inputs
for
(
auto
it
=
slot_map
[
0
][
3
].
begin
();
it
!=
slot_map
[
0
][
3
].
end
();
it
++
)
{
VLOG
(
7
)
<<
"Prepare fwd_ins: "
<<
it
->
first
<<
" to grad_inputs: "
<<
it
->
second
;
grad_node
->
fwd_ins
[
it
->
second
]
=
egr
::
RunCustomOpNode
::
ConstructTensorWrapper
(
ctx
.
InputsBetween
(
ctx
.
InputRangeAt
(
it
->
first
).
first
,
ctx
.
InputRangeAt
(
it
->
first
).
second
));
}
auto
attrs_names
=
paddle
::
framework
::
OpMetaInfoHelper
::
GetAttrs
(
meta_info_map
.
at
(
op_type
)[
1
]);
std
::
vector
<
paddle
::
any
>
attrs
(
attrs_names
.
size
());
// Prepare attrs for Grad node
for
(
auto
it
=
slot_map
[
0
][
4
].
begin
();
it
!=
slot_map
[
0
][
4
].
end
();
it
++
)
{
VLOG
(
7
)
<<
"Prepare fwd attrs: "
<<
it
->
first
<<
" to grad_attrs: "
<<
it
->
second
;
attrs
[
it
->
second
]
=
res_attrs
[
it
->
first
];
auto
attrs_names
=
paddle
::
framework
::
OpMetaInfoHelper
::
GetAttrs
(
meta_info_map
.
at
(
op_type
)[
1
]);
std
::
vector
<
paddle
::
any
>
attrs
(
attrs_names
.
size
());
// Prepare attrs for Grad node
for
(
auto
it
=
slot_map
[
0
][
4
].
begin
();
it
!=
slot_map
[
0
][
4
].
end
();
it
++
)
{
VLOG
(
7
)
<<
"Prepare fwd attrs: "
<<
it
->
first
<<
" to grad_attrs: "
<<
it
->
second
;
attrs
[
it
->
second
]
=
res_attrs
[
it
->
first
];
}
grad_node
->
SetAttrs
(
attrs
);
}
grad_node
->
SetAttrs
(
attrs
);
}
RETURN_PY_NONE
EAGER_CATCH_AND_THROW_RETURN_NULL
...
...
@@ -514,33 +530,36 @@ static PyObject* eager_api_sparse_coo_tensor(PyObject* self,
auto
non_zero_elements
=
CastPyArg2Tensor
(
PyTuple_GET_ITEM
(
args
,
1
),
1
);
auto
dense_shape
=
CastPyArg2VectorOfInt
(
PyTuple_GET_ITEM
(
args
,
2
),
2
);
auto
stop_gradient
=
CastPyArg2AttrBoolean
(
PyTuple_GET_ITEM
(
args
,
3
),
3
);
PADDLE_ENFORCE
(
non_zero_indices
.
is_dense_tensor
(),
paddle
::
platform
::
errors
::
Fatal
(
"the non-zero indices must be a DenseTensor."
));
PADDLE_ENFORCE
(
non_zero_elements
.
is_dense_tensor
(),
paddle
::
platform
::
errors
::
Fatal
(
"the non-zero elements must be a DenseTensor."
));
auto
dense_indices
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
non_zero_indices
.
impl
());
auto
dense_elements
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
non_zero_elements
.
impl
());
// TODO(zhangkaihuo): After creating SparseCooTensor, call coalesced() to sort
// and merge duplicate indices
std
::
shared_ptr
<
phi
::
SparseCooTensor
>
coo_tensor
=
std
::
make_shared
<
phi
::
SparseCooTensor
>
(
*
dense_indices
,
*
dense_elements
,
phi
::
make_ddim
(
dense_shape
));
paddle
::
experimental
::
Tensor
tensor
;
tensor
.
set_impl
(
coo_tensor
);
auto
name
=
egr
::
Controller
::
Instance
().
GenerateUniqueName
(
"generated_tensor"
);
tensor
.
set_name
(
name
);
auto
autograd_meta
=
egr
::
EagerUtils
::
autograd_meta
(
&
tensor
);
autograd_meta
->
SetStopGradient
(
static_cast
<
bool
>
(
stop_gradient
));
if
(
!
autograd_meta
->
GetMutableGradNode
())
{
VLOG
(
3
)
<<
"Tensor("
<<
name
<<
") doesn't have GradNode, add GradNodeAccumulation to it."
;
autograd_meta
->
SetGradNode
(
std
::
make_shared
<
egr
::
GradNodeAccumulation
>
(
autograd_meta
));
{
eager_gil_scoped_release
guard
;
PADDLE_ENFORCE
(
non_zero_indices
.
is_dense_tensor
(),
paddle
::
platform
::
errors
::
Fatal
(
"the non-zero indices must be a DenseTensor."
));
PADDLE_ENFORCE
(
non_zero_elements
.
is_dense_tensor
(),
paddle
::
platform
::
errors
::
Fatal
(
"the non-zero elements must be a DenseTensor."
));
auto
dense_indices
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
non_zero_indices
.
impl
());
auto
dense_elements
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
non_zero_elements
.
impl
());
// TODO(zhangkaihuo): After creating SparseCooTensor, call coalesced() to
// sort and merge duplicate indices
std
::
shared_ptr
<
phi
::
SparseCooTensor
>
coo_tensor
=
std
::
make_shared
<
phi
::
SparseCooTensor
>
(
*
dense_indices
,
*
dense_elements
,
phi
::
make_ddim
(
dense_shape
));
tensor
.
set_impl
(
coo_tensor
);
auto
name
=
egr
::
Controller
::
Instance
().
GenerateUniqueName
(
"generated_tensor"
);
tensor
.
set_name
(
name
);
auto
autograd_meta
=
egr
::
EagerUtils
::
autograd_meta
(
&
tensor
);
autograd_meta
->
SetStopGradient
(
static_cast
<
bool
>
(
stop_gradient
));
if
(
!
autograd_meta
->
GetMutableGradNode
())
{
VLOG
(
3
)
<<
"Tensor("
<<
name
<<
") doesn't have GradNode, add GradNodeAccumulation to it."
;
autograd_meta
->
SetGradNode
(
std
::
make_shared
<
egr
::
GradNodeAccumulation
>
(
autograd_meta
));
}
}
return
ToPyObject
(
tensor
);
EAGER_CATCH_AND_THROW_RETURN_NULL
...
...
@@ -555,39 +574,42 @@ static PyObject* eager_api_sparse_csr_tensor(PyObject* self,
auto
non_zero_elements
=
CastPyArg2Tensor
(
PyTuple_GET_ITEM
(
args
,
2
),
2
);
auto
dense_shape
=
CastPyArg2VectorOfInt
(
PyTuple_GET_ITEM
(
args
,
3
),
3
);
auto
stop_gradient
=
CastPyArg2AttrBoolean
(
PyTuple_GET_ITEM
(
args
,
4
),
4
);
PADDLE_ENFORCE
(
non_zero_crows
.
is_dense_tensor
(),
paddle
::
platform
::
errors
::
Fatal
(
"the compressed non-zero rows must be a DenseTensor."
));
PADDLE_ENFORCE
(
non_zero_cols
.
is_dense_tensor
(),
paddle
::
platform
::
errors
::
Fatal
(
"the non-zero cols must be a DenseTensor."
));
PADDLE_ENFORCE
(
non_zero_elements
.
is_dense_tensor
(),
paddle
::
platform
::
errors
::
Fatal
(
"the non-zero elements must be a DenseTensor."
));
auto
dense_crows
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
non_zero_crows
.
impl
());
auto
dense_cols
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
non_zero_cols
.
impl
());
auto
dense_elements
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
non_zero_elements
.
impl
());
std
::
shared_ptr
<
phi
::
SparseCsrTensor
>
csr_tensor
=
std
::
make_shared
<
phi
::
SparseCsrTensor
>
(
*
dense_crows
,
*
dense_cols
,
*
dense_elements
,
phi
::
make_ddim
(
dense_shape
));
paddle
::
experimental
::
Tensor
tensor
;
tensor
.
set_impl
(
csr_tensor
);
auto
name
=
egr
::
Controller
::
Instance
().
GenerateUniqueName
(
"generated_tensor"
);
tensor
.
set_name
(
name
);
auto
autograd_meta
=
egr
::
EagerUtils
::
autograd_meta
(
&
tensor
);
autograd_meta
->
SetStopGradient
(
static_cast
<
bool
>
(
stop_gradient
));
if
(
!
autograd_meta
->
GetMutableGradNode
())
{
VLOG
(
3
)
<<
"Tensor("
<<
name
<<
") have not GradNode, add GradNodeAccumulation for it."
;
autograd_meta
->
SetGradNode
(
std
::
make_shared
<
egr
::
GradNodeAccumulation
>
(
autograd_meta
));
{
eager_gil_scoped_release
guard
;
PADDLE_ENFORCE
(
non_zero_crows
.
is_dense_tensor
(),
paddle
::
platform
::
errors
::
Fatal
(
"the compressed non-zero rows must be a DenseTensor."
));
PADDLE_ENFORCE
(
non_zero_cols
.
is_dense_tensor
(),
paddle
::
platform
::
errors
::
Fatal
(
"the non-zero cols must be a DenseTensor."
));
PADDLE_ENFORCE
(
non_zero_elements
.
is_dense_tensor
(),
paddle
::
platform
::
errors
::
Fatal
(
"the non-zero elements must be a DenseTensor."
));
auto
dense_crows
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
non_zero_crows
.
impl
());
auto
dense_cols
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
non_zero_cols
.
impl
());
auto
dense_elements
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
non_zero_elements
.
impl
());
std
::
shared_ptr
<
phi
::
SparseCsrTensor
>
csr_tensor
=
std
::
make_shared
<
phi
::
SparseCsrTensor
>
(
*
dense_crows
,
*
dense_cols
,
*
dense_elements
,
phi
::
make_ddim
(
dense_shape
));
tensor
.
set_impl
(
csr_tensor
);
auto
name
=
egr
::
Controller
::
Instance
().
GenerateUniqueName
(
"generated_tensor"
);
tensor
.
set_name
(
name
);
auto
autograd_meta
=
egr
::
EagerUtils
::
autograd_meta
(
&
tensor
);
autograd_meta
->
SetStopGradient
(
static_cast
<
bool
>
(
stop_gradient
));
if
(
!
autograd_meta
->
GetMutableGradNode
())
{
VLOG
(
3
)
<<
"Tensor("
<<
name
<<
") have not GradNode, add GradNodeAccumulation for it."
;
autograd_meta
->
SetGradNode
(
std
::
make_shared
<
egr
::
GradNodeAccumulation
>
(
autograd_meta
));
}
}
return
ToPyObject
(
tensor
);
EAGER_CATCH_AND_THROW_RETURN_NULL
...
...
@@ -626,87 +648,215 @@ static PyObject* eager_api_async_read(PyObject* self,
auto
&
buffer
=
GetTensorFromArgs
(
"async_read"
,
"buffer"
,
args
,
3
,
false
);
auto
&
offset
=
GetTensorFromArgs
(
"async_read"
,
"offset"
,
args
,
4
,
false
);
auto
&
count
=
GetTensorFromArgs
(
"async_read"
,
"count"
,
args
,
5
,
false
);
PADDLE_ENFORCE_EQ
(
src
.
is_gpu_pinned
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `src` device should be "
"CUDAPinnedPlace, but received %d."
,
src
.
place
()));
PADDLE_ENFORCE_EQ
(
dst
.
is_gpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `dst` device should be CUDAPlace, but received %d."
,
dst
.
place
()));
PADDLE_ENFORCE_EQ
(
index
.
is_cpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `index` device should be CPUPlace, but received %d."
,
index
.
place
()));
PADDLE_ENFORCE_EQ
(
buffer
.
is_gpu_pinned
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `buffer` device should be CUDAPinnedPlace, "
"but received %d."
,
buffer
.
place
()));
PADDLE_ENFORCE_EQ
(
offset
.
is_cpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `offset` device should be CPUPlace, but received %d."
,
offset
.
place
()));
PADDLE_ENFORCE_EQ
(
count
.
is_cpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `count` device should be CPUPlace, but received %d."
,
count
.
place
()));
{
eager_gil_scoped_release
guard
;
PADDLE_ENFORCE_EQ
(
src
.
is_gpu_pinned
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `src` device should be "
"CUDAPinnedPlace, but received %d."
,
src
.
place
()));
PADDLE_ENFORCE_EQ
(
dst
.
is_gpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `dst` device should be CUDAPlace, but received %d."
,
dst
.
place
()));
PADDLE_ENFORCE_EQ
(
index
.
is_cpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `index` device should be CPUPlace, but received %d."
,
index
.
place
()));
PADDLE_ENFORCE_EQ
(
buffer
.
is_gpu_pinned
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `buffer` device should be CUDAPinnedPlace, "
"but received %d."
,
buffer
.
place
()));
PADDLE_ENFORCE_EQ
(
offset
.
is_cpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `offset` device should be CPUPlace, but received %d."
,
offset
.
place
()));
PADDLE_ENFORCE_EQ
(
count
.
is_cpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `count` device should be CPUPlace, but received %d."
,
count
.
place
()));
auto
&
src_tensor
=
src
;
auto
*
dst_tensor
=
&
dst
;
auto
&
index_tensor
=
index
;
auto
*
buffer_tensor
=
&
buffer
;
auto
&
offset_tensor
=
offset
;
auto
&
count_tensor
=
count
;
auto
*
dst_data
=
dst_tensor
->
mutable_data
<
float
>
(
dst
.
place
());
const
auto
&
deviceId
=
paddle
::
platform
::
GetCurrentDeviceId
();
auto
&
src_tensor
=
src
;
auto
*
dst_tensor
=
&
dst
;
auto
&
index_tensor
=
index
;
auto
*
buffer_tensor
=
&
buffer
;
auto
&
offset_tensor
=
offset
;
auto
&
count_tensor
=
count
;
auto
*
dst_data
=
dst_tensor
->
mutable_data
<
float
>
(
dst
.
place
());
const
auto
&
deviceId
=
paddle
::
platform
::
GetCurrentDeviceId
();
PADDLE_ENFORCE_EQ
(
src_tensor
.
dims
().
size
(),
dst_tensor
->
dims
().
size
(),
platform
::
errors
::
InvalidArgument
(
"`src` and `dst` should have same tensor shape, "
"except for the first dimension."
));
PADDLE_ENFORCE_EQ
(
src_tensor
.
dims
().
size
(),
buffer_tensor
->
dims
().
size
(),
platform
::
errors
::
InvalidArgument
(
"`src` and `buffer` should have same tensor shape, "
"except for the first dimension."
));
for
(
int
i
=
1
;
i
<
src_tensor
.
dims
().
size
();
i
++
)
{
PADDLE_ENFORCE_EQ
(
src_tensor
.
dims
()[
i
],
dst_tensor
->
dims
()[
i
],
PADDLE_ENFORCE_EQ
(
src_tensor
.
dims
().
size
(),
dst_tensor
->
dims
().
size
(),
platform
::
errors
::
InvalidArgument
(
"`src` and `dst` should have
the
same tensor shape, "
"`src` and `dst` should have same tensor shape, "
"except for the first dimension."
));
PADDLE_ENFORCE_EQ
(
src_tensor
.
dims
()[
i
],
buffer_tensor
->
dims
()[
i
],
platform
::
errors
::
InvalidArgument
(
"`src` and `buffer` should have the same tensor shape, "
"except for the first dimension."
));
PADDLE_ENFORCE_EQ
(
src_tensor
.
dims
().
size
(),
buffer_tensor
->
dims
().
size
(),
platform
::
errors
::
InvalidArgument
(
"`src` and `buffer` should have same tensor shape, "
"except for the first dimension."
));
for
(
int
i
=
1
;
i
<
src_tensor
.
dims
().
size
();
i
++
)
{
PADDLE_ENFORCE_EQ
(
src_tensor
.
dims
()[
i
],
dst_tensor
->
dims
()[
i
],
platform
::
errors
::
InvalidArgument
(
"`src` and `dst` should have the same tensor shape, "
"except for the first dimension."
));
PADDLE_ENFORCE_EQ
(
src_tensor
.
dims
()[
i
],
buffer_tensor
->
dims
()[
i
],
platform
::
errors
::
InvalidArgument
(
"`src` and `buffer` should have the same tensor shape, "
"except for the first dimension."
));
}
PADDLE_ENFORCE_EQ
(
index_tensor
.
dims
().
size
(),
1
,
platform
::
errors
::
InvalidArgument
(
"`index` tensor should be one-dimensional."
));
auto
stream
=
paddle
::
platform
::
get_current_stream
(
deviceId
)
->
raw_stream
();
int64_t
numel
=
0
;
// total copy length
int64_t
copy_flag
=
offset_tensor
.
dims
()[
0
];
int64_t
size
=
src_tensor
.
numel
()
/
src_tensor
.
dims
()[
0
];
if
(
copy_flag
!=
0
)
{
PADDLE_ENFORCE_EQ
(
offset_tensor
.
dims
().
size
(),
1
,
platform
::
errors
::
InvalidArgument
(
"`offset` tensor should be one-dimensional."
));
PADDLE_ENFORCE_EQ
(
count_tensor
.
dims
().
size
(),
1
,
platform
::
errors
::
InvalidArgument
(
"`count` tensor should be one-dimensional."
));
PADDLE_ENFORCE_EQ
(
offset_tensor
.
numel
(),
count_tensor
.
numel
(),
platform
::
errors
::
InvalidArgument
(
"`offset` and `count` tensor size dismatch."
));
auto
*
offset_data
=
offset_tensor
.
data
<
int64_t
>
();
auto
*
count_data
=
count_tensor
.
data
<
int64_t
>
();
for
(
int64_t
i
=
0
;
i
<
count_tensor
.
numel
();
i
++
)
{
numel
+=
count_data
[
i
];
}
PADDLE_ENFORCE_LE
(
numel
+
index_tensor
.
numel
(),
buffer_tensor
->
dims
()[
0
],
platform
::
errors
::
InvalidArgument
(
"Buffer tensor size is too small."
));
PADDLE_ENFORCE_LE
(
numel
+
index_tensor
.
numel
(),
dst_tensor
->
dims
()[
0
],
platform
::
errors
::
InvalidArgument
(
"Target tensor size is too small."
));
int64_t
src_offset
,
dst_offset
=
0
,
c
;
auto
*
src_data
=
src_tensor
.
data
<
float
>
();
for
(
int64_t
i
=
0
;
i
<
offset_tensor
.
numel
();
i
++
)
{
src_offset
=
offset_data
[
i
],
c
=
count_data
[
i
];
PADDLE_ENFORCE_LE
(
src_offset
+
c
,
src_tensor
.
dims
()[
0
],
platform
::
errors
::
InvalidArgument
(
"Invalid offset or count index."
));
PADDLE_ENFORCE_LE
(
dst_offset
+
c
,
dst_tensor
->
dims
()[
0
],
platform
::
errors
::
InvalidArgument
(
"Invalid offset or count index."
));
cudaMemcpyAsync
(
dst_data
+
(
dst_offset
*
size
),
src_data
+
(
src_offset
*
size
),
c
*
size
*
sizeof
(
float
),
cudaMemcpyHostToDevice
,
stream
);
dst_offset
+=
c
;
}
}
else
{
PADDLE_ENFORCE_LE
(
index_tensor
.
numel
(),
buffer_tensor
->
dims
()[
0
],
platform
::
errors
::
InvalidArgument
(
"Buffer tensor size is too small."
));
}
// Select the index data to the buffer
auto
index_select
=
[](
const
paddle
::
experimental
::
Tensor
&
src_tensor
,
const
paddle
::
experimental
::
Tensor
&
index_tensor
,
paddle
::
experimental
::
Tensor
*
buffer_tensor
)
{
auto
*
src_data
=
src_tensor
.
data
<
float
>
();
auto
*
index_data
=
index_tensor
.
data
<
int64_t
>
();
auto
*
buffer_data
=
buffer_tensor
->
data
<
float
>
();
const
int
&
slice_size
=
src_tensor
.
numel
()
/
src_tensor
.
dims
()[
0
];
const
int
&
copy_bytes
=
slice_size
*
sizeof
(
float
);
int64_t
c
=
0
;
for
(
int64_t
i
=
0
;
i
<
index_tensor
.
numel
();
i
++
)
{
std
::
memcpy
(
buffer_data
+
c
*
slice_size
,
src_data
+
index_data
[
i
]
*
slice_size
,
copy_bytes
);
c
+=
1
;
}
};
index_select
(
src_tensor
,
index_tensor
,
buffer_tensor
);
// Copy the data to device memory
cudaMemcpyAsync
(
dst_data
+
(
numel
*
size
),
buffer_tensor
->
data
<
float
>
(),
index_tensor
.
numel
()
*
size
*
sizeof
(
float
),
cudaMemcpyHostToDevice
,
stream
);
}
PADDLE_ENFORCE_EQ
(
index_tensor
.
dims
().
size
(),
1
,
platform
::
errors
::
InvalidArgument
(
"`index` tensor should be one-dimensional."
));
RETURN_PY_NONE
EAGER_CATCH_AND_THROW_RETURN_NULL
}
auto
stream
=
paddle
::
platform
::
get_current_stream
(
deviceId
)
->
raw_stream
();
static
PyObject
*
eager_api_async_write
(
PyObject
*
self
,
PyObject
*
args
,
PyObject
*
kwargs
)
{
EAGER_TRY
auto
&
src
=
GetTensorFromArgs
(
"async_write"
,
"src"
,
args
,
0
,
false
);
auto
&
dst
=
GetTensorFromArgs
(
"async_write"
,
"dst"
,
args
,
1
,
false
);
auto
&
offset
=
GetTensorFromArgs
(
"async_write"
,
"offset"
,
args
,
2
,
false
);
auto
&
count
=
GetTensorFromArgs
(
"async_write"
,
"count"
,
args
,
3
,
false
);
{
eager_gil_scoped_release
guard
;
PADDLE_ENFORCE_EQ
(
src
.
is_gpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `src` device should be CUDAPlace, but received %d. "
,
src
.
place
()));
PADDLE_ENFORCE_EQ
(
dst
.
is_gpu_pinned
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `dst` device should be CUDAPinnedPlace, "
"but received %d. "
,
dst
.
place
()));
PADDLE_ENFORCE_EQ
(
offset
.
is_cpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `offset` device should "
"be CPUPlace, but received %d. "
,
offset
.
place
()));
PADDLE_ENFORCE_EQ
(
count
.
is_cpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `count` device should be CPUPlace, but received %d. "
,
count
.
place
()));
int64_t
numel
=
0
;
// total copy length
int64_t
copy_flag
=
offset_tensor
.
dims
()[
0
];
int64_t
size
=
src_tensor
.
numel
()
/
src_tensor
.
dims
()[
0
];
// TODO(daisiming): In future, add index as arguments following
// async_read.
auto
&
src_tensor
=
src
;
auto
*
dst_tensor
=
&
dst
;
auto
&
offset_tensor
=
offset
;
auto
&
count_tensor
=
count
;
const
auto
&
deviceId
=
paddle
::
platform
::
GetCurrentDeviceId
();
if
(
copy_flag
!=
0
)
{
PADDLE_ENFORCE_EQ
(
offset_tensor
.
dims
().
size
(),
1
,
platform
::
errors
::
InvalidArgument
(
...
...
@@ -719,165 +869,45 @@ static PyObject* eager_api_async_read(PyObject* self,
count_tensor
.
numel
(),
platform
::
errors
::
InvalidArgument
(
"`offset` and `count` tensor size dismatch."
));
auto
*
offset_data
=
offset_tensor
.
data
<
int64_t
>
();
auto
*
count_data
=
count_tensor
.
data
<
int64_t
>
();
for
(
int64_t
i
=
0
;
i
<
count_tensor
.
numel
();
i
++
)
{
numel
+=
count_data
[
i
];
PADDLE_ENFORCE_EQ
(
src_tensor
.
dims
().
size
(),
dst_tensor
->
dims
().
size
(),
platform
::
errors
::
InvalidArgument
(
"`src` and `dst` should have the same tensor shape, "
"except for the first dimension."
));
for
(
int
i
=
1
;
i
<
src_tensor
.
dims
().
size
();
i
++
)
{
PADDLE_ENFORCE_EQ
(
src_tensor
.
dims
()[
i
],
dst_tensor
->
dims
()[
i
],
platform
::
errors
::
InvalidArgument
(
"`src` and `dst` should have the same tensor shape, "
"except for the first dimension."
));
}
PADDLE_ENFORCE_LE
(
numel
+
index_tensor
.
numel
(),
buffer_tensor
->
dims
()[
0
],
platform
::
errors
::
InvalidArgument
(
"Buffer tensor size is too small."
));
PADDLE_ENFORCE_LE
(
numel
+
index_tensor
.
numel
(),
dst_tensor
->
dims
()[
0
],
platform
::
errors
::
InvalidArgument
(
"Target tensor size is too small."
));
int64_t
src_offset
,
dst_offset
=
0
,
c
;
auto
stream
=
paddle
::
platform
::
get_current_stream
(
deviceId
)
->
raw_stream
();
int64_t
size
=
src_tensor
.
numel
()
/
src_tensor
.
dims
()[
0
];
auto
*
src_data
=
src_tensor
.
data
<
float
>
();
auto
*
dst_data
=
dst_tensor
->
data
<
float
>
();
const
int64_t
*
offset_data
=
offset_tensor
.
data
<
int64_t
>
();
const
int64_t
*
count_data
=
count_tensor
.
data
<
int64_t
>
();
int64_t
src_offset
=
0
,
dst_offset
,
c
;
for
(
int64_t
i
=
0
;
i
<
offset_tensor
.
numel
();
i
++
)
{
src
_offset
=
offset_data
[
i
],
c
=
count_data
[
i
];
dst
_offset
=
offset_data
[
i
],
c
=
count_data
[
i
];
PADDLE_ENFORCE_LE
(
src_offset
+
c
,
src_tensor
.
dims
()[
0
],
platform
::
errors
::
InvalidArgument
(
"Invalid offset or count index
.
"
));
platform
::
errors
::
InvalidArgument
(
"Invalid offset or count index"
));
PADDLE_ENFORCE_LE
(
dst_offset
+
c
,
dst_tensor
->
dims
()[
0
],
platform
::
errors
::
InvalidArgument
(
"Invalid offset or count index
.
"
));
platform
::
errors
::
InvalidArgument
(
"Invalid offset or count index"
));
cudaMemcpyAsync
(
dst_data
+
(
dst_offset
*
size
),
src_data
+
(
src_offset
*
size
),
c
*
size
*
sizeof
(
float
),
cudaMemcpy
HostToDevice
,
cudaMemcpy
DeviceToHost
,
stream
);
dst_offset
+=
c
;
}
}
else
{
PADDLE_ENFORCE_LE
(
index_tensor
.
numel
(),
buffer_tensor
->
dims
()[
0
],
platform
::
errors
::
InvalidArgument
(
"Buffer tensor size is too small."
));
}
// Select the index data to the buffer
auto
index_select
=
[](
const
paddle
::
experimental
::
Tensor
&
src_tensor
,
const
paddle
::
experimental
::
Tensor
&
index_tensor
,
paddle
::
experimental
::
Tensor
*
buffer_tensor
)
{
auto
*
src_data
=
src_tensor
.
data
<
float
>
();
auto
*
index_data
=
index_tensor
.
data
<
int64_t
>
();
auto
*
buffer_data
=
buffer_tensor
->
data
<
float
>
();
const
int
&
slice_size
=
src_tensor
.
numel
()
/
src_tensor
.
dims
()[
0
];
const
int
&
copy_bytes
=
slice_size
*
sizeof
(
float
);
int64_t
c
=
0
;
for
(
int64_t
i
=
0
;
i
<
index_tensor
.
numel
();
i
++
)
{
std
::
memcpy
(
buffer_data
+
c
*
slice_size
,
src_data
+
index_data
[
i
]
*
slice_size
,
copy_bytes
);
c
+=
1
;
src_offset
+=
c
;
}
};
index_select
(
src_tensor
,
index_tensor
,
buffer_tensor
);
// Copy the data to device memory
cudaMemcpyAsync
(
dst_data
+
(
numel
*
size
),
buffer_tensor
->
data
<
float
>
(),
index_tensor
.
numel
()
*
size
*
sizeof
(
float
),
cudaMemcpyHostToDevice
,
stream
);
RETURN_PY_NONE
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static
PyObject
*
eager_api_async_write
(
PyObject
*
self
,
PyObject
*
args
,
PyObject
*
kwargs
)
{
EAGER_TRY
auto
&
src
=
GetTensorFromArgs
(
"async_write"
,
"src"
,
args
,
0
,
false
);
auto
&
dst
=
GetTensorFromArgs
(
"async_write"
,
"dst"
,
args
,
1
,
false
);
auto
&
offset
=
GetTensorFromArgs
(
"async_write"
,
"offset"
,
args
,
2
,
false
);
auto
&
count
=
GetTensorFromArgs
(
"async_write"
,
"count"
,
args
,
3
,
false
);
PADDLE_ENFORCE_EQ
(
src
.
is_gpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `src` device should be CUDAPlace, but received %d. "
,
src
.
place
()));
PADDLE_ENFORCE_EQ
(
dst
.
is_gpu_pinned
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `dst` device should be CUDAPinnedPlace, "
"but received %d. "
,
dst
.
place
()));
PADDLE_ENFORCE_EQ
(
offset
.
is_cpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `offset` device should "
"be CPUPlace, but received %d. "
,
offset
.
place
()));
PADDLE_ENFORCE_EQ
(
count
.
is_cpu
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Required `count` device should be CPUPlace, but received %d. "
,
count
.
place
()));
// TODO(daisiming): In future, add index as arguments following
// async_read.
auto
&
src_tensor
=
src
;
auto
*
dst_tensor
=
&
dst
;
auto
&
offset_tensor
=
offset
;
auto
&
count_tensor
=
count
;
const
auto
&
deviceId
=
paddle
::
platform
::
GetCurrentDeviceId
();
PADDLE_ENFORCE_EQ
(
offset_tensor
.
dims
().
size
(),
1
,
platform
::
errors
::
InvalidArgument
(
"`offset` tensor should be one-dimensional."
));
PADDLE_ENFORCE_EQ
(
count_tensor
.
dims
().
size
(),
1
,
platform
::
errors
::
InvalidArgument
(
"`count` tensor should be one-dimensional."
));
PADDLE_ENFORCE_EQ
(
offset_tensor
.
numel
(),
count_tensor
.
numel
(),
platform
::
errors
::
InvalidArgument
(
"`offset` and `count` tensor size dismatch."
));
PADDLE_ENFORCE_EQ
(
src_tensor
.
dims
().
size
(),
dst_tensor
->
dims
().
size
(),
platform
::
errors
::
InvalidArgument
(
"`src` and `dst` should have the same tensor shape, "
"except for the first dimension."
));
for
(
int
i
=
1
;
i
<
src_tensor
.
dims
().
size
();
i
++
)
{
PADDLE_ENFORCE_EQ
(
src_tensor
.
dims
()[
i
],
dst_tensor
->
dims
()[
i
],
platform
::
errors
::
InvalidArgument
(
"`src` and `dst` should have the same tensor shape, "
"except for the first dimension."
));
}
auto
stream
=
paddle
::
platform
::
get_current_stream
(
deviceId
)
->
raw_stream
();
int64_t
size
=
src_tensor
.
numel
()
/
src_tensor
.
dims
()[
0
];
auto
*
src_data
=
src_tensor
.
data
<
float
>
();
auto
*
dst_data
=
dst_tensor
->
data
<
float
>
();
const
int64_t
*
offset_data
=
offset_tensor
.
data
<
int64_t
>
();
const
int64_t
*
count_data
=
count_tensor
.
data
<
int64_t
>
();
int64_t
src_offset
=
0
,
dst_offset
,
c
;
for
(
int64_t
i
=
0
;
i
<
offset_tensor
.
numel
();
i
++
)
{
dst_offset
=
offset_data
[
i
],
c
=
count_data
[
i
];
PADDLE_ENFORCE_LE
(
src_offset
+
c
,
src_tensor
.
dims
()[
0
],
platform
::
errors
::
InvalidArgument
(
"Invalid offset or count index"
));
PADDLE_ENFORCE_LE
(
dst_offset
+
c
,
dst_tensor
->
dims
()[
0
],
platform
::
errors
::
InvalidArgument
(
"Invalid offset or count index"
));
cudaMemcpyAsync
(
dst_data
+
(
dst_offset
*
size
),
src_data
+
(
src_offset
*
size
),
c
*
size
*
sizeof
(
float
),
cudaMemcpyDeviceToHost
,
stream
);
src_offset
+=
c
;
}
RETURN_PY_NONE
EAGER_CATCH_AND_THROW_RETURN_NULL
...
...
@@ -929,7 +959,6 @@ static PyObject* eager_api_to_uva_tensor(PyObject* self,
"float64, int8, int16, int32, int64,"
"please check your input or input array data type."
));
}
return
ToPyObject
(
*
(
new_tensor
.
get
()));
EAGER_CATCH_AND_THROW_RETURN_NULL
}
...
...
paddle/fluid/pybind/eager_method.cc
浏览文件 @
b106c424
...
...
@@ -156,6 +156,7 @@ static PyObject* tensor_method_numpy(TensorObject* self,
}
if
(
self
->
tensor
.
is_cpu
()
||
self
->
tensor
.
is_gpu_pinned
())
{
eager_gil_scoped_release
guard
;
platform
::
CPUPlace
place
;
if
(
self
->
tensor
.
is_selected_rows
())
{
VLOG
(
6
)
<<
"Getting SelectedRows's numpy value"
;
...
...
@@ -186,6 +187,7 @@ static PyObject* tensor_method_numpy(TensorObject* self,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
}
else
if
(
self
->
tensor
.
is_gpu
())
{
eager_gil_scoped_release
guard
;
#if defined(PADDLE_WITH_CUDA)
gpuMemcpyKind
kind
=
cudaMemcpyDeviceToHost
;
#elif defined(PADDLE_WITH_HIP)
...
...
@@ -244,6 +246,7 @@ static PyObject* tensor_method_numpy(TensorObject* self,
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
}
else
if
(
self
->
tensor
.
is_custom_device
())
{
eager_gil_scoped_release
guard
;
if
(
self
->
tensor
.
is_selected_rows
())
{
VLOG
(
6
)
<<
"Getting SelectedRows's numpy value"
;
auto
*
selected_rows
=
...
...
@@ -311,8 +314,8 @@ static PyObject* tensor_method_numpy_for_string_tensor(TensorObject* self,
const
auto
*
st_ptr
=
string_tensor
->
data
();
auto
numel
=
self
->
tensor
.
numel
();
auto
tensor_dims
=
self
->
tensor
.
shape
();
// Get the max unicode length of StringTensor to create numpy unicode
string
// array.
// Get the max unicode length of StringTensor to create numpy unicode
//
string
array.
auto
*
longest_pstring
=
std
::
max_element
(
st_ptr
,
st_ptr
+
numel
,
[](
const
auto
&
a
,
const
auto
&
b
)
{
auto
a_unicode_len
=
...
...
@@ -394,14 +397,18 @@ static PyObject* tensor_method__copy_to(TensorObject* self,
EAGER_TRY
auto
place
=
CastPyArg2Place
(
PyTuple_GET_ITEM
(
args
,
0
),
0
);
bool
blocking
=
CastPyArg2AttrBoolean
(
PyTuple_GET_ITEM
(
args
,
1
),
1
);
auto
cp_tensor
=
self
->
tensor
.
copy_to
(
place
,
blocking
);
if
(
!
blocking
)
{
IncreaseTensorReferenceCountUntilCopyComplete
(
self
->
tensor
,
place
);
paddle
::
experimental
::
Tensor
cp_tensor
;
{
eager_gil_scoped_release
guard
;
cp_tensor
=
self
->
tensor
.
copy_to
(
place
,
blocking
);
if
(
!
blocking
)
{
IncreaseTensorReferenceCountUntilCopyComplete
(
self
->
tensor
,
place
);
}
egr
::
EagerUtils
::
autograd_meta
(
&
cp_tensor
)
->
SetStopGradient
(
true
);
egr
::
EagerUtils
::
autograd_meta
(
&
cp_tensor
)
->
SetPersistable
(
egr
::
EagerUtils
::
autograd_meta
(
&
(
self
->
tensor
))
->
Persistable
());
}
egr
::
EagerUtils
::
autograd_meta
(
&
cp_tensor
)
->
SetStopGradient
(
true
);
egr
::
EagerUtils
::
autograd_meta
(
&
cp_tensor
)
->
SetPersistable
(
egr
::
EagerUtils
::
autograd_meta
(
&
(
self
->
tensor
))
->
Persistable
());
return
ToPyObject
(
cp_tensor
);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
...
...
@@ -410,11 +417,15 @@ static PyObject* tensor_method_cpu(TensorObject* self,
PyObject
*
args
,
PyObject
*
kwargs
)
{
EAGER_TRY
auto
cp_tensor
=
self
->
tensor
.
copy_to
(
phi
::
CPUPlace
(),
true
);
egr
::
EagerUtils
::
autograd_meta
(
&
cp_tensor
)
->
SetStopGradient
(
true
);
egr
::
EagerUtils
::
autograd_meta
(
&
cp_tensor
)
->
SetPersistable
(
egr
::
EagerUtils
::
autograd_meta
(
&
(
self
->
tensor
))
->
Persistable
());
paddle
::
experimental
::
Tensor
cp_tensor
;
{
eager_gil_scoped_release
guard
;
cp_tensor
=
self
->
tensor
.
copy_to
(
phi
::
CPUPlace
(),
true
);
egr
::
EagerUtils
::
autograd_meta
(
&
cp_tensor
)
->
SetStopGradient
(
true
);
egr
::
EagerUtils
::
autograd_meta
(
&
cp_tensor
)
->
SetPersistable
(
egr
::
EagerUtils
::
autograd_meta
(
&
(
self
->
tensor
))
->
Persistable
());
}
return
ToPyObject
(
cp_tensor
);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
...
...
@@ -450,6 +461,7 @@ static PyObject* tensor_method_copy_(TensorObject* self,
VLOG
(
6
)
<<
"Start Copy Tensor "
<<
src_tensor
.
name
()
<<
" to "
<<
self
->
tensor
.
name
();
if
(
!
self
->
tensor
.
initialized
())
{
eager_gil_scoped_release
guard
;
egr
::
EagerUtils
::
autograd_meta
(
&
(
self
->
tensor
))
->
SetStopGradient
(
egr
::
EagerUtils
::
autograd_meta
(
&
(
src_tensor
))
->
StopGradient
());
...
...
@@ -461,6 +473,7 @@ static PyObject* tensor_method_copy_(TensorObject* self,
}
}
else
{
if
(
src_tensor
.
initialized
())
{
eager_gil_scoped_release
guard
;
self
->
tensor
.
copy_
(
src_tensor
,
self
->
tensor
.
place
(),
blocking
);
}
}
...
...
@@ -476,16 +489,19 @@ static PyObject* tensor_method_clone(TensorObject* self,
PyObject
*
args
,
PyObject
*
kwargs
)
{
EAGER_TRY
paddle
::
experimental
::
Tensor
out
;
{
eager_gil_scoped_release
guard
;
PADDLE_ENFORCE_EQ
(
self
->
tensor
.
initialized
(),
true
,
paddle
::
platform
::
errors
::
InvalidArgument
(
"We can only support initialized tensor in clone, however we got "
"uninitialized tensor %s, please check your code."
,
self
->
tensor
.
name
()));
PADDLE_ENFORCE_EQ
(
self
->
tensor
.
initialized
(),
true
,
paddle
::
platform
::
errors
::
InvalidArgument
(
"We can only support initialized tensor in clone, however we got "
"uninitialized tensor %s, please check your code."
,
self
->
tensor
.
name
()));
auto
out
=
assign_ad_func
(
self
->
tensor
);
out
=
assign_ad_func
(
self
->
tensor
);
}
return
ToPyObject
(
out
);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
...
...
@@ -495,6 +511,7 @@ static PyObject* tensor_retain_grads(TensorObject* self,
PyObject
*
kwargs
)
{
EAGER_TRY
if
(
egr
::
Controller
::
Instance
().
HasGrad
())
{
eager_gil_scoped_release
guard
;
auto
meta
=
egr
::
EagerUtils
::
autograd_meta
(
&
(
self
->
tensor
));
if
(
!
meta
->
GetMutableGradNode
())
{
VLOG
(
6
)
<<
"Make grad node of tensor: "
<<
self
->
tensor
.
name
()
...
...
@@ -535,6 +552,7 @@ static PyObject* tensor_clear_gradient(TensorObject* self,
}
if
(
grad
->
impl
())
{
eager_gil_scoped_release
guard
;
if
(
grad
->
is_selected_rows
())
{
auto
selected_rows
=
std
::
dynamic_pointer_cast
<
phi
::
SelectedRows
>
(
grad
->
impl
());
...
...
@@ -577,6 +595,7 @@ static PyObject* tensor__zero_grads(TensorObject* self,
VLOG
(
4
)
<<
"ZeroGrads "
<<
self
->
tensor
.
name
();
if
(
egr
::
egr_utils_api
::
IsLeafTensor
(
self
->
tensor
))
{
eager_gil_scoped_release
guard
;
// Add RetainGrad as PostHook to AccumulationNode
paddle
::
experimental
::
Tensor
*
grad
=
egr
::
EagerUtils
::
mutable_grad
(
self
->
tensor
);
...
...
@@ -595,6 +614,7 @@ static PyObject* tensor__zero_grads(TensorObject* self,
}
}
}
else
{
eager_gil_scoped_release
guard
;
auto
meta
=
egr
::
EagerUtils
::
unsafe_autograd_meta
(
self
->
tensor
);
if
(
meta
->
MutableGrad
()
->
initialized
())
{
if
(
meta
->
MutableGrad
()
->
is_dense_tensor
())
{
...
...
@@ -855,6 +875,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
decrease_axis
.
end
());
if
(
op_type
==
"slice"
)
{
eager_gil_scoped_release
guard
;
out
=
slice_ad_func
(
self
->
tensor
,
slice_axes_tmp
,
slice_starts
,
...
...
@@ -862,6 +883,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
infer_flags_tmp
,
decrease_axis_tmp
);
}
else
if
(
op_type
==
"strided_slice"
)
{
eager_gil_scoped_release
guard
;
out
=
strided_slice_ad_func
(
self
->
tensor
,
slice_axes
,
slice_starts
,
slice_ends
,
slice_strides
);
}
else
{
...
...
@@ -886,28 +908,31 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
none_axes
.
pop_back
();
}
if
(
!
none_axes
.
empty
())
{
// Deal with cases that decrease_axes is not empty
// For example:
// # x.shape: (2,3,4)
// out = x[0, 0:2, None] # out.shape : (2, 1, 4)
for
(
auto
&
axis
:
none_axes
)
{
int
len
=
0
;
for
(
int
da
:
decrease_axis
)
{
if
(
da
<
axis
)
{
len
++
;
paddle
::
experimental
::
Tensor
new_out
;
{
eager_gil_scoped_release
guard
;
// Deal with cases that decrease_axes is not empty
// For example:
// # x.shape: (2,3,4)
// out = x[0, 0:2, None] # out.shape : (2, 1, 4)
for
(
auto
&
axis
:
none_axes
)
{
int
len
=
0
;
for
(
int
da
:
decrease_axis
)
{
if
(
da
<
axis
)
{
len
++
;
}
}
axis
-=
len
;
}
axis
-=
len
;
new_out
=
unsqueeze_ad_func
(
out
,
none_axes
)
;
}
paddle
::
experimental
::
Tensor
new_out
;
new_out
=
unsqueeze_ad_func
(
out
,
none_axes
);
return
ToPyObject
(
new_out
);
}
}
// the index is a list
if
(
list_select_flag
)
{
eager_gil_scoped_release
guard
;
auto
select_index
=
paddle
::
experimental
::
Tensor
(
egr
::
Controller
::
Instance
().
GenerateUniqueName
());
auto
idx_tensor
=
std
::
make_shared
<
phi
::
DenseTensor
>
();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录