Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
b9d7bd48
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
b9d7bd48
编写于
9月 21, 2018
作者:
T
tensor-tang
提交者:
GitHub
9月 21, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into remove/kwargs
上级
6d2ce745
6537b175
变更
16
隐藏空白更改
内联
并排
Showing
16 changed file
with
605 addition
and
432 deletion
+605
-432
paddle/fluid/API.spec
paddle/fluid/API.spec
+6
-6
paddle/fluid/framework/details/cow_ptr.h
paddle/fluid/framework/details/cow_ptr.h
+61
-23
paddle/fluid/framework/details/cow_ptr_test.cc
paddle/fluid/framework/details/cow_ptr_test.cc
+0
-8
paddle/fluid/framework/details/reference_count_op_handle.h
paddle/fluid/framework/details/reference_count_op_handle.h
+28
-13
paddle/fluid/framework/details/reference_count_pass.cc
paddle/fluid/framework/details/reference_count_pass.cc
+64
-11
paddle/fluid/framework/mixed_vector.h
paddle/fluid/framework/mixed_vector.h
+241
-326
paddle/fluid/operators/adam_op.h
paddle/fluid/operators/adam_op.h
+31
-13
paddle/fluid/operators/detection_map_op.h
paddle/fluid/operators/detection_map_op.h
+13
-15
paddle/fluid/operators/extract_rows_op.cc
paddle/fluid/operators/extract_rows_op.cc
+1
-1
paddle/fluid/operators/math/selected_rows_functor.cu
paddle/fluid/operators/math/selected_rows_functor.cu
+6
-4
paddle/fluid/operators/sum_op.h
paddle/fluid/operators/sum_op.h
+1
-0
python/paddle/fluid/initializer.py
python/paddle/fluid/initializer.py
+2
-2
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+148
-0
python/paddle/fluid/layers/ops.py
python/paddle/fluid/layers/ops.py
+0
-6
python/paddle/fluid/tests/unittests/op_test.py
python/paddle/fluid/tests/unittests/op_test.py
+1
-1
python/paddle/fluid/tests/unittests/test_detection_map_op.py
python/paddle/fluid/tests/unittests/test_detection_map_op.py
+2
-3
未找到文件。
paddle/fluid/API.spec
浏览文件 @
b9d7bd48
...
...
@@ -160,6 +160,12 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None))
paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
paddle.fluid.layers.stanh ArgSpec(args=['x', 'scale_a', 'scale_b', 'name'], varargs=None, keywords=None, defaults=(0.6666666666666666, 1.7159, None))
paddle.fluid.layers.hard_sigmoid ArgSpec(args=['x', 'slope', 'offset', 'name'], varargs=None, keywords=None, defaults=(0.2, 0.5, None))
paddle.fluid.layers.swish ArgSpec(args=['x', 'beta', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.brelu ArgSpec(args=['x', 't_min', 't_max', 'name'], varargs=None, keywords=None, defaults=(0.0, 24.0, None))
paddle.fluid.layers.leaky_relu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(0.02, None))
...
...
@@ -260,12 +266,6 @@ paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', de
paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.softshrink ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.elu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.relu6 ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.pow ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.stanh ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.hard_sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.swish ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.exp ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
...
...
paddle/fluid/framework/details/cow_ptr.h
浏览文件 @
b9d7bd48
...
...
@@ -20,41 +20,79 @@ namespace paddle {
namespace
framework
{
namespace
details
{
template
<
class
T
>
class
COWPtr
{
// Change it to thread safe flags if needed.
class
ThreadUnsafeOwnershipFlags
{
public:
typedef
std
::
shared_ptr
<
T
>
RefPtr
;
explicit
ThreadUnsafeOwnershipFlags
(
bool
flag
)
:
flag_
(
flag
)
{}
private:
RefPtr
m_sp
;
ThreadUnsafeOwnershipFlags
(
const
ThreadUnsafeOwnershipFlags
&
other
)
=
delete
;
ThreadUnsafeOwnershipFlags
&
operator
=
(
const
ThreadUnsafeOwnershipFlags
&
other
)
=
delete
;
ThreadUnsafeOwnershipFlags
(
ThreadUnsafeOwnershipFlags
&&
other
)
=
default
;
void
detach
()
{
T
*
tmp
=
m_sp
.
get
();
if
(
!
(
tmp
==
nullptr
||
m_sp
.
unique
()))
{
m_sp
=
RefPtr
(
new
T
(
*
tmp
));
void
SetOwnership
(
bool
flag
)
{
flag_
=
flag
;
}
// Invoke the callback if it is not owned.
template
<
typename
Callback
>
void
AcquireOwnershipOnce
(
Callback
acquire
)
{
if
(
!
flag_
)
{
acquire
();
flag_
=
true
;
}
}
public:
COWPtr
()
:
m_sp
(
nullptr
)
{}
explicit
COWPtr
(
T
*
t
)
:
m_sp
(
t
)
{}
explicit
COWPtr
(
const
RefPtr
&
refptr
)
:
m_sp
(
refptr
)
{}
private:
bool
flag_
;
};
const
T
&
Data
()
const
{
return
operator
*
();
}
// Copy-On-Write pointer.
// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
//
// The template parameter OwnershipFlags should have:
// * a constructor takes a bool. True if own.
// * SetOwnership(bool flag).
// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
// owned.
//
// https://en.wikipedia.org/wiki/Copy-on-write
template
<
typename
T
,
typename
OwnershipFlags
=
ThreadUnsafeOwnershipFlags
>
class
COWPtr
{
public:
// Ctor from raw pointer.
explicit
COWPtr
(
T
*
ptr
)
:
payload_
(
ptr
),
ownership_
{
true
}
{}
T
*
MutableData
()
{
return
operator
->
();
}
// Move methods. Steal ownership from origin
COWPtr
(
COWPtr
&&
other
)
:
payload_
(
other
.
payload_
),
ownership_
{
std
::
move
(
other
.
ownership_
)}
{}
COWPtr
&
operator
=
(
COWPtr
&&
origin
)
=
default
;
const
T
&
operator
*
()
const
{
return
*
m_sp
;
}
T
&
operator
*
()
{
detach
();
return
*
m_sp
;
// Copy methods. Not own payload
COWPtr
(
const
COWPtr
&
other
)
:
payload_
(
other
.
payload_
),
ownership_
{
false
}
{}
COWPtr
&
operator
=
(
const
COWPtr
&
other
)
{
payload_
=
other
.
payload_
;
ownership_
.
SetOwnership
(
false
);
return
*
this
;
}
const
T
*
operator
->
()
const
{
return
m_sp
.
operator
->
();
}
T
*
operator
->
()
{
detach
();
return
m_sp
.
operator
->
();
// Access read only data.
const
T
&
Data
()
const
{
return
*
payload_
;
}
// Access mutable data. If the data is not owned, the data will be copied
// before.
T
*
MutableData
()
{
ownership_
.
AcquireOwnershipOnce
(
[
this
]
{
payload_
.
reset
(
new
T
(
*
payload_
));
});
return
payload_
.
get
();
}
private:
// Actual data pointer.
std
::
shared_ptr
<
T
>
payload_
;
// Ownership flag.
OwnershipFlags
ownership_
;
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/cow_ptr_test.cc
浏览文件 @
b9d7bd48
...
...
@@ -30,14 +30,6 @@ TEST(COWPtr, all) {
ASSERT_EQ
(
ptr2
.
Data
(),
10
);
}
// Writing through the first handle after it has been copied must leave the
// value seen through the copy unchanged.
TEST(COWPtr, change_old) {
  COWPtr<int> source(new int{0});
  COWPtr<int> snapshot = source;

  // Mutate via the handle that was copied from.
  int *writable = source.MutableData();
  *writable = 10;

  ASSERT_EQ(snapshot.Data(), 0);
  ASSERT_EQ(source.Data(), 10);
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/reference_count_op_handle.h
浏览文件 @
b9d7bd48
...
...
@@ -22,6 +22,7 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
namespace
paddle
{
...
...
@@ -46,17 +47,15 @@ class ReferenceCountOpHandle : public OpHandleBase {
const
std
::
vector
<
std
::
string
>
&
var_names
,
GarbageCollector
<
Tensor
>
*
gc
,
AtomicReferenceCountMap
*
ref_cnts
)
:
OpHandleBase
(
node
),
scope_
(
scope
),
var_names_
(
var_names
),
gc_
(
gc
),
ref_cnts_
(
ref_cnts
)
{
:
OpHandleBase
(
node
),
scope_
(
scope
),
gc_
(
gc
),
ref_cnts_
(
ref_cnts
)
{
dev_ctx_
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
if
(
IsStreamGarabageCollector
())
{
PADDLE_ENFORCE
(
cudaSetDevice
(
place
.
device
));
PADDLE_ENFORCE
(
cudaEventCreateWithFlags
(
&
event_
,
cudaEventDisableTiming
));
}
for
(
auto
&
name
:
var_names
)
AddVar
(
name
);
}
~
ReferenceCountOpHandle
()
{
...
...
@@ -69,19 +68,35 @@ class ReferenceCountOpHandle : public OpHandleBase {
// Returns the fixed label "reference_count" for this op handle.
std::string Name() const override {
  return std::string("reference_count");
}
// Records one more occurrence of `name` in var_names_: an existing entry is
// incremented, a missing entry ends up with a count of 1.
void AddVar(const std::string &name) {
  // operator[] value-initializes a missing int entry to 0, so one increment
  // covers both the "already present" and the "first time" case.
  ++var_names_[name];
}
protected:
void
RunImpl
()
override
{
auto
*
exec_scope
=
scope_
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
();
std
::
vector
<
LoDTensor
*>
tensors
;
for
(
auto
&
name
:
var_names_
)
{
std
::
vector
<
Tensor
*>
tensors
;
for
(
auto
&
pair
:
var_names_
)
{
auto
&
name
=
pair
.
first
;
auto
it
=
ref_cnts_
->
find
(
name
);
if
(
it
==
ref_cnts_
->
end
())
continue
;
auto
*
var
=
exec_scope
->
FindVar
(
name
);
if
(
var
==
nullptr
||
!
var
->
IsType
<
LoDTensor
>
())
continue
;
if
(
it
->
second
.
fetch_sub
(
1
)
<=
1
)
{
tensors
.
emplace_back
(
var
->
GetMutable
<
LoDTensor
>
());
if
(
var
==
nullptr
)
continue
;
if
(
var
->
IsType
<
LoDTensor
>
())
{
if
(
it
->
second
.
fetch_sub
(
pair
.
second
)
<=
pair
.
second
)
{
tensors
.
emplace_back
(
var
->
GetMutable
<
LoDTensor
>
());
}
}
else
if
(
var
->
IsType
<
SelectedRows
>
())
{
if
(
it
->
second
.
fetch_sub
(
pair
.
second
)
<=
pair
.
second
)
{
tensors
.
emplace_back
(
var
->
GetMutable
<
SelectedRows
>
()
->
mutable_value
());
}
}
}
...
...
@@ -91,7 +106,7 @@ class ReferenceCountOpHandle : public OpHandleBase {
}
private:
void
ClearTensors
(
const
std
::
vector
<
LoD
Tensor
*>
&
tensors
)
{
void
ClearTensors
(
const
std
::
vector
<
Tensor
*>
&
tensors
)
{
auto
*
gc
=
dynamic_cast
<
StreamGarbageCollector
<
Tensor
>
*>
(
gc_
);
if
(
gc
!=
nullptr
)
{
auto
compute_stream
=
dev_ctx_
->
stream
();
...
...
@@ -112,7 +127,7 @@ class ReferenceCountOpHandle : public OpHandleBase {
const
Scope
*
scope_
;
platform
::
CUDADeviceContext
*
dev_ctx_
;
std
::
vector
<
std
::
string
>
var_names_
;
std
::
unordered_map
<
std
::
string
,
int
>
var_names_
;
GarbageCollector
<
Tensor
>
*
gc_
;
// not own
AtomicReferenceCountMap
*
ref_cnts_
;
// not own
cudaEvent_t
event_
;
...
...
paddle/fluid/framework/details/reference_count_pass.cc
浏览文件 @
b9d7bd48
...
...
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <queue>
#include <string>
#include <vector>
...
...
@@ -23,6 +24,25 @@ namespace paddle {
namespace
framework
{
namespace
details
{
// Breadth-first walk that starts at `var_in`, looks at the ops pending on each
// visited variable and then enqueues the outputs of those ops.
//
// Returns the first pending op that is a ComputationOpHandle and whose
// GetPlace() equals var_in->place_; returns nullptr when the queue drains
// without finding one.
static ComputationOpHandle *FindNextComputationOpHandle(VarHandle *var_in) {
  std::queue<VarHandleBase *> frontier;
  frontier.push(var_in);

  // The queue is seeded with one element, so the loop body always runs at
  // least once.
  while (!frontier.empty()) {
    VarHandleBase *current = frontier.front();
    frontier.pop();

    for (auto *pending_op : current->PendingOps()) {
      auto *candidate = dynamic_cast<ComputationOpHandle *>(pending_op);
      const bool on_same_place =
          candidate != nullptr && candidate->GetPlace() == var_in->place_;
      if (on_same_place) {
        return candidate;
      }

      // Not a match: keep searching behind this op.
      for (auto *produced : pending_op->Outputs()) {
        frontier.push(produced);
      }
    }
  }
  return nullptr;
}
std
::
unique_ptr
<
ir
::
Graph
>
ReferenceCountPass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
auto
&
ref_cnts
=
Get
<
DeviceReferenceCountMap
>
(
kGlobalReferenceCount
);
...
...
@@ -34,6 +54,9 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
// Step 2: Find all variables in non-computation ops which refers to variables
// in computation ops
std
::
unordered_set
<
std
::
string
>
names
;
std
::
unordered_map
<
OpHandleBase
*
,
std
::
unique_ptr
<
ReferenceCountOpHandle
>>
compute_ref_cnt_map
;
auto
get_ref_cnts_from_compute_op
=
[
&
](
const
std
::
unique_ptr
<
OpHandleBase
>
&
op
,
const
std
::
vector
<
VarHandleBase
*>
&
vars
)
{
...
...
@@ -54,15 +77,18 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
VarDesc
*
var_desc
=
var_handle
->
Node
()
->
Var
();
auto
var_name
=
var_handle
->
Node
()
->
Name
();
// This is w
ie
rd but there is really some variables without var_desc
// This is w
ei
rd but there is really some variables without var_desc
// in computation_op
if
(
var_desc
==
nullptr
)
{
if
(
compute_op
->
Node
()
->
Op
()
->
Block
()
->
FindVar
(
var_name
)
==
nullptr
)
continue
;
}
else
{
if
(
var_desc
->
Persistable
()
||
var_desc
->
Proto
()
->
type
().
type
()
!=
proto
::
VarType
::
LOD_TENSOR
)
if
(
var_desc
->
Persistable
())
continue
;
auto
var_type
=
var_desc
->
Proto
()
->
type
().
type
();
if
(
var_type
!=
proto
::
VarType
::
LOD_TENSOR
&&
var_type
!=
proto
::
VarType
::
SELECTED_ROWS
)
{
continue
;
}
}
// compute op only runs in one device
...
...
@@ -93,12 +119,33 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
if
(
ref_cnts
.
count
(
place
.
device
)
&&
ref_cnts
[
place
.
device
]
->
count
(
var_name
))
{
++
(
*
ref_cnts
[
place
.
device
])[
var_name
];
auto
*
next_compute_op
=
FindNextComputationOpHandle
(
var_handle
);
if
(
next_compute_op
!=
nullptr
)
{
if
(
compute_ref_cnt_map
.
count
(
next_compute_op
))
{
compute_ref_cnt_map
[
next_compute_op
]
->
AddVar
(
var_name
);
VLOG
(
5
)
<<
"Add reference count of "
<<
var_name
<<
" to Operator "
<<
next_compute_op
->
Name
();
}
else
{
// Create new reference_count_op_handle
ir
::
Node
*
ref_cnt_node
=
graph
->
CreateEmptyNode
(
"reference_count"
,
ir
::
Node
::
Type
::
kOperation
);
auto
*
ref_cnt_handle
=
new
ReferenceCountOpHandle
(
ref_cnt_node
,
next_compute_op
->
GetScope
(),
place
,
{
var_name
},
gcs
[
place
.
device
].
get
(),
cur_ref_cnts
[
place
.
device
].
get
());
if
(
next_compute_op
->
Outputs
().
empty
())
{
auto
*
dep_var
=
new
DummyVarHandle
(
graph
->
CreateControlDepVar
());
next_compute_op
->
AddOutput
(
dep_var
);
graph
->
Get
<
GraphDepVars
>
(
kGraphDepVars
).
emplace
(
dep_var
);
}
ref_cnt_handle
->
AddInput
(
next_compute_op
->
Outputs
().
front
());
compute_ref_cnt_map
[
next_compute_op
].
reset
(
ref_cnt_handle
);
}
}
}
}
};
std
::
unordered_map
<
OpHandleBase
*
,
ReferenceCountOpHandle
*>
compute_ref_cnt_map
;
auto
&
all_ops
=
graph
->
Get
<
GraphOps
>
(
kGraphOps
);
for
(
auto
&
op
:
all_ops
)
{
auto
in_var_names
=
get_ref_cnts_from_compute_op
(
op
,
op
->
Inputs
());
...
...
@@ -113,11 +160,13 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
auto
*
ref_cnt_handle
=
new
ReferenceCountOpHandle
(
ref_cnt_node
,
compute_op
->
GetScope
(),
place
,
in_var_names
,
gcs
[
place
.
device
].
get
(),
cur_ref_cnts
[
place
.
device
].
get
());
auto
*
dep_var
=
new
DummyVarHandle
(
graph
->
CreateControlDepVar
());
compute_op
->
AddOutput
(
dep_var
);
ref_cnt_handle
->
AddInput
(
dep_var
);
graph
->
Get
<
GraphDepVars
>
(
kGraphDepVars
).
emplace
(
dep_var
);
compute_ref_cnt_map
[
compute_op
]
=
ref_cnt_handle
;
if
(
compute_op
->
Outputs
().
empty
())
{
auto
*
dep_var
=
new
DummyVarHandle
(
graph
->
CreateControlDepVar
());
compute_op
->
AddOutput
(
dep_var
);
graph
->
Get
<
GraphDepVars
>
(
kGraphDepVars
).
emplace
(
dep_var
);
}
ref_cnt_handle
->
AddInput
(
compute_op
->
Outputs
().
front
());
compute_ref_cnt_map
[
compute_op
].
reset
(
ref_cnt_handle
);
}
for
(
auto
&
op
:
all_ops
)
{
...
...
@@ -131,7 +180,11 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
new_all_ops
.
emplace_back
(
std
::
move
(
op
));
auto
it
=
compute_ref_cnt_map
.
find
(
new_all_ops
.
back
().
get
());
if
(
it
!=
compute_ref_cnt_map
.
end
())
{
new_all_ops
.
emplace_back
(
it
->
second
);
// Add LeafNode to ReferenceCountOpHandle
auto
*
dummy_leaf
=
new
DummyVarHandle
(
graph
->
CreateControlDepVar
());
graph
->
Get
<
GraphDepVars
>
(
kGraphDepVars
).
emplace
(
dummy_leaf
);
it
->
second
->
AddOutput
(
dummy_leaf
);
new_all_ops
.
emplace_back
(
std
::
move
(
it
->
second
));
}
}
...
...
paddle/fluid/framework/mixed_vector.h
浏览文件 @
b9d7bd48
...
...
@@ -17,12 +17,10 @@
#include <algorithm>
#include <initializer_list>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/memcpy.h"
#include "glog/logging.h"
...
...
@@ -30,401 +28,206 @@ namespace paddle {
namespace
framework
{
#if defined(PADDLE_WITH_CUDA)
namespace
details
{
// Owner of one raw allocation obtained from memory::Alloc on a CUDAPlace.
// The allocation is released with memory::Free when the buffer is destroyed
// or resized. Copying is disabled; Swap exchanges two buffers' contents.
struct CUDABuffer {
  void *data_{nullptr};        // start of the allocation; nullptr when empty
  size_t size_{0};             // size that was passed to memory::Alloc
  platform::CUDAPlace place_;  // place used for both Alloc and Free

  CUDABuffer() {}

  // Allocates `size` on `place`. `place` is unwrapped with
  // boost::get<platform::CUDAPlace>, so it must hold a CUDAPlace.
  CUDABuffer(platform::Place place, size_t size)
      : size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
    data_ = memory::Alloc(place_, size);
  }

  ~CUDABuffer() { ClearMemory(); }

  CUDABuffer(const CUDABuffer &o) = delete;
  CUDABuffer &operator=(const CUDABuffer &o) = delete;

  // Drops the current allocation (if any) and allocates `size` on `place`.
  // The previous contents are not preserved.
  void Resize(platform::Place place, size_t size) {
    // ClearMemory() leaves the buffer empty, so if the unwrap or the
    // allocation below fails the destructor will not free the old pointer a
    // second time.
    ClearMemory();
    place_ = boost::get<platform::CUDAPlace>(place);
    data_ = memory::Alloc(place_, size);
    size_ = size;
  }

  // Exchanges pointer, place and size with `o`.
  void Swap(CUDABuffer &o) {
    std::swap(data_, o.data_);
    std::swap(place_, o.place_);
    std::swap(size_, o.size_);
  }

 private:
  // Frees the allocation, if there is one, and resets the buffer to the empty
  // state so the pointer can never be released twice.
  void ClearMemory() {
    if (data_) {
      memory::Free(place_, data_);
      data_ = nullptr;
      size_ = 0;
    }
  }
};
}
// namespace details
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template
<
typename
T
>
class
Vector
{
public:
using
value_type
=
T
;
using
iterator
=
typename
std
::
vector
<
T
>::
iterator
;
using
const_iterator
=
typename
std
::
vector
<
T
>::
const_iterator
;
private:
// The actual class to implement vector logic
class
VectorData
{
public:
VectorData
()
:
flag_
(
kDataInCPU
)
{}
VectorData
(
size_t
count
,
const
T
&
value
)
:
cpu_
(
count
,
value
),
flag_
(
kDataInCPU
)
{}
VectorData
(
std
::
initializer_list
<
T
>
init
)
:
cpu_
(
init
),
flag_
(
kDataInCPU
)
{}
template
<
typename
U
>
explicit
VectorData
(
const
std
::
vector
<
U
>
&
dat
)
:
cpu_
(
dat
),
flag_
(
kDataInCPU
)
{}
VectorData
(
const
VectorData
&
o
)
{
o
.
ImmutableCPU
();
cpu_
=
o
.
cpu_
;
flag_
=
kDataInCPU
;
}
VectorData
&
operator
=
(
const
VectorData
&
o
)
{
o
.
ImmutableCPU
();
cpu_
=
o
.
cpu_
;
flag_
=
kDataInCPU
;
details
::
CUDABuffer
null
;
gpu_
.
Swap
(
null
);
return
*
this
;
}
T
&
operator
[](
size_t
i
)
{
MutableCPU
();
return
cpu_
[
i
];
}
const
T
&
operator
[](
size_t
i
)
const
{
ImmutableCPU
();
return
cpu_
[
i
];
}
size_t
size
()
const
{
return
cpu_
.
size
();
}
iterator
begin
()
{
MutableCPU
();
return
cpu_
.
begin
();
}
iterator
end
()
{
MutableCPU
();
return
cpu_
.
end
();
}
T
&
front
()
{
MutableCPU
();
return
cpu_
.
front
();
}
T
&
back
()
{
MutableCPU
();
return
cpu_
.
back
();
}
const_iterator
begin
()
const
{
ImmutableCPU
();
return
cpu_
.
begin
();
}
const_iterator
end
()
const
{
ImmutableCPU
();
return
cpu_
.
end
();
}
const
T
&
back
()
const
{
ImmutableCPU
();
return
cpu_
.
back
();
}
T
*
data
()
{
return
&
(
*
this
)[
0
];
}
const
T
*
data
()
const
{
return
&
(
*
this
)[
0
];
}
const
T
&
front
()
const
{
ImmutableCPU
();
return
cpu_
.
front
();
}
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template
<
typename
Iter
>
void
assign
(
Iter
begin
,
Iter
end
)
{
MutableCPU
();
cpu_
.
assign
(
begin
,
end
);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void
push_back
(
T
elem
)
{
MutableCPU
();
cpu_
.
push_back
(
elem
);
}
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template
<
typename
It
>
void
Extend
(
It
begin
,
It
end
)
{
MutableCPU
();
auto
out_it
=
std
::
back_inserter
<
std
::
vector
<
T
>>
(
this
->
cpu_
);
std
::
copy
(
begin
,
end
,
out_it
);
}
// resize the vector
void
resize
(
size_t
size
)
{
MutableCPU
();
cpu_
.
resize
(
size
);
}
// get cuda ptr. immutable
const
T
*
CUDAData
(
platform
::
Place
place
)
const
{
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
place
),
"CUDA Data must on CUDA place"
);
ImmutableCUDA
(
place
);
return
reinterpret_cast
<
T
*>
(
gpu_
.
data_
);
}
// get cuda ptr. mutable
T
*
CUDAMutableData
(
platform
::
Place
place
)
{
const
T
*
ptr
=
CUDAData
(
place
);
flag_
=
kDirty
|
kDataInCUDA
;
return
const_cast
<
T
*>
(
ptr
);
}
// clear
void
clear
()
{
cpu_
.
clear
();
flag_
=
kDirty
|
kDataInCPU
;
}
size_t
capacity
()
const
{
return
cpu_
.
capacity
();
}
// reserve data
void
reserve
(
size_t
size
)
{
cpu_
.
reserve
(
size
);
}
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator
std
::
vector
<
T
>
()
const
{
ImmutableCPU
();
return
cpu_
;
}
bool
operator
==
(
const
VectorData
&
other
)
const
{
ImmutableCPU
();
other
.
ImmutableCPU
();
return
cpu_
==
other
.
cpu_
;
}
private:
enum
DataFlag
{
kDataInCPU
=
0x01
,
kDataInCUDA
=
0x02
,
// kDirty means the data has been changed in one device.
kDirty
=
0x10
};
void
CopyToCPU
()
const
{
// COPY GPU Data To CPU
void
*
src
=
gpu_
.
data_
;
void
*
dst
=
cpu_
.
data
();
memory
::
Copy
(
platform
::
CPUPlace
(),
dst
,
gpu_
.
place_
,
src
,
gpu_
.
size_
,
nullptr
);
}
void
MutableCPU
()
{
if
(
IsInCUDA
()
&&
IsDirty
())
{
CopyToCPU
();
}
flag_
=
kDirty
|
kDataInCPU
;
}
void
ImmutableCUDA
(
platform
::
Place
place
)
const
{
if
(
IsDirty
())
{
if
(
IsInCPU
())
{
CopyCPUDataToCUDA
(
place
);
UnsetFlag
(
kDirty
);
SetFlag
(
kDataInCUDA
);
}
else
if
(
IsInCUDA
()
&&
!
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place
)
==
gpu_
.
place_
))
{
CopyCUDADataToAnotherPlace
(
place
);
// Still dirty
}
else
{
// Dirty && DataInCUDA && Device is same
// Do nothing
}
}
else
{
if
(
!
IsInCUDA
())
{
// Even data is not dirty. However, data is not in CUDA. Copy data.
CopyCPUDataToCUDA
(
place
);
SetFlag
(
kDataInCUDA
);
}
else
if
(
!
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place
)
==
gpu_
.
place_
))
{
CopyCUDADataToAnotherPlace
(
place
);
}
else
{
// Not Dirty && DataInCUDA && Device is same
// Do nothing.
}
}
}
void
CopyCUDADataToAnotherPlace
(
const
platform
::
Place
&
place
)
const
{
details
::
CUDABuffer
tmp
(
place
,
gpu_
.
size_
);
const
void
*
src
=
gpu_
.
data_
;
void
*
dst
=
tmp
.
data_
;
memory
::
Copy
(
tmp
.
place_
,
dst
,
gpu_
.
place_
,
src
,
gpu_
.
size_
,
nullptr
);
gpu_
.
Swap
(
tmp
);
}
void
CopyCPUDataToCUDA
(
const
platform
::
Place
&
place
)
const
{
void
*
src
=
cpu_
.
data
();
gpu_
.
Resize
(
place
,
cpu_
.
size
()
*
sizeof
(
T
));
void
*
dst
=
gpu_
.
data_
;
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
))
->
stream
();
memory
::
Copy
(
gpu_
.
place_
,
dst
,
platform
::
CPUPlace
(),
src
,
gpu_
.
size_
,
stream
);
}
void
ImmutableCPU
()
const
{
if
(
IsDirty
()
&&
!
IsInCPU
())
{
// If data has been changed in CUDA, or
// CPU has no data.
CopyToCPU
();
UnsetFlag
(
kDirty
);
}
SetFlag
(
kDataInCPU
);
}
void
UnsetFlag
(
int
flag
)
const
{
flag_
&=
~
flag
;
}
void
SetFlag
(
int
flag
)
const
{
flag_
|=
flag
;
}
bool
IsDirty
()
const
{
return
flag_
&
kDirty
;
}
bool
IsInCUDA
()
const
{
return
flag_
&
kDataInCUDA
;
}
bool
IsInCPU
()
const
{
return
flag_
&
kDataInCPU
;
}
mutable
std
::
vector
<
T
>
cpu_
;
mutable
details
::
CUDABuffer
gpu_
;
mutable
int
flag_
;
};
public:
// Default ctor. Create empty Vector
Vector
()
:
m_
(
new
VectorData
())
{
}
Vector
()
{
InitEmpty
();
}
// Fill vector with value. The vector size is `count`.
explicit
Vector
(
size_t
count
,
const
T
&
value
=
T
())
:
m_
(
new
VectorData
(
count
,
value
))
{}
explicit
Vector
(
size_t
count
,
const
T
&
value
=
T
())
{
InitEmpty
();
if
(
count
!=
0
)
{
resize
(
count
);
T
*
ptr
=
begin
();
for
(
size_t
i
=
0
;
i
<
count
;
++
i
)
{
ptr
[
i
]
=
value
;
}
}
}
// Ctor with init_list
Vector
(
std
::
initializer_list
<
T
>
init
)
:
m_
(
new
VectorData
(
init
))
{}
Vector
(
std
::
initializer_list
<
T
>
init
)
{
if
(
init
.
size
()
==
0
)
{
InitEmpty
();
}
else
{
InitByIter
(
init
.
size
(),
init
.
begin
(),
init
.
end
());
}
}
// implicit cast from std::vector.
template
<
typename
U
>
Vector
(
const
std
::
vector
<
U
>
&
dat
)
:
m_
(
new
VectorData
(
dat
))
{
// NOLINT
Vector
(
const
std
::
vector
<
U
>
&
dat
)
{
// NOLINT
if
(
dat
.
size
()
==
0
)
{
InitEmpty
();
}
else
{
InitByIter
(
dat
.
size
(),
dat
.
begin
(),
dat
.
end
());
}
}
// Copy ctor
Vector
(
const
Vector
<
T
>
&
other
)
{
m_
=
other
.
m_
;
}
Vector
(
const
Vector
<
T
>
&
other
)
{
this
->
operator
=
(
other
)
;
}
// Copy operator
Vector
<
T
>
&
operator
=
(
const
Vector
<
T
>
&
other
)
{
m_
=
other
.
m_
;
if
(
other
.
size
()
!=
0
)
{
this
->
InitByIter
(
other
.
size
(),
other
.
begin
(),
other
.
end
());
}
else
{
InitEmpty
();
}
return
*
this
;
}
// Move ctor
Vector
(
Vector
<
T
>
&&
other
)
{
m_
=
std
::
move
(
other
.
m_
);
}
Vector
(
Vector
<
T
>
&&
other
)
{
this
->
size_
=
other
.
size_
;
this
->
flag_
=
other
.
flag_
;
if
(
other
.
cuda_vec_
.
memory_size
())
{
this
->
cuda_vec_
.
ShareDataWith
(
other
.
cuda_vec_
);
}
if
(
other
.
cpu_vec_
.
memory_size
())
{
this
->
cpu_vec_
.
ShareDataWith
(
other
.
cpu_vec_
);
}
}
// CPU data access method. Mutable.
T
&
operator
[](
size_t
i
)
{
return
(
*
m_
)[
i
];
}
T
&
operator
[](
size_t
i
)
{
MutableCPU
();
return
const_cast
<
T
*>
(
cpu_vec_
.
data
<
T
>
())[
i
];
}
// CPU data access method. Immutable.
const
T
&
operator
[](
size_t
i
)
const
{
return
(
*
m_
)[
i
];
}
const
T
&
operator
[](
size_t
i
)
const
{
ImmutableCPU
();
return
cpu_vec_
.
data
<
T
>
()[
i
];
}
// std::vector iterator methods. Based on CPU data access method
size_t
size
()
const
{
return
m_
->
size
()
;
}
size_t
size
()
const
{
return
size_
;
}
iterator
begin
()
{
return
m_
->
begin
(
);
}
T
*
begin
()
{
return
capacity
()
==
0
?
&
EmptyDummy
()
:
&
this
->
operator
[](
0
);
}
iterator
end
()
{
return
m_
->
end
();
}
T
*
end
()
{
return
capacity
()
==
0
?
&
EmptyDummy
()
:
&
this
->
operator
[](
size
());
}
T
&
front
()
{
return
m_
->
front
();
}
T
&
front
()
{
return
*
begin
();
}
T
&
back
()
{
return
m_
->
back
();
}
T
&
back
()
{
auto
it
=
end
();
--
it
;
return
*
it
;
}
const_iterator
begin
()
const
{
return
m_
->
begin
();
}
const
T
*
begin
()
const
{
return
capacity
()
==
0
?
&
EmptyDummy
()
:
&
this
->
operator
[](
0
);
}
const_iterator
end
()
const
{
return
m_
->
end
();
}
const
T
*
end
()
const
{
return
capacity
()
==
0
?
&
EmptyDummy
()
:
&
this
->
operator
[](
size
());
}
const
_iterator
cbegin
()
const
{
return
begin
();
}
const
T
*
cbegin
()
const
{
return
begin
();
}
const
_iterator
cend
()
const
{
return
end
();
}
const
T
*
cend
()
const
{
return
end
();
}
const
T
&
back
()
const
{
return
m_
->
back
();
}
const
T
&
back
()
const
{
auto
it
=
end
();
--
it
;
return
*
it
;
}
T
*
data
()
{
return
m_
->
data
();
}
T
*
data
()
{
return
begin
();
}
const
T
*
data
()
const
{
return
m_
->
data
();
}
const
T
*
data
()
const
{
return
begin
();
}
const
T
&
front
()
const
{
return
m_
->
front
();
}
const
T
&
front
()
const
{
return
*
begin
();
}
// end of std::vector iterator methods
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template
<
typename
Iter
>
void
assign
(
Iter
begin
,
Iter
end
)
{
m_
->
assign
(
begin
,
end
);
InitByIter
(
end
-
begin
,
begin
,
end
);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void
push_back
(
T
elem
)
{
m_
->
push_back
(
elem
);
}
void
push_back
(
T
elem
)
{
if
(
size_
+
1
>
capacity
())
{
reserve
((
size_
+
1
)
<<
1
);
}
*
end
()
=
elem
;
++
size_
;
}
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template
<
typename
It
>
void
Extend
(
It
begin
,
It
end
)
{
m_
->
Extend
(
begin
,
end
);
size_t
pre_size
=
size_
;
resize
(
pre_size
+
(
end
-
begin
));
T
*
ptr
=
this
->
begin
()
+
pre_size
;
for
(;
begin
<
end
;
++
begin
,
++
ptr
)
{
*
ptr
=
*
begin
;
}
}
// resize the vector
void
resize
(
size_t
size
)
{
if
(
m_
.
Data
().
size
()
!=
size
)
{
m_
->
resize
(
size
);
if
(
size
+
1
<=
capacity
())
{
size_
=
size
;
}
else
{
MutableCPU
();
Tensor
cpu_tensor
;
platform
::
Place
cpu
=
platform
::
CPUPlace
();
T
*
ptr
=
cpu_tensor
.
mutable_data
<
T
>
(
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
size
)}),
cpu
);
const
T
*
old_ptr
=
cpu_vec_
.
memory_size
()
==
0
?
nullptr
:
cpu_vec_
.
data
<
T
>
();
if
(
old_ptr
!=
nullptr
)
{
std
::
copy
(
old_ptr
,
old_ptr
+
size_
,
ptr
);
}
size_
=
size
;
cpu_vec_
.
ShareDataWith
(
cpu_tensor
);
}
}
// get cuda ptr. immutable
const
T
*
CUDAData
(
platform
::
Place
place
)
const
{
return
m_
.
Data
().
CUDAData
(
place
);
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
place
),
"CUDA Data must on CUDA place"
);
ImmutableCUDA
(
place
);
return
cuda_vec_
.
data
<
T
>
();
}
// get cuda ptr. mutable
T
*
CUDAMutableData
(
platform
::
Place
place
)
{
return
m_
->
CUDAMutableData
(
place
);
const
T
*
ptr
=
CUDAData
(
place
);
flag_
=
kDirty
|
kDataInCUDA
;
return
const_cast
<
T
*>
(
ptr
);
}
// clear
void
clear
()
{
m_
->
clear
();
}
void
clear
()
{
size_
=
0
;
flag_
=
kDirty
|
kDataInCPU
;
}
size_t
capacity
()
const
{
return
m_
->
capacity
();
}
size_t
capacity
()
const
{
return
cpu_vec_
.
memory_size
()
/
SizeOfType
(
typeid
(
T
));
}
// reserve data
void
reserve
(
size_t
size
)
{
m_
->
reserve
(
size
);
}
void
reserve
(
size_t
size
)
{
size_t
pre_size
=
size_
;
resize
(
size
);
resize
(
pre_size
);
}
// the unify method to access CPU or CUDA data. immutable.
const
T
*
Data
(
platform
::
Place
place
)
const
{
...
...
@@ -445,7 +248,12 @@ class Vector {
}
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator
std
::
vector
<
T
>
()
const
{
return
*
m_
;
}
operator
std
::
vector
<
T
>
()
const
{
std
::
vector
<
T
>
result
;
result
.
resize
(
size
());
std
::
copy
(
begin
(),
end
(),
result
.
begin
());
return
result
;
}
bool
operator
==
(
const
Vector
<
T
>
&
other
)
const
{
if
(
size
()
!=
other
.
size
())
return
false
;
...
...
@@ -459,11 +267,118 @@ class Vector {
return
true
;
}
const
void
*
Handle
()
const
{
return
&
m_
.
Data
();
}
private:
// Vector is an COW object.
details
::
COWPtr
<
VectorData
>
m_
;
// Puts the bookkeeping into the "zero elements, data on the CPU side" state.
void InitEmpty() {
  flag_ = kDataInCPU;
  size_ = 0;
}
template
<
typename
Iter
>
void
InitByIter
(
size_t
size
,
Iter
begin
,
Iter
end
)
{
platform
::
Place
cpu
=
platform
::
CPUPlace
();
T
*
ptr
=
this
->
cpu_vec_
.
template
mutable_data
<
T
>(
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
size
)}),
cpu
);
for
(
size_t
i
=
0
;
i
<
size
;
++
i
)
{
*
ptr
++
=
*
begin
++
;
}
flag_
=
kDataInCPU
|
kDirty
;
size_
=
size
;
}
// Bit values combined in flag_.
enum DataFlag {
  kDataInCPU = 1 << 0,   // 0x01
  kDataInCUDA = 1 << 1,  // 0x02
  // kDirty means the data has been changed in one device.
  kDirty = 1 << 4        // 0x10
};
// Copies cuda_vec_ into cpu_vec_ via TensorCopy, then calls WaitPlace on the
// place cuda_vec_ lives on.
void CopyToCPU() const {
  const auto host = platform::CPUPlace();
  TensorCopy(cuda_vec_, host, &cpu_vec_);

  const auto device = cuda_vec_.place();
  WaitPlace(device);
}
// Prepares the CPU copy for modification.
// When the CUDA bit and the dirty bit are both set, the data is first
// copied back to the CPU. In every case the state then becomes exactly
// "dirty, data in CPU" (the kDataInCUDA bit is dropped by the assignment).
void MutableCPU() {
  const bool cuda_side_modified = IsInCUDA() && IsDirty();
  if (cuda_side_modified) {
    CopyToCPU();
  }
  flag_ = kDirty | kDataInCPU;
}
// Makes cuda_vec_ hold the data on the CUDA device given by `place`, for
// read-only use.
//
// `place` is converted with boost::get<platform::CUDAPlace>, so it has to
// hold a CUDAPlace. The function is const; it updates the mutable members
// cuda_vec_ and flag_.
//
// Cases, keyed on the dirty bit and on where the data currently is:
//   dirty, in CPU              : copy CPU -> `place`, clear kDirty, set
//                                kDataInCUDA.
//   dirty, in CUDA, other place: copy to `place` through a temporary tensor
//                                and share it; kDirty stays set.
//   clean, not in CUDA         : copy CPU -> `place`, set kDataInCUDA.
//   clean, in CUDA, other place: copy to `place` through a temporary tensor
//                                and share it.
//   otherwise                  : nothing to do.
//
// NOTE(review): the two cross-device branches wait differently. The dirty
// one waits only on the source place after the copy, while the clean one
// waits on the source before the copy and on both places after it. Confirm
// whether that asymmetry is intentional.
void ImmutableCUDA(platform::Place place) const {
  if (IsDirty()) {
    if (IsInCPU()) {
      // The CPU copy is marked as the modified one: upload it.
      TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
                 &cuda_vec_);
      WaitPlace(place);
      UnsetFlag(kDirty);
      SetFlag(kDataInCUDA);
    } else if (IsInCUDA() && !(place == cuda_vec_.place())) {
      // Data is on a different CUDA place: move it via a temporary tensor.
      framework::Tensor tmp;
      TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
      WaitPlace(cuda_vec_.place());
      cuda_vec_.ShareDataWith(tmp);
      // Still dirty
    } else {
      // Dirty && DataInCUDA && Device is same
      // Do nothing
    }
  } else {
    if (!IsInCUDA()) {
      // Even though the data is not dirty, it is not in CUDA yet. Copy it.
      TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
                 &cuda_vec_);
      WaitPlace(place);
      SetFlag(kDataInCUDA);
    } else if (!(place == cuda_vec_.place())) {
      // Data is on a different CUDA place: move it via a temporary tensor.
      framework::Tensor tmp;
      WaitPlace(cuda_vec_.place());
      TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
      WaitPlace(cuda_vec_.place());
      WaitPlace(place);
      cuda_vec_.ShareDataWith(tmp);
    } else {
      // Not Dirty && DataInCUDA && Device is same
      // Do nothing.
    }
  }
}
// Makes the CPU copy readable.
// If the state is dirty and the CPU bit is not set, the data is copied back
// from CUDA and the dirty bit is cleared. The kDataInCPU bit is set
// unconditionally at the end.
void ImmutableCPU() const {
  const bool cpu_copy_missing = IsDirty() && !IsInCPU();
  if (cpu_copy_missing) {
    // Data has been changed in CUDA, or the CPU has no data.
    CopyToCPU();
    UnsetFlag(kDirty);
  }
  SetFlag(kDataInCPU);
}
// Clears every bit of `flag` in flag_. Declared const because flag_ is a
// mutable member.
void UnsetFlag(int flag) const {
  flag_ = flag_ & ~flag;
}
// Sets every bit of `flag` in flag_. Declared const because flag_ is a
// mutable member.
void SetFlag(int flag) const {
  flag_ = flag_ | flag;
}
// True when the kDirty bit is set in flag_.
bool IsDirty() const {
  return (flag_ & kDirty) != 0;
}
// True when the kDataInCUDA bit is set in flag_.
bool IsInCUDA() const {
  return (flag_ & kDataInCUDA) != 0;
}
// True when the kDataInCPU bit is set in flag_.
bool IsInCPU() const {
  return (flag_ & kDataInCPU) != 0;
}
// Waits on the device context registered for `place` in the global
// DeviceContextPool. Places that are not GPU places are ignored.
static void WaitPlace(const platform::Place place) {
  if (!platform::is_gpu_place(place)) {
    return;
  }
  platform::DeviceContextPool::Instance()
      .Get(boost::get<platform::CUDAPlace>(place))
      ->Wait();
}
// Returns a reference to a function-local static T that is initialized
// from T() on first use. The same object is returned on every call.
static T &EmptyDummy() {
  static T placeholder = T();
  return placeholder;
}
// Bitwise OR of DataFlag values. Mutable so that the const helpers
// SetFlag()/UnsetFlag() can update it.
mutable int flag_;
// CPU-side storage. Mutable because CopyToCPU() const writes into it.
mutable Tensor cpu_vec_;
// CUDA-side storage. Mutable because ImmutableCUDA() const writes into it.
mutable Tensor cuda_vec_;
// Element count: set to 0 by InitEmpty() and to the number of copied
// elements by InitByIter().
size_t size_;
};
#else // PADDLE_WITH_CUDA
...
...
paddle/fluid/operators/adam_op.h
浏览文件 @
b9d7bd48
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <math.h> // for sqrt in CPU and CUDA
#include <Eigen/Dense>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
...
...
@@ -306,26 +307,43 @@ class AdamOpKernel : public framework::OpKernel<T> {
VLOG
(
3
)
<<
"grad row size is 0!!"
;
return
;
}
// merge duplicated rows if any.
// The rows of grad_merge have been sorted inside MergeAdd functor
scatter
::
MergeAdd
<
DeviceContext
,
T
>
merge_func
;
auto
&
grad_merge
=
*
(
ctx
.
scope
()
.
NewScope
()
.
Var
(
"sparse_adam_grad_merge"
)
->
GetMutable
<
framework
::
SelectedRows
>
());
merge_func
(
ctx
.
template
device_context
<
DeviceContext
>(),
grad
,
&
grad_merge
);
std
::
vector
<
int64_t
>
cpu_rows
(
grad
.
rows
().
begin
(),
grad
.
rows
().
end
());
bool
is_strict_sorted
=
true
;
for
(
size_t
i
=
1
;
i
<
cpu_rows
.
size
();
++
i
)
{
if
(
cpu_rows
[
i
-
1
]
>=
cpu_rows
[
i
])
{
is_strict_sorted
=
false
;
break
;
}
}
const
framework
::
SelectedRows
*
grad_merge_ptr
;
if
(
is_strict_sorted
)
{
grad_merge_ptr
=
&
grad
;
}
else
{
// merge duplicated rows if any.
// The rows of grad_merge have been sorted inside MergeAdd functor
scatter
::
MergeAdd
<
DeviceContext
,
T
>
merge_func
;
auto
*
grad_merge_var
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
()
->
GetMutable
<
framework
::
SelectedRows
>
();
merge_func
(
ctx
.
template
device_context
<
DeviceContext
>(),
grad
,
grad_merge_var
);
grad_merge_ptr
=
grad_merge_var
;
}
auto
&
grad_merge
=
*
grad_merge_ptr
;
auto
&
grad_tensor
=
grad_merge
.
value
();
const
T
*
grad_data
=
grad_tensor
.
template
data
<
T
>();
int64_t
*
rows
=
nullptr
;
// When compiled without CUDA, the CUDA
Mutable
Data() interface should not be
const
int64_t
*
rows
=
nullptr
;
// When compiled without CUDA, the CUDAData() interface should not be
// provided.
#if defined(PADDLE_WITH_CUDA)
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
rows
=
grad_merge
.
mutable_rows
()
->
CUDAMutable
Data
(
ctx
.
GetPlace
());
rows
=
grad_merge
.
rows
().
CUDA
Data
(
ctx
.
GetPlace
());
}
else
{
#endif
rows
=
grad_merge
.
mutable_rows
()
->
data
();
rows
=
grad_merge
.
rows
().
data
();
#if defined(PADDLE_WITH_CUDA)
}
...
...
paddle/fluid/operators/detection_map_op.h
浏览文件 @
b9d7bd48
...
...
@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto
ap_type
=
GetAPType
(
ctx
.
Attr
<
std
::
string
>
(
"ap_type"
));
int
class_num
=
ctx
.
Attr
<
int
>
(
"class_num"
);
auto
&
label_lod
=
in_label
->
lod
();
auto
&
detect_lod
=
in_detect
->
lod
();
auto
label_lod
=
in_label
->
lod
();
auto
detect_lod
=
in_detect
->
lod
();
PADDLE_ENFORCE_EQ
(
label_lod
.
size
(),
1UL
,
"Only support one level sequence now."
);
PADDLE_ENFORCE_EQ
(
label_lod
[
0
].
size
(),
detect_lod
[
0
].
size
(),
...
...
@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto
labels
=
framework
::
EigenTensor
<
T
,
2
>::
From
(
input_label
);
auto
detect
=
framework
::
EigenTensor
<
T
,
2
>::
From
(
input_detect
);
auto
&
label_lod
=
input_label
.
lod
();
auto
&
detect_lod
=
input_detect
.
lod
();
auto
label_lod
=
input_label
.
lod
();
auto
detect_lod
=
input_detect
.
lod
();
int
batch_size
=
label_lod
[
0
].
size
()
-
1
;
auto
&
label_index
=
label_lod
[
0
];
auto
label_index
=
label_lod
[
0
];
for
(
int
n
=
0
;
n
<
batch_size
;
++
n
)
{
std
::
map
<
int
,
std
::
vector
<
Box
>>
boxes
;
...
...
@@ -274,6 +274,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
output_true_pos
->
set_lod
(
true_pos_lod
);
output_false_pos
->
set_lod
(
false_pos_lod
);
return
;
}
void
GetInputPos
(
const
framework
::
Tensor
&
input_pos_count
,
...
...
@@ -291,7 +292,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto
SetData
=
[](
const
framework
::
LoDTensor
&
pos_tensor
,
std
::
map
<
int
,
std
::
vector
<
std
::
pair
<
T
,
int
>>>&
pos
)
{
const
T
*
pos_data
=
pos_tensor
.
data
<
T
>
();
auto
&
pos_data_lod
=
pos_tensor
.
lod
()[
0
];
auto
pos_data_lod
=
pos_tensor
.
lod
()[
0
];
for
(
size_t
i
=
0
;
i
<
pos_data_lod
.
size
()
-
1
;
++
i
)
{
for
(
size_t
j
=
pos_data_lod
[
i
];
j
<
pos_data_lod
[
i
+
1
];
++
j
)
{
T
score
=
pos_data
[
j
*
2
];
...
...
@@ -316,23 +317,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
std
::
map
<
int
,
std
::
vector
<
std
::
pair
<
T
,
int
>>>*
false_pos
)
const
{
int
batch_size
=
gt_boxes
.
size
();
for
(
int
n
=
0
;
n
<
batch_size
;
++
n
)
{
auto
&
image_gt_boxes
=
gt_boxes
[
n
];
for
(
auto
&
image_gt_box
:
image_gt_boxes
)
{
auto
image_gt_boxes
=
gt_boxes
[
n
];
for
(
auto
it
=
image_gt_boxes
.
begin
();
it
!=
image_gt_boxes
.
end
();
++
it
)
{
size_t
count
=
0
;
auto
&
labeled_bboxes
=
image_gt_box
.
second
;
auto
labeled_bboxes
=
it
->
second
;
if
(
evaluate_difficult
)
{
count
=
labeled_bboxes
.
size
();
}
else
{
for
(
auto
&
box
:
labeled_bboxes
)
{
if
(
!
box
.
is_difficult
)
{
++
count
;
}
}
for
(
size_t
i
=
0
;
i
<
labeled_bboxes
.
size
();
++
i
)
if
(
!
(
labeled_bboxes
[
i
].
is_difficult
))
++
count
;
}
if
(
count
==
0
)
{
continue
;
}
int
label
=
i
mage_gt_box
.
first
;
int
label
=
i
t
->
first
;
if
(
label_pos_count
->
find
(
label
)
==
label_pos_count
->
end
())
{
(
*
label_pos_count
)[
label
]
=
count
;
}
else
{
...
...
paddle/fluid/operators/extract_rows_op.cc
浏览文件 @
b9d7bd48
...
...
@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
auto
&
in
=
scope
.
FindVar
(
Input
(
"X"
))
->
Get
<
framework
::
SelectedRows
>
();
auto
out
=
scope
.
FindVar
(
Output
(
"Out"
))
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
&
in_rows
=
in
.
rows
();
auto
in_rows
=
in
.
rows
();
auto
out_dim
=
framework
::
make_ddim
(
std
::
vector
<
int64_t
>
{
static_cast
<
int64_t
>
(
in_rows
.
size
()),
1
});
auto
dst_ptr
=
out
->
mutable_data
<
int64_t
>
(
out_dim
,
in
.
place
());
...
...
paddle/fluid/operators/math/selected_rows_functor.cu
浏览文件 @
b9d7bd48
...
...
@@ -60,9 +60,11 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto
out_place
=
context
.
GetPlace
();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
out_place
));
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
out_place
),
out_data
,
boost
::
get
<
platform
::
CUDAPlace
>
(
in1_place
),
in1_data
,
in1_value
.
numel
()
*
sizeof
(
T
),
context
.
stream
());
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
out_place
),
out_data
,
boost
::
get
<
platform
::
CUDAPlace
>
(
in1_place
),
in1_data
,
in1_value
.
numel
()
*
sizeof
(
T
),
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
context
).
stream
());
auto
*
in2_data
=
in2_value
.
data
<
T
>
();
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
out_place
),
...
...
@@ -146,7 +148,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto
in1_height
=
input1
.
height
();
PADDLE_ENFORCE_EQ
(
in1_height
,
input2
->
height
());
auto
&
in1_rows
=
input1
.
rows
(
);
framework
::
Vector
<
int64_t
>
in1_rows
(
input1
.
rows
()
);
auto
&
in2_rows
=
*
(
input2
->
mutable_rows
());
auto
&
in1_value
=
input1
.
value
();
...
...
paddle/fluid/operators/sum_op.h
浏览文件 @
b9d7bd48
...
...
@@ -123,6 +123,7 @@ class SumKernel : public framework::OpKernel<T> {
out_value
->
Resize
(
framework
::
make_ddim
(
in_dim
));
out_value
->
mutable_data
<
T
>
(
context
.
GetPlace
());
// if all the input sparse vars are empty, no need to
// merge these vars.
if
(
first_dim
==
0UL
)
{
...
...
python/paddle/fluid/initializer.py
浏览文件 @
b9d7bd48
...
...
@@ -74,7 +74,7 @@ class Initializer(object):
directly, but need to use one of its implementations.
"""
def
__init_
(
self
):
def
__init_
_
(
self
):
pass
def
__call__
(
self
,
param
,
block
):
...
...
@@ -293,7 +293,7 @@ class TruncatedNormalInitializer(Initializer):
assert
loc
is
not
None
assert
scale
is
not
None
assert
seed
is
not
None
super
(
NormalInitializer
,
self
).
__init__
()
super
(
Truncated
NormalInitializer
,
self
).
__init__
()
self
.
_mean
=
loc
self
.
_std_dev
=
scale
self
.
_seed
=
seed
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
b9d7bd48
...
...
@@ -107,6 +107,12 @@ __all__ = [
'log'
,
'crop'
,
'rank_loss'
,
'elu'
,
'relu6'
,
'pow'
,
'stanh'
,
'hard_sigmoid'
,
'swish'
,
'prelu'
,
'brelu'
,
'leaky_relu'
,
...
...
@@ -5898,6 +5904,148 @@ def pad2d(input,
return
out
@templatedoc()
def elu(x, alpha=1.0, name=None):
    """
    ${comment}
    Args:
        x(${x_type}): ${x_comment}
        alpha(${alpha_type}|1.0): ${alpha_comment}
        name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.

    Returns:
        output(${out_type}): ${out_comment}
    """
    # Keep this as the first statement: locals() forwards exactly the
    # function arguments (x, alpha, name) to LayerHelper as keyword arguments.
    helper = LayerHelper('elu', **locals())
    # The output variable takes the dtype of the input.
    result = helper.create_tmp_variable(dtype=x.dtype)
    op_inputs = {'X': x}
    op_outputs = {'Out': result}
    op_attrs = {'alpha': alpha}
    helper.append_op(
        type='elu', inputs=op_inputs, outputs=op_outputs, attrs=op_attrs)
    return result
@templatedoc()
def relu6(x, threshold=6.0, name=None):
    """
    ${comment}
    Args:
        x(${x_type}): ${x_comment}
        threshold(${threshold_type}|6.0): ${threshold_comment}
        name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.

    Returns:
        output(${out_type}): ${out_comment}
    """
    # Keep this as the first statement: locals() forwards exactly the
    # function arguments (x, threshold, name) to LayerHelper.
    helper = LayerHelper('relu6', **locals())
    # The output variable takes the dtype of the input.
    result = helper.create_tmp_variable(dtype=x.dtype)
    op_inputs = {'X': x}
    op_outputs = {'Out': result}
    op_attrs = {'threshold': threshold}
    helper.append_op(
        type='relu6', inputs=op_inputs, outputs=op_outputs, attrs=op_attrs)
    return result
@templatedoc()
def pow(x, factor=1.0, name=None):
    """
    ${comment}
    Args:
        x(${x_type}): ${x_comment}
        factor(${factor_type}|1.0): ${factor_comment}
        name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.

    Returns:
        output(${out_type}): ${out_comment}
    """
    # Keep this as the first statement: locals() forwards exactly the
    # function arguments (x, factor, name) to LayerHelper.
    helper = LayerHelper('pow', **locals())
    # The output variable takes the dtype of the input.
    result = helper.create_tmp_variable(dtype=x.dtype)
    op_inputs = {'X': x}
    op_outputs = {'Out': result}
    op_attrs = {'factor': factor}
    helper.append_op(
        type='pow', inputs=op_inputs, outputs=op_outputs, attrs=op_attrs)
    return result
@templatedoc()
def stanh(x, scale_a=2.0 / 3.0, scale_b=1.7159, name=None):
    """
    ${comment}
    Args:
        x(${x_type}): ${x_comment}
        scale_a(${scale_a_type}|2.0 / 3.0): ${scale_a_comment}
        scale_b(${scale_b_type}|1.7159): ${scale_b_comment}
        name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.

    Returns:
        output(${out_type}): ${out_comment}
    """
    # Keep this as the first statement: locals() forwards exactly the
    # function arguments (x, scale_a, scale_b, name) to LayerHelper.
    helper = LayerHelper('stanh', **locals())
    # The output variable takes the dtype of the input.
    result = helper.create_tmp_variable(dtype=x.dtype)
    op_inputs = {'X': x}
    op_outputs = {'Out': result}
    op_attrs = {'scale_a': scale_a, 'scale_b': scale_b}
    helper.append_op(
        type='stanh', inputs=op_inputs, outputs=op_outputs, attrs=op_attrs)
    return result
@templatedoc()
def hard_sigmoid(x, slope=0.2, offset=0.5, name=None):
    """
    ${comment}
    Args:
        x(${x_type}): ${x_comment}
        slope(${slope_type}|0.2): ${slope_comment}
        offset(${offset_type}|0.5): ${offset_comment}
        name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.

    Returns:
        output(${out_type}): ${out_comment}
    """
    # Keep this as the first statement: locals() forwards exactly the
    # function arguments (x, slope, offset, name) to LayerHelper.
    helper = LayerHelper('hard_sigmoid', **locals())
    # The output variable takes the dtype of the input.
    result = helper.create_tmp_variable(dtype=x.dtype)
    op_inputs = {'X': x}
    op_outputs = {'Out': result}
    op_attrs = {'slope': slope, 'offset': offset}
    helper.append_op(
        type='hard_sigmoid',
        inputs=op_inputs,
        outputs=op_outputs,
        attrs=op_attrs)
    return result
@templatedoc()
def swish(x, beta=1.0, name=None):
    """
    ${comment}
    Args:
        x(${x_type}): ${x_comment}
        beta(${beta_type}|1.0): ${beta_comment}
        name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.

    Returns:
        output(${out_type}): ${out_comment}
    """
    # Keep this as the first statement: locals() forwards exactly the
    # function arguments (x, beta, name) to LayerHelper.
    helper = LayerHelper('swish', **locals())
    # The output variable takes the dtype of the input.
    out = helper.create_tmp_variable(dtype=x.dtype)
    # The attribute key must be 'beta', the name the docstring template
    # (${beta_type} / ${beta_comment}) uses for this op's attribute. It was
    # previously sent as 'slope', so the caller's value went out under the
    # wrong attribute name.
    helper.append_op(
        type='swish',
        inputs={'X': x},
        outputs={'Out': out},
        attrs={'beta': beta})
    return out
def
prelu
(
x
,
mode
,
param_attr
=
None
,
name
=
None
):
"""
Equation:
...
...
python/paddle/fluid/layers/ops.py
浏览文件 @
b9d7bd48
...
...
@@ -17,12 +17,6 @@ from .layer_function_generator import generate_layer_fn, generate_layer_fn_noatt
__activations__
=
[
'softshrink'
,
'elu'
,
'relu6'
,
'pow'
,
'stanh'
,
'hard_sigmoid'
,
'swish'
,
]
__activations_noattr__
=
[
...
...
python/paddle/fluid/tests/unittests/op_test.py
浏览文件 @
b9d7bd48
...
...
@@ -345,7 +345,7 @@ class OpTest(unittest.TestCase):
actual_t
,
expect_t
,
atol
=
atol
,
equal_nan
=
equal_nan
),
"Output ("
+
out_name
+
") has diff at "
+
str
(
place
)
+
"
\n
Expect "
+
str
(
expect_t
)
+
"
\n
"
+
"But Got"
+
str
(
actual_t
)
+
" in class "
+
self
.
__class__
.
__name__
)
str
(
actual_t
))
if
isinstance
(
expect
,
tuple
):
self
.
assertListEqual
(
actual
.
recursive_sequence_lengths
(),
expect
[
1
],
"Output ("
+
out_name
+
...
...
python/paddle/fluid/tests/unittests/test_detection_map_op.py
浏览文件 @
b9d7bd48
...
...
@@ -20,7 +20,6 @@ import six
import
sys
import
collections
import
math
import
paddle.fluid
as
fluid
from
op_test
import
OpTest
...
...
@@ -33,7 +32,7 @@ class TestDetectionMAPOp(OpTest):
self
.
detect
=
np
.
array
(
self
.
detect
).
astype
(
'float32'
)
self
.
mAP
=
np
.
array
(
self
.
mAP
).
astype
(
'float32'
)
if
len
(
self
.
class_pos_count
)
>
0
:
if
(
len
(
self
.
class_pos_count
)
>
0
)
:
self
.
class_pos_count
=
np
.
array
(
self
.
class_pos_count
).
astype
(
'int32'
)
self
.
true_pos
=
np
.
array
(
self
.
true_pos
).
astype
(
'float32'
)
...
...
@@ -274,7 +273,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp):
class
TestDetectionMAPOpMultiBatch
(
TestDetectionMAPOp
):
def
init_test_case
(
self
):
super
(
TestDetectionMAPOpMultiBatch
,
self
).
init_test_case
()
self
.
class_pos_count
=
[
0
,
2
,
1
,
0
]
self
.
class_pos_count
=
[
0
,
2
,
1
]
self
.
true_pos_lod
=
[[
0
,
3
,
2
]]
self
.
true_pos
=
[[
0.7
,
1.
],
[
0.3
,
0.
],
[
0.2
,
1.
],
[
0.8
,
0.
],
[
0.1
,
1.
]]
self
.
false_pos_lod
=
[[
0
,
3
,
2
]]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录