机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit cec4e6ed (unverified)
Authored Apr 20, 2018 by chengduo; committed via GitHub on Apr 20, 2018
Merge pull request #9946 from chengduoZH/feature/add_reduce_op_handle
Feature/add reduce op handle
Parents: 23a21c86, e63013a8
Showing 6 changed files with 609 additions and 30 deletions (+609 −30)
paddle/fluid/framework/details/CMakeLists.txt                 +8    -3
paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc   +1    -27
paddle/fluid/framework/details/reduce_and_gather.h            +94   -0
paddle/fluid/framework/details/reduce_op_handle.cc            +161  -0
paddle/fluid/framework/details/reduce_op_handle.h             +70   -0
paddle/fluid/framework/details/reduce_op_handle_test.cc       +275  -0
paddle/fluid/framework/details/CMakeLists.txt
@@ -2,8 +2,6 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
        dynload_cuda)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
@@ -11,12 +9,17 @@ cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
if(WITH_GPU)
  nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
        dynload_cuda)
  set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
  nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim dynload_cuda)
else()
  set(multi_devices_graph_builder_deps)
  cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim)
endif()
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
        scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
        simple_threadpool device_context)
@@ -30,3 +33,5 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha
        device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
        device_context gather_op_handle)
cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
        device_context reduce_op_handle)
paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -13,8 +13,8 @@
// limitations under the License.
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#include <algorithm>
#include "paddle/fluid/framework/details/reduce_and_gather.h"

namespace paddle {
namespace framework {
...
@@ -29,32 +29,6 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
  }
}

struct ReduceLoDTensor {
  const std::vector<LoDTensor> &src_tensors_;
  LoDTensor &dst_tensor_;

  ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
      : src_tensors_(src), dst_tensor_(*dst) {}

  template <typename T>
  void operator()() const {
    PADDLE_ENFORCE(!src_tensors_.empty());
    auto &t0 = src_tensors_[0];
    PADDLE_ENFORCE_NE(t0.numel(), 0);
    dst_tensor_.Resize(t0.dims());
    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
    std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);

    for (size_t i = 1; i < src_tensors_.size(); ++i) {
      auto &t = src_tensors_[i];
      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
      PADDLE_ENFORCE_EQ(t.type(), t0.type());
      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
                     [](T a, T b) -> T { return a + b; });
    }
  }
};

void NCCLAllReduceOpHandle::RunImpl() {
  if (inputs_.size() == 1) {
    return;  // No need to all reduce when GPU count = 1;
...
paddle/fluid/framework/details/reduce_and_gather.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <algorithm>
#include <map>
#include <vector>
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"

namespace paddle {
namespace framework {
namespace details {

struct ReduceLoDTensor {
  const std::vector<LoDTensor> &src_tensors_;
  LoDTensor &dst_tensor_;

  ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
      : src_tensors_(src), dst_tensor_(*dst) {}

  template <typename T>
  void operator()() const {
    PADDLE_ENFORCE(!src_tensors_.empty());
    auto &t0 = src_tensors_[0];
    PADDLE_ENFORCE_NE(t0.numel(), 0);
    dst_tensor_.Resize(t0.dims());
    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
    std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);

    for (size_t i = 1; i < src_tensors_.size(); ++i) {
      auto &t = src_tensors_[i];
      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
      PADDLE_ENFORCE_EQ(t.type(), t0.type());
      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
                     [](T a, T b) -> T { return a + b; });
    }
  }
};

inline void GatherSelectedRows(
    const std::vector<const SelectedRows *> &src_selecte_rows_,
    const std::vector<platform::Place> &in_places,
    const std::unordered_map<platform::Place, platform::DeviceContext *,
                             platform::PlaceHash> &dev_ctxes,
    const platform::Place &out_place, SelectedRows *dst_selecte_rows) {
  PADDLE_ENFORCE(!src_selecte_rows_.empty());

  std::vector<Tensor> in_tensors;
  std::vector<int64_t> out_rows;

  for (auto in_sr_ptr : src_selecte_rows_) {
    auto &in_sr = *in_sr_ptr;
    in_tensors.emplace_back(in_sr.value());
    out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end());
  }

  auto &pre_in = src_selecte_rows_[0];

  auto &dst_tensor = *dst_selecte_rows;
  dst_tensor.set_height(pre_in->height());
  dst_tensor.set_rows(out_rows);
  size_t rows = out_rows.size();
  DDim out_dim = pre_in->GetCompleteDims();
  out_dim[0] = static_cast<int64_t>(rows);
  dst_tensor.mutable_value()->Resize(out_dim);
  dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type());
  Tensor *out_tensor = dst_tensor.mutable_value();

  // copy
  int s = 0, e = 0;
  for (size_t j = 0; j < in_tensors.size(); ++j) {
    e += in_tensors[j].dims()[0];
    auto sub_out = out_tensor->Slice(s, e);
    paddle::framework::TensorCopy(in_tensors[j], out_place,
                                  *(dev_ctxes.at(in_places[j])), &sub_out);
    s = e;
  }
}

}  // namespace details
}  // namespace framework
}  // namespace paddle
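Note (not part of the diff): ReduceLoDTensor is a functor with a templated operator()<T>, so the caller picks the element type at runtime; reduce_op_handle.cc below invokes it through VisitDataType(ToDataType(...), func). A minimal, self-contained sketch of the same element-wise accumulation, with std::vector standing in for LoDTensor and the name SumBuffers purely hypothetical, not a Paddle API:

#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>

// Hypothetical stand-in for ReduceLoDTensor: copy the first source buffer into
// dst, then add every remaining buffer element-wise, mirroring the std::copy +
// std::transform pair in the functor above.
template <typename T>
void SumBuffers(const std::vector<std::vector<T>> &srcs, std::vector<T> *dst) {
  assert(!srcs.empty());
  *dst = srcs[0];  // mirrors the initial std::copy of t0 into dst
  for (size_t i = 1; i < srcs.size(); ++i) {
    assert(srcs[i].size() == dst->size());  // mirrors the dims()/type() checks
    std::transform(srcs[i].begin(), srcs[i].end(), dst->begin(), dst->begin(),
                   [](T a, T b) -> T { return a + b; });
  }
}

int main() {
  std::vector<std::vector<float>> srcs = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
  std::vector<float> dst;
  SumBuffers(srcs, &dst);  // dst == {12, 15, 18}
  for (float v : dst) std::cout << v << ' ';
  std::cout << '\n';
}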
paddle/fluid/framework/details/reduce_op_handle.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"

namespace paddle {
namespace framework {
namespace details {

void ReduceOpHandle::RunImpl() {
  // the input and output may have dummy var.
  std::vector<VarHandle *> in_var_handles = GetValidVarHandles(inputs_);
  std::vector<VarHandle *> out_var_handles = GetValidVarHandles(outputs_);

  PADDLE_ENFORCE_EQ(
      in_var_handles.size(), places_.size(),
      "The number of output should equal to the number of places.");
  PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
                    "The number of output should be one.");

  // Wait input done, this Wait is asynchronous operation
  WaitEvents(in_var_handles);

  // check in the same place
  auto in_0_handle = in_var_handles[0];
  auto pre_place = in_0_handle->place_;

  std::vector<platform::Place> in_places;
  for (auto *in_handle : in_var_handles) {
    auto in_p = in_handle->place_;
    PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
                      "Places must be all on CPU or all on CUDA.");
    in_places.emplace_back(in_p);
  }

  auto out_var = local_scopes_[out_var_handles[0]->scope_idx_]->FindVar(
      out_var_handles[0]->name_);

  auto pre_in_var =
      local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);

  if (pre_in_var->IsType<framework::SelectedRows>()) {
    auto &pre_in = pre_in_var->Get<framework::SelectedRows>();
    std::vector<const SelectedRows *> in_selected_rows;

    for (auto *in_handle : in_var_handles) {
      auto in_var =
          local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
      auto &in_sr = in_var->Get<framework::SelectedRows>();

      PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
                        "The type of input is not consistent.");

      in_selected_rows.emplace_back(&in_sr);
    }
    auto trg = out_var->GetMutable<framework::SelectedRows>();
    GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_,
                       out_var_handles[0]->place_, trg);
  } else {
    auto pre_in = pre_in_var->Get<framework::LoDTensor>();
    std::vector<LoDTensor> lod_tensors;

    // can be refined
    for (auto *in_handle : in_var_handles) {
      auto in_var =
          local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
      auto &in_sr = in_var->Get<framework::LoDTensor>();

      PADDLE_ENFORCE_EQ(in_sr.type(), pre_in.type(),
                        "The type of input is not consistent.");

      lod_tensors.emplace_back(in_sr);
    }

    auto trg = out_var->GetMutable<framework::LoDTensor>();
    trg->Resize(pre_in.dims());
    trg->mutable_data(out_var_handles[0]->place_, pre_in.type());

    if (paddle::platform::is_cpu_place(pre_place)) {
      ReduceLoDTensor func(lod_tensors, trg);
      VisitDataType(ToDataType(lod_tensors[0].type()), func);
    } else if (paddle::platform::is_gpu_place(pre_place)) {
#ifdef PADDLE_WITH_CUDA
      auto out_p = out_var_handles[0]->place_;
      int root = boost::get<platform::CUDAPlace>(out_p).device;

      std::vector<std::function<void()>> all_reduce_calls;
      for (size_t i = 0; i < local_scopes_.size(); ++i) {
        auto &p = in_places[i];
        auto &lod_tensor = lod_tensors[i];

        int dev_id = boost::get<platform::CUDAPlace>(p).device;
        auto &nccl_ctx = nccl_ctxs_->at(dev_id);
        auto stream = nccl_ctx.stream();
        auto comm = nccl_ctx.comm_;

        void *buffer = const_cast<void *>(lod_tensor.data<void>());
        void *recvbuffer = nullptr;
        if (root == dev_id) {
          recvbuffer = trg->mutable_data(out_var_handles[0]->place_);
        }

        all_reduce_calls.emplace_back([=] {
          PADDLE_ENFORCE(platform::dynload::ncclReduce(
              buffer, recvbuffer, static_cast<size_t>(lod_tensor.numel()),
              platform::ToNCCLDataType(lod_tensor.type()), ncclSum, root, comm,
              stream));
        });
      }

      this->RunAndRecordEvent([&] {
        platform::NCCLGroupGuard guard;
        for (auto &call : all_reduce_calls) {
          call();
        }
      });
#else
      PADDLE_THROW("CUDA is not support.");
#endif
    } else {
      PADDLE_THROW("Place should be CPUPlace or CUDAPlace.");
    }
  }
}

void ReduceOpHandle::WaitEvents(
    const std::vector<VarHandle *> &in_var_handles) {
  if (in_var_handles[0]->generated_op_) {
    for (auto *in : in_var_handles) {
      in_var_handles[0]->generated_op_->Wait(dev_ctxes_[in->place_]);
    }
  }
}

std::vector<VarHandle *> ReduceOpHandle::GetValidVarHandles(
    const std::vector<VarHandleBase *> &inputs) {
  std::vector<VarHandle *> in_var_handles;
  for (auto *in : inputs) {
    auto *in_handle = dynamic_cast<VarHandle *>(in);
    if (in_handle) {
      in_var_handles.push_back(in_handle);
    }
  }
  return in_var_handles;
}

std::string ReduceOpHandle::Name() const { return "reduce"; }
}  // namespace details
}  // namespace framework
}  // namespace paddle
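Note (not part of the diff): on the CUDA branch, RunImpl does not issue ncclReduce immediately; it first captures one call per device in a std::function<void()> and then runs them all inside platform::NCCLGroupGuard via RunAndRecordEvent. A self-contained sketch of that deferred-call pattern, with plain printing in place of NCCL, so nothing below is a real NCCL API:

#include <functional>
#include <iostream>
#include <vector>

int main() {
  const int num_devices = 4;
  const int root = 0;  // device that receives the reduced result
  std::vector<std::function<void()>> reduce_calls;

  for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
    // Capture by value so each closure keeps its own device id, like the
    // per-device lambda in RunImpl captures buffer/comm/stream/root.
    reduce_calls.emplace_back([dev_id, root] {
      std::cout << "issue reduce from device " << dev_id << " to root " << root
                << '\n';
    });
  }

  // In the real code this loop runs under platform::NCCLGroupGuard inside
  // RunAndRecordEvent, so the per-device calls are submitted as one group.
  for (auto &call : reduce_calls) {
    call();
  }
  return 0;
}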
paddle/fluid/framework/details/reduce_op_handle.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <map>
#include <string>
#include <vector>

#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif

namespace paddle {
namespace framework {
namespace details {

struct ReduceOpHandle : public OpHandleBase {
  const std::vector<Scope *> &local_scopes_;
  const std::vector<platform::Place> &places_;

#ifdef PADDLE_WITH_CUDA
  const platform::NCCLContextMap *nccl_ctxs_;
  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
                 const std::vector<platform::Place> &places,
                 const platform::NCCLContextMap *nccl_ctxs)
      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
    if (nccl_ctxs_) {
      for (auto &p_ctx : nccl_ctxs_->contexts_) {
        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
      }
    }
  }
#else
  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
                 const std::vector<platform::Place> &places)
      : local_scopes_(local_scopes), places_(places) {}
#endif

  std::string Name() const override;

  bool IsMultiDeviceTransfer() override { return false; };

 protected:
  void RunImpl() override;

  std::vector<VarHandle *> GetValidVarHandles(
      const std::vector<VarHandleBase *> &inputs);

  void WaitEvents(const std::vector<VarHandle *> &in_var_handles);
};

}  // namespace details
}  // namespace framework
}  // namespace paddle
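Note (not part of the diff): the header compiles one of two constructors depending on PADDLE_WITH_CUDA, which is why the CMakeLists change above builds reduce_op_handle with nv_library or cc_library and why the test below switches its construction on the same macro. A minimal, self-contained illustration of that conditional-constructor pattern; the names WITH_ACCEL and Handle are hypothetical, not Paddle's:

#include <iostream>
#include <string>
#include <vector>

// Toggle at build time, e.g. with -DWITH_ACCEL, the way the build defines
// PADDLE_WITH_CUDA for the GPU variant.
struct Handle {
  std::vector<int> places_;
#ifdef WITH_ACCEL
  const void *accel_ctxs_;  // extra context only present in the GPU-style build
  Handle(const std::vector<int> &places, const void *accel_ctxs)
      : places_(places), accel_ctxs_(accel_ctxs) {}
#else
  explicit Handle(const std::vector<int> &places) : places_(places) {}
#endif
  std::string Name() const { return "reduce"; }
};

int main() {
#ifdef WITH_ACCEL
  Handle h({0, 1}, nullptr);
#else
  Handle h({0, 1});
#endif
  std::cout << h.Name() << " over " << h.places_.size() << " places\n";
}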
paddle/fluid/framework/details/reduce_op_handle_test.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace framework {
namespace details {

namespace f = paddle::framework;
namespace p = paddle::platform;

// test data amount
const f::DDim kDims = {20, 20};

struct TestReduceOpHandle {
  bool use_gpu_;
  Scope g_scope_;
  std::vector<Scope *> local_scopes_;
  std::unique_ptr<OpHandleBase> op_handle_;
  std::vector<std::unique_ptr<VarHandleBase>> vars_;
  std::vector<p::Place> gpu_list_;
  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;

#ifdef PADDLE_WITH_CUDA
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif

  void WaitAll() {
    for (size_t j = 0; j < ctxs_.size(); ++j) {
      ctxs_[j]->Wait();
    }
#ifdef PADDLE_WITH_CUDA
    if (nccl_ctxs_) {
      nccl_ctxs_->WaitAll();
    }
#endif
  }

  void InitCtxOnGpu(bool use_gpu) {
    use_gpu_ = use_gpu;
    if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
      int count = p::GetCUDADeviceCount();
      if (count <= 1) {
        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
                        "device count is "
                     << count;
        exit(0);
      }
      for (int i = 0; i < count; ++i) {
        auto p = p::CUDAPlace(i);
        gpu_list_.push_back(p);
        ctxs_.emplace_back(new p::CUDADeviceContext(p));
      }
      nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
#else
      PADDLE_THROW("CUDA is not support.");
#endif
    } else {
      int count = 8;
      for (int i = 0; i < count; ++i) {
        auto p = p::CPUPlace();
        gpu_list_.push_back(p);
        ctxs_.emplace_back(new p::CPUDeviceContext(p));
      }
#ifdef PADDLE_WITH_CUDA
      nccl_ctxs_.reset(nullptr);
#endif
    }
  }

  void InitReduceOp(size_t input_scope_idx) {
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
      local_scopes_[j]->Var("out");
    }
    local_scopes_[input_scope_idx]->Var("input");

    if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
      op_handle_.reset(
          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
#else
      PADDLE_THROW("CUDA is not support.");
#endif
    } else {
#ifdef PADDLE_WITH_CUDA
      op_handle_.reset(
          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
#else
      op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_));
#endif
    }

    // add input
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      if (!use_gpu_) {
        op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
      }
      auto *in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
      in_var_handle->generated_op_ = nullptr;
      vars_.emplace_back(in_var_handle);
      op_handle_->AddInput(in_var_handle);
    }

    // add dummy var
    vars_.emplace_back(new DummyVarHandle());
    DummyVarHandle *in_dummy_var_handle =
        static_cast<DummyVarHandle *>(vars_.back().get());
    in_dummy_var_handle->generated_op_ = nullptr;
    op_handle_->AddInput(in_dummy_var_handle);

    // add output
    auto *out_var_handle =
        new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
    vars_.emplace_back(out_var_handle);
    op_handle_->AddOutput(out_var_handle);

    // add dummy var
    vars_.emplace_back(new DummyVarHandle());
    DummyVarHandle *dummy_var_handle =
        static_cast<DummyVarHandle *>(vars_.back().get());
    op_handle_->AddOutput(dummy_var_handle);
  }

  void TestReduceSelectedRows(size_t output_scope_idx) {
    int height = kDims[0] * 2;
    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
    std::vector<float> send_vector(f::product(kDims));
    for (size_t k = 0; k < send_vector.size(); ++k) {
      send_vector[k] = k;
    }

    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
         ++input_scope_idx) {
      auto in_var = local_scopes_[input_scope_idx]->Var("input");
      auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
      auto value = in_selected_rows->mutable_value();
      value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);

      in_selected_rows->set_height(height);
      in_selected_rows->set_rows(rows);

      paddle::framework::TensorFromVector<float>(
          send_vector, *(ctxs_[input_scope_idx]), value);
      value->Resize(kDims);
    }

    auto out_var = local_scopes_[output_scope_idx]->Var("out");
    auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();

    auto in_var = local_scopes_[output_scope_idx]->Var("input");
    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();

    out_selected_rows->mutable_value()->ShareDataWith(
        in_selected_rows->value());

    op_handle_->Run(false);

    WaitAll();

    p::CPUPlace cpu_place;

    auto &out_select_rows = out_var->Get<f::SelectedRows>();
    auto rt = out_select_rows.value();

    PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
                      "height is not equal.");
    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
    }

    f::Tensor result_tensor;
    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
    float *ct = result_tensor.data<float>();

    for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
      ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
    }
  }

  void TestReduceLodTensors(size_t output_scope_idx) {
    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
    for (size_t k = 0; k < send_vector.size(); ++k) {
      send_vector[k] = k;
    }
    f::LoD lod{{0, 10, 20}};

    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
         ++input_scope_idx) {
      auto in_var = local_scopes_[input_scope_idx]->Var("input");
      auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
      in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
      in_lod_tensor->set_lod(lod);

      paddle::framework::TensorFromVector<float>(
          send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
    }

    auto out_var = local_scopes_[output_scope_idx]->Var("out");
    auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();

    auto in_var = local_scopes_[output_scope_idx]->Var("input");
    auto in_lodtensor = in_var->Get<f::LoDTensor>();

    out_lodtensor->ShareDataWith(in_lodtensor);

    op_handle_->Run(false);

    WaitAll();

    p::CPUPlace cpu_place;

    auto &rt = out_var->Get<f::LoDTensor>();

    f::Tensor result_tensor;
    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
    float *ct = result_tensor.data<float>();

    for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
      ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
    }
  }
};

TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
  TestReduceOpHandle test_op;
  size_t input_scope_idx = 0;
  test_op.InitCtxOnGpu(false);
  test_op.InitReduceOp(input_scope_idx);
  test_op.TestReduceSelectedRows(input_scope_idx);
}
TEST(ReduceTester, TestCPUReduceTestLodTensor) {
  TestReduceOpHandle test_op;
  size_t input_scope_idx = 0;
  test_op.InitCtxOnGpu(false);
  test_op.InitReduceOp(input_scope_idx);
  test_op.TestReduceLodTensors(input_scope_idx);
}
#ifdef PADDLE_WITH_CUDA
TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
  TestReduceOpHandle test_op;
  size_t input_scope_idx = 0;
  test_op.InitCtxOnGpu(true);
  test_op.InitReduceOp(input_scope_idx);
  test_op.TestReduceSelectedRows(input_scope_idx);
}
TEST(ReduceTester, TestGPUReduceTestLodTensor) {
  TestReduceOpHandle test_op;
  size_t input_scope_idx = 0;
  test_op.InitCtxOnGpu(true);
  test_op.InitReduceOp(input_scope_idx);
  test_op.TestReduceLodTensors(input_scope_idx);
}
#endif

}  // namespace details
}  // namespace framework
}  // namespace paddle