Unverified commit 35d7d1f0
Authored on Feb 08, 2023 by Huang Jiyi; committed by GitHub on Feb 08, 2023
move mixed_vector (#50282)
Parent: e92e3aab
Showing 83 changed files with 299 additions and 305 deletions (+299, -305)
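Every change below follows the same mechanical pattern: the Vector/MixVector utilities formerly declared in paddle/fluid/framework/mixed_vector.h now live in paddle/phi/core/mixed_vector.h under the phi namespace, so call sites swap framework::Vector&lt;T&gt; for phi::Vector&lt;T&gt;, paddle::framework::MixVector&lt;T&gt; for phi::MixVector&lt;T&gt;, and include the new header. A minimal sketch of an updated call site, assuming a GPU build and a hypothetical helper name that is not part of this commit:

#include "paddle/phi/common/place.h"
#include "paddle/phi/core/mixed_vector.h"  // was: paddle/fluid/framework/mixed_vector.h

// Hypothetical helper showing the post-commit spelling of the types.
size_t* MutableLodOnPlace(phi::Vector<size_t>* lod, const phi::Place& place) {
  // MixVector wraps the host-side Vector and hands back a pointer usable on `place`.
  phi::MixVector<size_t> mix_lod(lod);  // was: paddle::framework::MixVector<size_t>
  return mix_lod.MutableData(place);
}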
Changed files:
paddle/fluid/distributed/collective/reducer.cc  +3 -3
paddle/fluid/distributed/ps/service/brpc_utils.cc  +1 -1
paddle/fluid/distributed/test/brpc_utils_test.cc  +4 -4
paddle/fluid/framework/CMakeLists.txt  +0 -20
paddle/fluid/framework/data_feed.cc  +1 -1
paddle/fluid/framework/data_type_test.cc  +1 -0
paddle/fluid/framework/dlpack_tensor.cc  +1 -0
paddle/fluid/framework/dlpack_tensor_test.cc  +1 -0
paddle/fluid/framework/eigen_test.cc  +3 -2
paddle/fluid/framework/fleet/heter_wrapper.cc  +2 -2
paddle/fluid/framework/lod_tensor.h  +2 -2
paddle/fluid/framework/lod_tensor_test.cu  +2 -2
paddle/fluid/framework/tensor.h  +2 -2
paddle/fluid/imperative/all_reduce.cc  +4 -4
paddle/fluid/imperative/gloo_context.cc  +2 -2
paddle/fluid/operators/assign_op_test.cc  +1 -1
paddle/fluid/operators/ctc_align_op.cu  +1 -1
paddle/fluid/operators/cvm_op.cu  +1 -1
paddle/fluid/operators/detection/box_clip_op.cu  +1 -1
paddle/fluid/operators/detection/collect_fpn_proposals_op.cu  +1 -1
paddle/fluid/operators/detection/generate_proposals_op.cu  +1 -1
paddle/fluid/operators/detection/target_assign_op.h  +2 -2
paddle/fluid/operators/filter_by_instag_op.cu  +8 -8
paddle/fluid/operators/filter_by_instag_op.h  +2 -2
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h  +1 -1
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu  +6 -6
paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc  +2 -2
paddle/fluid/operators/gru_op.cc  +1 -1
paddle/fluid/operators/gru_op.cu.cc  +1 -1
paddle/fluid/operators/gru_op.h  +2 -2
paddle/fluid/operators/lookup_table_op.cu  +2 -2
paddle/fluid/operators/lookup_table_v2_op.cu  +2 -2
paddle/fluid/operators/lstm_op.h  +3 -3
paddle/fluid/operators/lstmp_op.h  +3 -3
paddle/fluid/operators/math/beam_search.cu  +2 -2
paddle/fluid/operators/math/sequence_padding.cc  +1 -1
paddle/fluid/operators/math/sequence_padding.cu  +2 -2
paddle/fluid/operators/math/sequence_padding.h  +3 -3
paddle/fluid/operators/math/sequence_pooling.cu  +2 -2
paddle/fluid/operators/optimizers/ftrl_op.h  +1 -1
paddle/fluid/operators/optimizers/sgd_op.cu  +1 -1
paddle/fluid/operators/row_conv_op.cc  +2 -2
paddle/fluid/operators/row_conv_op.cu  +4 -4
paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu  +1 -1
paddle/fluid/operators/sequence_ops/sequence_erase_op.cu  +1 -1
paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu  +7 -8
paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h  +16 -20
paddle/fluid/operators/sequence_ops/sequence_expand_op.cu  +18 -19
paddle/fluid/operators/sequence_ops/sequence_expand_op.h  +24 -28
paddle/fluid/operators/sequence_ops/sequence_reverse_op.h  +1 -1
paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu  +4 -4
paddle/fluid/operators/sequence_ops/sequence_softmax_op.h  +7 -8
paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h  +1 -1
paddle/fluid/operators/shuffle_batch_op.h  +2 -2
paddle/fluid/operators/tdm_child_op.h  +1 -1
paddle/fluid/operators/tdm_sampler_op.h  +1 -1
paddle/fluid/pybind/tensor.cc  +1 -1
paddle/phi/core/CMakeLists.txt  +5 -0
paddle/phi/core/mixed_vector.cc  +13 -16
paddle/phi/core/mixed_vector.h  +27 -26
paddle/phi/kernels/cpu/edit_distance_kernel.cc  +3 -3
paddle/phi/kernels/funcs/selected_rows_functor.cc  +4 -4
paddle/phi/kernels/funcs/selected_rows_functor.cu  +11 -11
paddle/phi/kernels/funcs/sequence2batch.cc  +1 -1
paddle/phi/kernels/funcs/sequence2batch.cu  +2 -2
paddle/phi/kernels/funcs/sequence2batch.h  +1 -1
paddle/phi/kernels/funcs/sequence_scale.cu  +1 -1
paddle/phi/kernels/gpu/adagrad_kernel.cu  +2 -2
paddle/phi/kernels/gpu/edit_distance_kernel.cu  +2 -2
paddle/phi/kernels/gpu/embedding_grad_kernel.cu  +3 -3
paddle/phi/kernels/gpu/sgd_kernel.cu  +2 -2
paddle/phi/kernels/impl/momentum_kernel_impl.h  +1 -1
paddle/phi/kernels/impl/rmsprop_kernel_impl.h  +1 -1
paddle/phi/kernels/impl/warpctc_kernel_impl.h  +3 -3
paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc  +1 -1
paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu  +1 -1
paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu  +1 -1
paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc  +2 -2
paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h  +1 -1
paddle/phi/tests/core/CMakeLists.txt  +17 -0
paddle/phi/tests/core/test_mixed_vector.cc  +5 -5
paddle/phi/tests/core/test_mixed_vector.cu  +14 -13
tools/parallel_UT_rule.py  +2 -2
paddle/fluid/distributed/collective/reducer.cc
@@ -1113,7 +1113,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
   const auto &rank_ = process_group_->GetRank();
   const auto &size_ = process_group_->GetSize();
-  framework::Vector<int64_t> rows_num_vector(size_);
+  phi::Vector<int64_t> rows_num_vector(size_);
   rows_num_vector[rank_] = static_cast<int64_t>(src_rows.size());
   Tensor rows_num_tensor = paddle::experimental::empty(
@@ -1183,7 +1183,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
   }
   process_group_->AllGather(in, out)->Synchronize();
-  framework::Vector<int64_t> dst_rows_vector(rows_num, 0);
+  phi::Vector<int64_t> dst_rows_vector(rows_num, 0);
   auto *dst_rows_dense_tensor =
       std::dynamic_pointer_cast<phi::DenseTensor>(dst_rows_tensor.impl()).get();
@@ -1262,7 +1262,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
   Tensor dst_rows_tensor =
       paddle::experimental::concat(rows_tensors, phi::Scalar(0));
-  framework::Vector<int64_t> dst_rows_vector(rows_num, 0);
+  phi::Vector<int64_t> dst_rows_vector(rows_num, 0);
   auto *dst_rows_dense_tensor =
       std::dynamic_pointer_cast<phi::DenseTensor>(dst_rows_tensor.impl()).get();
paddle/fluid/distributed/ps/service/brpc_utils.cc
@@ -236,7 +236,7 @@ void DeserializeLodTensor(framework::Variable* var,
     framework::LoD lod;
     for (int i = 0; i < msg.lod_level(); ++i) {
-      framework::Vector<size_t> v;
+      phi::Vector<size_t> v;
       for (int j = 0; j < msg.lod(i).lod_data_size(); ++j) {
         v.push_back(msg.lod(i).lod_data(j));
       }
paddle/fluid/distributed/test/brpc_utils_test.cc
@@ -39,7 +39,7 @@ void CreateVarsOnScope(framework::Scope* scope,
   auto* tensor1 = var1->GetMutable<phi::DenseTensor>();
   tensor1->Resize(phi::make_ddim({512, 8, 4, 2}));
   framework::LoD lod1;
-  lod1.push_back(framework::Vector<size_t>({1, 3, 8}));
+  lod1.push_back(phi::Vector<size_t>({1, 3, 8}));
   tensor1->set_lod(lod1);
   tensor1->mutable_data<float>(*place);
   phi::funcs::set_constant(ctx, tensor1, 31.9);
@@ -49,7 +49,7 @@ void CreateVarsOnScope(framework::Scope* scope,
   auto* tensor2 = var2->GetMutable<phi::DenseTensor>();
   tensor2->Resize(phi::make_ddim({1000, 64}));
   framework::LoD lod2;
-  lod2.push_back(framework::Vector<size_t>({1, 1}));
+  lod2.push_back(phi::Vector<size_t>({1, 1}));
   tensor2->set_lod(lod2);
   tensor2->mutable_data<int>(*place);
   phi::funcs::set_constant(ctx, tensor2, 100);
@@ -98,7 +98,7 @@ void RunMultiVarMsg(platform::Place place) {
   framework::Variable* var1 = scope_recv.FindVar("x1");
   auto* tensor1 = var1->GetMutable<phi::DenseTensor>();
   EXPECT_EQ(tensor1->dims(), phi::make_ddim({512, 8, 4, 2}));
-  // EXPECT_EQ(tensor1->lod(), framework::Vector<size_t>({1, 3, 8}));
+  // EXPECT_EQ(tensor1->lod(), phi::Vector<size_t>({1, 3, 8}));
   auto* tensor_data1 = const_cast<float*>(tensor1->data<float>());
   int tensor_numel1 = 512 * 8 * 4 * 2;
   for (int i = 0; i < tensor_numel1; ++i)
@@ -108,7 +108,7 @@ void RunMultiVarMsg(platform::Place place) {
   framework::Variable* var2 = scope_recv.FindVar("x2");
   auto* tensor2 = var2->GetMutable<phi::DenseTensor>();
   EXPECT_EQ(tensor2->dims(), phi::make_ddim({1000, 64}));
-  // EXPECT_EQ(tensor2->lod(), framework::Vector<size_t>({1, 1}));
+  // EXPECT_EQ(tensor2->lod(), phi::Vector<size_t>({1, 1}));
   auto* tensor_data2 = const_cast<int*>(tensor2->data<int>());
   int tensor_numel2 = 1000 * 64;
   for (int i = 0; i < tensor_numel2; ++i)
     EXPECT_EQ(tensor_data2[i], 100);
paddle/fluid/framework/CMakeLists.txt
@@ -162,27 +162,7 @@ cc_test(
   eigen_test
   SRCS eigen_test.cc
   DEPS tensor)
-cc_library(
-  mixed_vector
-  SRCS mixed_vector.cc
-  DEPS device_context place memory)
-if(WITH_GPU)
-  nv_test(
-    mixed_vector_test
-    SRCS mixed_vector_test.cc mixed_vector_test.cu
-    DEPS mixed_vector place memory device_context tensor)
-elseif(WITH_ROCM)
-  hip_test(
-    mixed_vector_test
-    SRCS mixed_vector_test.cc mixed_vector_test.cu
-    DEPS mixed_vector place memory device_context tensor)
-else()
-  cc_test(
-    mixed_vector_test
-    SRCS mixed_vector_test.cc
-    DEPS mixed_vector place memory device_context tensor)
-endif()
 cc_library(
   lod_tensor
   SRCS lod_tensor.cc
paddle/fluid/framework/data_feed.cc
@@ -2815,7 +2815,7 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) {
     LoD& lod = (*feed->mutable_lod());
     lod.resize(1);
     lod[0].resize(offset_cols_size);
-    paddle::framework::MixVector<size_t> mixv_lod(&lod[0]);
+    phi::MixVector<size_t> mixv_lod(&lod[0]);
     memcpy(mixv_lod.MutableData(platform::CPUPlace()),
            off_start_ptr,
            offset_cols_size * sizeof(size_t));
paddle/fluid/framework/data_type_test.cc
@@ -18,6 +18,7 @@
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/place.h"
 
 TEST(DataType, float16) {
   using paddle::platform::CPUPlace;
paddle/fluid/framework/dlpack_tensor.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace framework {
paddle/fluid/framework/dlpack_tensor_test.cc
@@ -18,6 +18,7 @@
 #include <gtest/gtest.h>
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace framework {
paddle/fluid/framework/eigen_test.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/eigen.h"
 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/core/ddim.h"
 
 namespace paddle {
paddle/fluid/framework/fleet/heter_wrapper.cc
@@ -158,7 +158,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
     LoD lod;
     for (int i = 0; i < req_var.lod_level(); ++i) {
-      framework::Vector<size_t> v;
+      phi::Vector<size_t> v;
       for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
         v.push_back(req_var.lod(i).lod_data(j));
       }
@@ -203,7 +203,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
     LoD lod;
     for (int i = 0; i < req_var.lod_level(); ++i) {
-      framework::Vector<size_t> v;
+      phi::Vector<size_t> v;
       for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
         v.push_back(req_var.lod(i).lod_data(j));
       }
paddle/fluid/framework/lod_tensor.h
@@ -21,12 +21,12 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/mixed_vector.h"
 
 namespace paddle {
 namespace framework {
@@ -54,7 +54,7 @@ void MergeLoDTensor(phi::DenseTensor* target,
  * 0 2 4 7
  * 0 2 5 7 10 12 15 20
  */
-using LoD = std::vector<Vector<size_t>>;
+using LoD = std::vector<phi::Vector<size_t>>;
 
 std::string LoDToString(const LoD& lod);
paddle/fluid/framework/lod_tensor_test.cu
@@ -31,7 +31,7 @@ TEST(LoD, data) {
   lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
 
   auto& v = lod[0];
-  paddle::framework::MixVector<size_t> mix_vector_v(&v);
+  phi::MixVector<size_t> mix_vector_v(&v);
   paddle::platform::CUDAPlace gpu(0);
 #ifdef PADDLE_WITH_HIP
   hipLaunchKernelGGL(test,
@@ -69,7 +69,7 @@ TEST(DenseTensor, LoDInGPU) {
   EXPECT_EQ(lod_tensor.lod_element(0, 4).first, 8UL);
 
   auto lod = lod_tensor.lod();
-  paddle::framework::MixVector<size_t> mix_vector(&(lod[0]));
+  phi::MixVector<size_t> mix_vector(&(lod[0]));
 #ifdef PADDLE_WITH_HIP
   hipLaunchKernelGGL(test,
paddle/fluid/framework/tensor.h
@@ -15,15 +15,15 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/core/sparse_csr_tensor.h"
 
 namespace paddle {
 namespace framework {
 
-using LoD = std::vector<paddle::framework::Vector<size_t>>;
+using LoD = std::vector<phi::Vector<size_t>>;
 
 }  // namespace framework
 }  // namespace paddle
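With tensor.h and lod_tensor.h updated, framework::LoD is now an alias for std::vector<phi::Vector<size_t>>, so LoD levels are built from phi::Vector directly, as the brpc_utils_test.cc hunks above already do. A small sketch under that assumption:

#include <vector>
#include "paddle/phi/core/mixed_vector.h"

// One LoD level with offsets {0, 3, 8}: two sequences of lengths 3 and 5.
std::vector<phi::Vector<size_t>> MakeLod() {
  std::vector<phi::Vector<size_t>> lod;
  lod.push_back(phi::Vector<size_t>({0, 3, 8}));
  return lod;
}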
paddle/fluid/imperative/all_reduce.cc
@@ -104,10 +104,10 @@ static void AllReduce(const phi::SelectedRows &src,
   // 1. Gather rows number from all workers. Here use ncclAllGather to do this,
   // but we can use other ways to implement is in the future
   const auto &src_rows = src.rows();
-  framework::Vector<int64_t> rows_num_vector(strategy.nranks_);
+  phi::Vector<int64_t> rows_num_vector(strategy.nranks_);
   rows_num_vector[strategy.local_rank_] = static_cast<int64_t>(src_rows.size());
   // CUDAMutableData use CalStream
-  paddle::framework::MixVector<int64_t> mixv_rows_num_vector(&rows_num_vector);
+  phi::MixVector<int64_t> mixv_rows_num_vector(&rows_num_vector);
   auto *gpu_rows_num_ptr = mixv_rows_num_vector.CUDAMutableData(place);
   VLOG(4) << "start dev_ctx->wait";
   if (!use_calc_stream) {
@@ -138,9 +138,9 @@ static void AllReduce(const phi::SelectedRows &src,
   auto *dst_rows = dst->mutable_rows();
   dst_rows->resize(rows_num);
-  paddle::framework::MixVector<int64_t> mixv_dst_rows(dst_rows);
+  phi::MixVector<int64_t> mixv_dst_rows(dst_rows);
   auto *dst_rows_ptr = mixv_dst_rows.CUDAMutableData(place);
-  paddle::framework::MixVector<int64_t> mixv_src_rows(&src_rows);
+  phi::MixVector<int64_t> mixv_src_rows(&src_rows);
   const auto *src_rows_ptr = mixv_src_rows.CUDAData(place);
 
   auto *dst_tensor = dst->mutable_value();
paddle/fluid/imperative/gloo_context.cc
@@ -158,9 +158,9 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src,
           << ", height: " << src.height();
   auto *dst_rows = dst->mutable_rows();
   dst_rows->resize(rows_num);
-  paddle::framework::MixVector<int64_t> mixv_dst_rows(dst_rows);
+  phi::MixVector<int64_t> mixv_dst_rows(dst_rows);
   auto *dst_rows_ptr = mixv_dst_rows.MutableData(place);
-  paddle::framework::MixVector<int64_t> mixv_src_rows(&src_rows);
+  phi::MixVector<int64_t> mixv_src_rows(&src_rows);
   const int64_t *src_rows_ptr = mixv_src_rows.Data(place);
 
   auto *dst_tensor = dst->mutable_value();
paddle/fluid/operators/assign_op_test.cc
@@ -98,7 +98,7 @@ TEST(AssignOp, AssignSelectedRows) {
   assign_functor(input);
 
   auto& out_selected_row = output.Get<phi::SelectedRows>();
-  const paddle::framework::Vector<int64_t>& out_rows = out_selected_row.rows();
+  const phi::Vector<int64_t>& out_rows = out_selected_row.rows();
   EXPECT_EQ(rows.size(), out_rows.size());
   for (size_t i = 0; i < rows.size(); ++i) {
     EXPECT_EQ(rows[i], out_rows[i]);
paddle/fluid/operators/ctc_align_op.cu
@@ -129,7 +129,7 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
       // merge elements and delete blank
       T* output_data = output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
 
-      paddle::framework::MixVector<size_t> mixv_input_lod(&input_lod[level]);
+      phi::MixVector<size_t> mixv_input_lod(&input_lod[level]);
       MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
           num_tokens,
           tokens,
paddle/fluid/operators/cvm_op.cu
@@ -166,7 +166,7 @@ class CVMGradCUDAKernel : public framework::OpKernel<T> {
         lod[lod.size() - 1],
         platform::errors::PreconditionNotMet(
             "Output(X@GRAD)'s dim[0] must be equal to last element of lod"));
-    paddle::framework::MixVector<size_t> mixv_lod(&lod);
+    phi::MixVector<size_t> mixv_lod(&lod);
     CvmGradComputeKernel<<<(dx_numel + PADDLE_CUDA_NUM_THREADS - 1) /
                                PADDLE_CUDA_NUM_THREADS,
                            PADDLE_CUDA_NUM_THREADS,
paddle/fluid/operators/detection/box_clip_op.cu
@@ -59,7 +59,7 @@ class GPUBoxClipKernel : public framework::OpKernel<T> {
     auto stream = dev_ctx.stream();
     const size_t batch_size = lod.back().size() - 1;
     T *output_data = output->mutable_data<T>(dev_ctx.GetPlace());
-    paddle::framework::MixVector<size_t> mix_vector(&abs_offset_lod[0]);
+    phi::MixVector<size_t> mix_vector(&abs_offset_lod[0]);
     GPUBoxClip<T, 512><<<batch_size, 512, 0, stream>>>(
         input->data<T>(),
         mix_vector.CUDAMutableData(dev_ctx.GetPlace()),
paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -19,7 +19,6 @@ namespace cub = hipcub;
 #include <paddle/fluid/memory/allocation/allocator.h>
 
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
@@ -28,6 +27,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
 
 namespace paddle {
paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -18,10 +18,10 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/detection/bbox_util.cu.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
paddle/fluid/operators/detection/target_assign_op.h
@@ -121,7 +121,7 @@ class TargetAssignKernel : public framework::OpKernel<T> {
     auto x_lod = x->lod().back();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    paddle::framework::MixVector<size_t> mixv_x_lod(&x_lod);
+    phi::MixVector<size_t> mixv_x_lod(&x_lod);
     size_t* x_lod_data = mixv_x_lod.MutableData(ctx.GetPlace());
 #else
     size_t* x_lod_data = x_lod.data();
@@ -155,7 +155,7 @@ class TargetAssignKernel : public framework::OpKernel<T> {
       const int* neg_idx_data = neg_indices->data<int>();
       auto neg_lod = neg_indices->lod().back();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      paddle::framework::MixVector<size_t> mixv_neg_lod(&neg_lod);
+      phi::MixVector<size_t> mixv_neg_lod(&neg_lod);
       size_t* neg_lod_data = mixv_neg_lod.MutableData(ctx.GetPlace());
 #else
       size_t* neg_lod_data = neg_lod.data();
paddle/fluid/operators/filter_by_instag_op.cu
@@ -30,11 +30,11 @@
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/core/mixed_vector.h"
 
 #if defined(PADDLE_WITH_CUDA)
 namespace cg = cooperative_groups;
@@ -46,7 +46,7 @@ namespace operators {
 using SelectedRows = phi::SelectedRows;
 
 template <typename T>
-using Vector = framework::Vector<T>;
+using Vector = phi::Vector<T>;
 
 #define WARP_SIZE 32
 #define MAX_WARP_NUM 32
@@ -376,7 +376,7 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
     }
     const size_t x2_lods_size = x2_lods.size() - 1;
-    paddle::framework::MixVector<size_t> mixv_x2_lods(&x2_lods);
+    phi::MixVector<size_t> mixv_x2_lods(&x2_lods);
     size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place);
@@ -401,7 +401,7 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
       }
     }
-    paddle::framework::MixVector<size_t> mixv_x1_lods(&x1_lods);
+    phi::MixVector<size_t> mixv_x1_lods(&x1_lods);
     size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place);
     auto* x1_data = x1->data<T>();
@@ -433,12 +433,12 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
     Vector<size_t> out_lods(x2_lods_size + 1, 0);
     Vector<size_t> map_lods(x2_lods_size + 1, 0);
-    paddle::framework::MixVector<size_t> mixv_out_lods(&out_lods);
-    paddle::framework::MixVector<size_t> mixv_map_lods(&map_lods);
+    phi::MixVector<size_t> mixv_out_lods(&out_lods);
+    phi::MixVector<size_t> mixv_map_lods(&map_lods);
     // thrust::device_vector<size_t> out_idx(1);
     Vector<size_t> out_idx(1, 0);
-    paddle::framework::MixVector<size_t> mixv_out_idx(&out_idx);
+    phi::MixVector<size_t> mixv_out_idx(&out_idx);
     size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place);
     size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place);
@@ -500,7 +500,7 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
     } else {
       Vector<size_t> map_lods(2, 0);
-      paddle::framework::MixVector<size_t> mixv_map_lods(&map_lods);
+      phi::MixVector<size_t> mixv_map_lods(&map_lods);
       thrust::device_ptr<int64_t> map_data_ptr(map_data);
       map_data_ptr[0] = 0;
paddle/fluid/operators/filter_by_instag_op.h
@@ -23,16 +23,16 @@
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/core/mixed_vector.h"
 
 namespace paddle {
 namespace operators {
 using SelectedRows = phi::SelectedRows;
 
 template <typename T>
-using Vector = framework::Vector<T>;
+using Vector = phi::Vector<T>;
 
 template <typename T>
 class FilterByInstagKernel : public framework::OpKernel<T> {
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@@ -256,7 +256,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
       auto lod = ids->lod()[0];
       int64_t out_width = d_output->dims()[1];
 
-      framework::Vector<int64_t>* new_rows = d_table->mutable_rows();
+      phi::Vector<int64_t>* new_rows = d_table->mutable_rows();
       new_rows->resize(ids_num);
       std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t));
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
@@ -14,16 +14,16 @@
 #include <string>
 
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/operators/fused/fused_seqpool_cvm_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/phi/core/mixed_vector.h"
 
 namespace paddle {
 namespace operators {
 
 template <typename T>
-using Vector = framework::Vector<T>;
+using Vector = phi::Vector<T>;
 
 #define CUDA_KERNEL_LOOP(i, n)                                 \
   for (auto i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
@@ -441,7 +441,7 @@ class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel<T> {
     int embedding_size = inputs[0]->numel() / inputs[0]->dims()[0];
     int batch_size = -1;
-    std::vector<paddle::framework::MixVector<size_t> *> mix_lods_v(slot_size);
+    std::vector<phi::MixVector<size_t> *> mix_lods_v(slot_size);
     for (size_t i = 0; i < slot_size; ++i) {
       const auto *input = inputs[i];
@@ -480,7 +480,7 @@ class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel<T> {
       }
       output_data[i] = reinterpret_cast<T *>(
           dev_ctx.Alloc<T>(output, output->numel() * sizeof(T)));
-      mix_lods_v[i] = new paddle::framework::MixVector<size_t>(&lods);
+      mix_lods_v[i] = new phi::MixVector<size_t>(&lods);
       lods_data[i] = mix_lods_v[i]->CUDAData(ctx.GetPlace());
       seqpool_outputs[i].Resize({batch_size, embedding_size});
       seqpool_output_data[i] = reinterpret_cast<T *>(dev_ctx.Alloc<T>(
@@ -527,7 +527,7 @@ class FusedSeqpoolCVMGradCUDAKernel : public framework::OpKernel<T> {
     int embedding_size = in_grads[0]->numel() / in_grads[0]->dims()[0];
     int batch_size = -1;
-    std::vector<paddle::framework::MixVector<size_t> *> mix_lods_v(slot_size);
+    std::vector<phi::MixVector<size_t> *> mix_lods_v(slot_size);
     for (size_t i = 0; i < slot_size; ++i) {
       auto *in_grad = in_grads[i];
@@ -563,7 +563,7 @@ class FusedSeqpoolCVMGradCUDAKernel : public framework::OpKernel<T> {
       in_grads_data[i] = reinterpret_cast<T *>(
           dev_ctx.Alloc<T>(in_grad, in_grad->numel() * sizeof(T)));
-      mix_lods_v[i] = new paddle::framework::MixVector<size_t>(&lods);
+      mix_lods_v[i] = new phi::MixVector<size_t>(&lods);
       lods_data[i] = mix_lods_v[i]->CUDAData(ctx.GetPlace());
       cvm_data[i] = reinterpret_cast<const T *>(cvm->data<T>());
     }
paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc
@@ -17,11 +17,11 @@ limitations under the License. */
 #include <memory>
 
 #include "dnnl.hpp"  // NOLINT
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/fused/multi_gru_op.h"
 #include "paddle/phi/backends/onednn/onednn_reuse.h"
+#include "paddle/phi/core/mixed_vector.h"
 
 namespace paddle {
 namespace operators {
@@ -678,7 +678,7 @@ class MultiGRUHandler {
   const std::vector<const phi::DenseTensor*> biases_;
   phi::DenseTensor* hidden_;
   std::vector<dnnl::primitive_attr> attrs_;
-  const paddle::framework::Vector<size_t>& x_lod_;
+  const phi::Vector<size_t>& x_lod_;
 };
 
 template <typename T>
paddle/fluid/operators/gru_op.cc
@@ -372,7 +372,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
     phi::DenseTensor ordered_h0;
 
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);
 
     if (h0) {
       // Since the batch computing for GRU reorders the input sequences
paddle/fluid/operators/gru_op.cu.cc
@@ -75,7 +75,7 @@ class GRUKernel : public framework::OpKernel<T> {
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
     phi::DenseTensor ordered_h0;
 
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);
 
     if (h0) {
       // Since the batch computing for GRU reorders the input sequences
paddle/fluid/operators/gru_op.h
@@ -28,7 +28,7 @@ namespace operators {
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
                              const phi::DenseTensor& src,
-                             framework::Vector<size_t> index_lod,
+                             phi::Vector<size_t> index_lod,
                              phi::DenseTensor* dst,
                              bool indexed_src) {
   phi::funcs::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
@@ -79,7 +79,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
     phi::DenseTensor ordered_h0, ordered_h0_grad;
 
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);
 
     if (h0) {
       ReorderInitState<DeviceContext, T>(
paddle/fluid/operators/lookup_table_op.cu
@@ -169,12 +169,12 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
       auto stream = dev_ctx.stream();
       // copy GPU memory to CPU pinned memory
-      framework::Vector<int64_t> new_rows;
+      phi::Vector<int64_t> new_rows;
       new_rows.resize(ids_num);
       auto gpu_place = context.GetPlace();
 
       // TODO(yuyang18): Strange code here.
-      paddle::framework::MixVector<int64_t> mixv_new_rows(&new_rows);
+      phi::MixVector<int64_t> mixv_new_rows(&new_rows);
       memory::Copy(gpu_place,
                    mixv_new_rows.CUDAMutableData(context.GetPlace()),
                    gpu_place,
paddle/fluid/operators/lookup_table_v2_op.cu
@@ -159,11 +159,11 @@ struct LookupTableV2GradCUDAFunctor {
       dim3 threads(128, 8);
       dim3 grids(8, 1);
       auto stream = dev_ctx.stream();
-      framework::Vector<int64_t> new_rows;
+      phi::Vector<int64_t> new_rows;
       new_rows.resize(ids_num);
       auto gpu_place = context_.GetPlace();
 
-      paddle::framework::MixVector<int64_t> mixv_new_rows(&new_rows);
+      phi::MixVector<int64_t> mixv_new_rows(&new_rows);
       if (!std::is_same<IdT, int64_t>::value) {
         InputTypeConvert<<<grids, threads, 0, stream>>>(
             ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
paddle/fluid/operators/lstm_op.h
@@ -27,7 +27,7 @@ namespace operators {
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
                              const phi::DenseTensor& src,
-                             framework::Vector<size_t> index_lod,
+                             phi::Vector<size_t> index_lod,
                              phi::DenseTensor* dst,
                              bool indexed_src) {
   phi::funcs::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
@@ -95,7 +95,7 @@ class LSTMKernel : public framework::OpKernel<T> {
     lstm_value.prev_state_value = nullptr;
     phi::DenseTensor ordered_c0;
 
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);
 
     if (cell_t0) {
       // Since the batch computing for LSTM reorders the input sequence
@@ -236,7 +236,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
     // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
     // initialization.
     phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);
 
     if (c0) {
       ReorderInitState<DeviceContext, T>(
paddle/fluid/operators/lstmp_op.h
@@ -70,7 +70,7 @@ class _ClipGradFunctor {
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
                              const phi::DenseTensor& src,
-                             framework::Vector<size_t> index,
+                             phi::Vector<size_t> index,
                              phi::DenseTensor* dst,
                              bool indexed_src) {
   phi::funcs::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
@@ -158,7 +158,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
     phi::DenseTensor ordered_c0;
     phi::DenseTensor ordered_h0;
 
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);
 
     if (cell_t0) {
       // Since the batch computing for LSTMP reorders the input sequence
@@ -350,7 +350,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     // initialization.
     phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
 
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);
 
     if (c0) {
       ReorderInitState<DeviceContext, T>(
paddle/fluid/operators/math/beam_search.cu
@@ -446,8 +446,8 @@ class BeamSearchFunctor<phi::GPUContext, T> {
     framework::LoD selected_lod(2);
     selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end());
     selected_lod[1].resize(scores->dims()[0] + 1);
-    paddle::framework::MixVector<size_t> mix_vector(&selected_lod[1]);
-    paddle::framework::MixVector<size_t> mixv_abs(&abs_lod[level]);
+    phi::MixVector<size_t> mix_vector(&selected_lod[1]);
+    phi::MixVector<size_t> mixv_abs(&abs_lod[level]);
     size_t* selected_offsets = mix_vector.CUDAMutableData(context.GetPlace());
 
     if (num_seqs == 1) {
paddle/fluid/operators/math/sequence_padding.cc
@@ -28,7 +28,7 @@ namespace math {
 template <typename T>
 void CopyValidData(phi::DenseTensor* dst_tensor,
                    const phi::DenseTensor* src_tensor,
-                   const framework::Vector<size_t>& seq_offsets,
+                   const phi::Vector<size_t>& seq_offsets,
                    int pad_seq_len,
                    int step_width,
                    bool norm_by_len,
paddle/fluid/operators/math/sequence_padding.cu
@@ -124,7 +124,7 @@ class PaddingLoDTensorFunctor<phi::GPUContext, T> {
     T* pad_data = pad_tensor->data<T>();
     const T* pad_value_data = pad_value.data<T>();
 
-    paddle::framework::MixVector<size_t> mix_vector_seq_offsets(&seq_offsets);
+    phi::MixVector<size_t> mix_vector_seq_offsets(&seq_offsets);
     SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
         pad_data,
         seq_data,
@@ -191,7 +191,7 @@ class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
     const T* pad_data = pad_tensor.data<T>();
     T* seq_data = seq_tensor->data<T>();
 
-    paddle::framework::MixVector<size_t> mixv_seq_offsets(&seq_offsets);
+    phi::MixVector<size_t> mixv_seq_offsets(&seq_offsets);
     SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
         seq_data,
         pad_data,
paddle/fluid/operators/math/sequence_padding.h
@@ -29,7 +29,7 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth };
 enum CopyType { kSeqToPad, kPadToSeq };
 
 inline static size_t MaximumSequenceLength(
-    const framework::Vector<size_t>& seq_offset) {
+    const phi::Vector<size_t>& seq_offset) {
   size_t seq_num = seq_offset.size() - 1;
   size_t max_seq_len = 0;
   for (size_t i = 0; i < seq_num; ++i) {
@@ -39,7 +39,7 @@ inline static size_t MaximumSequenceLength(
 }
 
 inline static size_t TotalSequenceLength(
-    const framework::Vector<size_t>& seq_offset) {
+    const phi::Vector<size_t>& seq_offset) {
   size_t seq_num = seq_offset.size() - 1;
   size_t total_seq_len = 0;
   for (size_t i = 0; i < seq_num; ++i) {
@@ -50,7 +50,7 @@ inline static size_t TotalSequenceLength(
 inline static void CheckDims(const framework::DDim& seq_tensor_dims,
                              const framework::DDim& pad_tensor_dims,
-                             const framework::Vector<size_t>& seq_offset,
+                             const phi::Vector<size_t>& seq_offset,
                              int64_t padded_seq_len,
                              int64_t step_width,
                              const PadLayout& layout) {
paddle/fluid/operators/math/sequence_pooling.cu
@@ -203,7 +203,7 @@ class SequencePoolFunctor<phi::GPUContext, T> {
     const size_t item_dim = output->numel() / output->dims()[0];
     dim3 threads(1024, 1);
     dim3 grid(std::max(static_cast<int>(lod.size()) - 1, 1), 1);
-    paddle::framework::MixVector<size_t> mix_vector(&lod);
+    phi::MixVector<size_t> mix_vector(&lod);
     if (pooltype == "MAX") {
       sequence_pool_kernel<T, MaxPoolFunctor<T>>
           <<<grid, threads, 0, context.stream()>>>(
@@ -421,7 +421,7 @@ class SequencePoolGradFunctor<phi::GPUContext, T> {
     const size_t item_dim = in_grad->numel() / in_grad->dims()[0];
     dim3 threads(1024, 1);
     dim3 grid(std::max(static_cast<int>(lod.size()) - 1, 1), 1);
-    paddle::framework::MixVector<size_t> mix_vector(&lod);
+    phi::MixVector<size_t> mix_vector(&lod);
     if (pooltype == "MAX") {
       sequence_pool_grad_kernel<T, MaxPoolGradFunctor<T>>
           <<<grid, threads, 0, context.stream()>>>(
paddle/fluid/operators/optimizers/ftrl_op.h
@@ -197,7 +197,7 @@ class FTRLOpKernel : public framework::OpKernel<T> {
           ctx.template device_context<DeviceContext>(), *grad, merged_grad);
       auto* merged_rows = merged_grad->mutable_rows();
-      paddle::framework::MixVector<int64_t> mixv_merged_rows(merged_rows);
+      phi::MixVector<int64_t> mixv_merged_rows(merged_rows);
       const int64_t* rows = mixv_merged_rows.Data(ctx.GetPlace());
       auto row_numel = static_cast<int64_t>(merged_grad->value().dims()[1]);
       auto row_height = static_cast<int64_t>(merged_grad->rows().size());
paddle/fluid/operators/optimizers/sgd_op.cu
@@ -164,7 +164,7 @@ class SGDOpKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
       int thread_x = kThreadsPerBlock;
       int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount();
       int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
-      paddle::framework::MixVector<int64_t> mixv_in_rows(&in_rows);
+      phi::MixVector<int64_t> mixv_in_rows(&in_rows);
       SparseSGDFunctorKernel<<<max_blocks,
                                thread_x,
                                0,
paddle/fluid/operators/row_conv_op.cc
@@ -153,7 +153,7 @@ class RowConvKernel<phi::CPUContext, T> : public framework::OpKernel<T> {
     } else {
       batch_size = x->lod()[0].size() - 1;
     }
-    framework::Vector<size_t> batch_indices(batch_size + 1);
+    phi::Vector<size_t> batch_indices(batch_size + 1);
     int input_dim = 0;
     int timesteps = 0;
     if (is_tensor) {
@@ -231,7 +231,7 @@ class RowConvGradKernel<phi::CPUContext, T> : public framework::OpKernel<T> {
     } else {
       batch_size = x->lod()[0].size() - 1;
     }
-    framework::Vector<size_t> batch_indices(batch_size + 1);
+    phi::Vector<size_t> batch_indices(batch_size + 1);
     int timesteps = 0;
     int input_dim = 0;
     if (is_tensor) {
paddle/fluid/operators/row_conv_op.cu
@@ -338,7 +338,7 @@ class RowConvKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
       batch_size = X->lod()[0].size() - 1;
     }
     int input_dim = 0;
-    framework::Vector<size_t> batch_indices(batch_size + 1);
+    phi::Vector<size_t> batch_indices(batch_size + 1);
     int timesteps = X->dims()[1];
     if (is_tensor) {
       for (int i = 0; i < batch_size + 1; i++) {
@@ -352,7 +352,7 @@ class RowConvKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
-    paddle::framework::MixVector<size_t> mix_vector(&batch_indices);
+    phi::MixVector<size_t> mix_vector(&batch_indices);
     size_t *idx = mix_vector.CUDAMutableData(context.GetPlace());
     auto stream = context.cuda_device_context().stream();
@@ -397,7 +397,7 @@ class RowConvGradKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
     }
     int input_dim = 0;
-    framework::Vector<size_t> batch_indices(batch_size + 1);
+    phi::Vector<size_t> batch_indices(batch_size + 1);
     int timesteps = X->dims()[1];
     if (is_tensor) {
       for (int i = 0; i < batch_size + 1; i++) {
@@ -411,7 +411,7 @@ class RowConvGradKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
     // int input_dim = X->dims()[1];
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
-    paddle::framework::MixVector<size_t> mixv_batch_indices(&batch_indices);
+    phi::MixVector<size_t> mixv_batch_indices(&batch_indices);
     size_t *idx = mixv_batch_indices.CUDAMutableData(context.GetPlace());
     auto &device_ctx = context.cuda_device_context();
paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
@@ -76,7 +76,7 @@ class SequenceEnumerateOpCUDAKernel : public framework::OpKernel<T> {
     out->Resize({in_dims[0], win_size});
     auto out_data = out->mutable_data<T>(context.GetPlace());
     // Copy LoD to GPU
-    paddle::framework::MixVector<size_t> mixv_lod0(&lod0);
+    phi::MixVector<size_t> mixv_lod0(&lod0);
     const size_t* dev_in_lod_ptr = mixv_lod0.CUDAData(context.GetPlace());
     // Calc output tensor
     CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
paddle/fluid/operators/sequence_ops/sequence_erase_op.cu
@@ -97,7 +97,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
     // Copy LoD to GPU
     auto last_lod = lod[lod.size() - 1];
     auto lod_len = last_lod.size();
-    paddle::framework::MixVector<size_t> mixv_last_lod(&last_lod);
+    phi::MixVector<size_t> mixv_last_lod(&last_lod);
     const size_t* dev_in_lod_ptr = mixv_last_lod.CUDAData(ctx.GetPlace());
     // Calc output LoD
     thrust::device_vector<size_t> dev_out_lod(lod_len);
paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
@@ -65,10 +65,9 @@ static __global__ void sequence_expand_as_grad_kernel(
 template <typename T>
 struct SequenceExpandAsFunctor<phi::GPUContext, T> {
-  void operator()(
-      const phi::GPUContext &context,
+  void operator()(const phi::GPUContext &context,
                   const phi::DenseTensor &x,
-                  const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
+                  const phi::Vector<size_t> &ref_lod, /*expand referenced lod*/
                   phi::DenseTensor *out) {
     int height = x.dims()[0];
     int width = phi::product(x.dims()) / height;
@@ -84,7 +83,7 @@ struct SequenceExpandAsFunctor<phi::GPUContext, T> {
     dim3 block_size(thread_x);
     dim3 grid_size(block_x);
-    paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
+    phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
     sequence_expand_as_kernel<<<grid_size, block_size, 0, context.stream()>>>(
         x.data<T>(),
         mixv_ref_lod.CUDAData(context.GetPlace()),
@@ -98,7 +97,7 @@ template <typename T>
 struct SequenceExpandAsGradFunctor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext &context,
                   const phi::DenseTensor &dout,
-                  const framework::Vector<size_t> &ref_lod, /*expand based lod*/
+                  const phi::Vector<size_t> &ref_lod, /*expand based lod*/
                   phi::DenseTensor *dx) {
     int height = dx->dims()[0];
     int width = phi::product(dx->dims()) / height;
@@ -114,7 +113,7 @@ struct SequenceExpandAsGradFunctor<phi::GPUContext, T> {
     dim3 block_size(thread_x);
     dim3 grid_size(block_x);
-    paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
+    phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
     sequence_expand_as_grad_kernel<<<grid_size,
                                      block_size,
                                      0,
paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
@@ -26,28 +26,25 @@ namespace operators {
 template <typename DeviceContext, typename T>
 struct SequenceExpandAsFunctor {
-  void operator()(
-      const DeviceContext &ctx,
+  void operator()(const DeviceContext &ctx,
                   const phi::DenseTensor &x,
-                  const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
+                  const phi::Vector<size_t> &ref_lod, /*expand referenced lod*/
                   phi::DenseTensor *out);
 };
 
 template <typename DeviceContext, typename T>
 struct SequenceExpandAsGradFunctor {
-  void operator()(
-      const DeviceContext &ctx,
+  void operator()(const DeviceContext &ctx,
                   const phi::DenseTensor &dout,
-                  const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
+                  const phi::Vector<size_t> &ref_lod, /*expand referenced lod*/
                   phi::DenseTensor *dx);
 };
 
 template <typename T>
 struct SequenceExpandAsFunctor<phi::CPUContext, T> {
-  void operator()(
-      const phi::CPUContext &context,
+  void operator()(const phi::CPUContext &context,
                   const phi::DenseTensor &x,
-                  const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
+                  const phi::Vector<size_t> &ref_lod, /*expand referenced lod*/
                   phi::DenseTensor *out) {
     int64_t height = x.dims()[0];
     int64_t width = phi::product(x.dims()) / height;
@@ -122,10 +119,9 @@ class SequenceExpandAsKernel : public framework::OpKernel<T> {
  * */
 template <typename T>
 struct SequenceExpandAsGradFunctor<phi::CPUContext, T> {
-  void operator()(
-      const phi::CPUContext &context,
+  void operator()(const phi::CPUContext &context,
                   const phi::DenseTensor &dout,
-                  const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
+                  const phi::Vector<size_t> &ref_lod, /*expand referenced lod*/
                   phi::DenseTensor *dx) {
     int64_t height = dx->dims()[0];
     int64_t width = phi::product(dx->dims()) / height;
paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
@@ -82,9 +82,9 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data,
   }
 }
 
-void GetOutputOffset(const framework::Vector<size_t>& x_lod,
-                     const framework::Vector<size_t>& ref_lod,
-                     framework::Vector<size_t>* out_offset) {
+void GetOutputOffset(const phi::Vector<size_t>& x_lod,
+                     const phi::Vector<size_t>& ref_lod,
+                     phi::Vector<size_t>* out_offset) {
   size_t offset = 0;
   int lod_size = static_cast<int>(x_lod.size());
   for (int i = 0; i < static_cast<int>(x_lod.size()); ++i) {
@@ -99,8 +99,8 @@ template <typename T>
 static int ExpandByMemoryCopy(const phi::GPUContext& context,
                               const LoDTensor& x,
                               LoDTensor* out,
-                              const framework::Vector<size_t>& x_lod,
-                              const framework::Vector<size_t>& ref_lod,
+                              const phi::Vector<size_t>& x_lod,
+                              const phi::Vector<size_t>& ref_lod,
                               bool do_copy) {
   auto out_data = out->data<T>();
   auto x_data = x.data<T>();
@@ -143,11 +143,10 @@ static int ExpandByMemoryCopy(const phi::GPUContext& context,
 template <typename T>
 struct SequenceExpandFunctor<phi::GPUContext, T> {
-  void operator()(
-      const phi::GPUContext& context,
+  void operator()(const phi::GPUContext& context,
                   const LoDTensor& x,
-                  const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-                  const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+                  const phi::Vector<size_t>& x_lod,   /*expand source lod*/
+                  const phi::Vector<size_t>& ref_lod, /*expand referenced lod*/
                   LoDTensor* out) {
     int num_copys =
         ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, false);
@@ -157,7 +156,7 @@ struct SequenceExpandFunctor<phi::GPUContext, T> {
     } else {
       int x_item_length = x.numel() / x.dims()[0];
       size_t x_lod_size = x_lod.size();
-      framework::Vector<size_t> out_offset(x_lod_size * 2 + ref_lod.size());
+      phi::Vector<size_t> out_offset(x_lod_size * 2 + ref_lod.size());
       GetOutputOffset(x_lod, ref_lod, &out_offset);
 
       for (size_t i = 0; i < x_lod_size; ++i) {
@@ -167,7 +166,7 @@ struct SequenceExpandFunctor<phi::GPUContext, T> {
         out_offset[2 * x_lod_size + i] = ref_lod[i];
       }
 
-      paddle::framework::MixVector<size_t> mixv_out_offset(&out_offset);
+      phi::MixVector<size_t> mixv_out_offset(&out_offset);
       const size_t* out_offset_data =
           mixv_out_offset.CUDAData(context.GetPlace());
       const size_t* x_lod_data = out_offset_data + x_lod_size;
@@ -197,11 +196,11 @@ template <typename T>
 struct SequenceExpandGradFunctor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
                   const LoDTensor& dout,
-                  const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-                  const framework::Vector<size_t>& ref_lod, /*expand based lod*/
+                  const phi::Vector<size_t>& x_lod,   /*expand source lod*/
+                  const phi::Vector<size_t>& ref_lod, /*expand based lod*/
                   LoDTensor* dx) {
     int x_item_length = phi::product(dx->dims()) / dx->dims()[0];
-    framework::Vector<size_t> out_offset(x_lod.size());
+    phi::Vector<size_t> out_offset(x_lod.size());
     GetOutputOffset(x_lod, ref_lod, &out_offset);
 
     int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
@@ -210,9 +209,9 @@ struct SequenceExpandGradFunctor<phi::GPUContext, T> {
     int block_x = static_cast<int>(ref_lod.size());
     dim3 block_size(thread_x, thread_y, thread_z);
     dim3 grid_size(block_x, 1);
-    paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
-    paddle::framework::MixVector<size_t> mixv_x_lod(&x_lod);
-    paddle::framework::MixVector<size_t> mixv_out_offset(&out_offset);
+    phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
+    phi::MixVector<size_t> mixv_x_lod(&x_lod);
+    phi::MixVector<size_t> mixv_out_offset(&out_offset);
     sequence_expand_grad_kernel<<<grid_size, block_size, 0, context.stream()>>>(
         dout.data<T>(),
         mixv_ref_lod.CUDAData(context.GetPlace()),
paddle/fluid/operators/sequence_ops/sequence_expand_op.h
@@ -29,31 +29,28 @@ using EigenMatrix = phi::EigenMatrix<T, MajorType, IndexType>;
 template <typename DeviceContext, typename T>
 struct SequenceExpandFunctor {
-  void operator()(
-      const DeviceContext& ctx,
+  void operator()(const DeviceContext& ctx,
                   const phi::DenseTensor& x,
-                  const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-                  const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+                  const phi::Vector<size_t>& x_lod,   /*expand source lod*/
+                  const phi::Vector<size_t>& ref_lod, /*expand referenced lod*/
                   phi::DenseTensor* out);
 };
 
 template <typename DeviceContext, typename T>
 struct SequenceExpandGradFunctor {
-  void operator()(
-      const DeviceContext& ctx,
+  void operator()(const DeviceContext& ctx,
                   const phi::DenseTensor& dout,
-                  const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-                  const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+                  const phi::Vector<size_t>& x_lod,   /*expand source lod*/
+                  const phi::Vector<size_t>& ref_lod, /*expand referenced lod*/
                   phi::DenseTensor* dx);
 };
 
 template <typename T>
 struct SequenceExpandFunctor<phi::CPUContext, T> {
-  void operator()(
-      const phi::CPUContext& context,
+  void operator()(const phi::CPUContext& context,
                   const phi::DenseTensor& x,
-                  const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-                  const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+                  const phi::Vector<size_t>& x_lod,   /*expand source lod*/
+                  const phi::Vector<size_t>& ref_lod, /*expand referenced lod*/
                   phi::DenseTensor* out) {
     int out_offset = 0;
     int x_item_length = x.numel() / x.dims()[0];
@@ -112,7 +109,7 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
     }
 
     // x lod level is at most 1.
-    framework::Vector<size_t> out_lod;
+    phi::Vector<size_t> out_lod;
     if (x_lod.size() == 1) {
       out_lod.push_back(0);
       int out_offset = 0;
@@ -130,7 +127,7 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
       auto& ref_lod = *out->mutable_lod();
       ref_lod[0] = out_lod;
     }
-    framework::Vector<size_t> ref_x_lod;
+    phi::Vector<size_t> ref_x_lod;
     if (x->lod().size() == 1) {
       ref_x_lod = x->lod()[0];
     } else {
@@ -161,11 +158,10 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
  * */
 template <typename T>
 struct SequenceExpandGradFunctor<phi::CPUContext, T> {
-  void operator()(
-      const phi::CPUContext& context,
+  void operator()(const phi::CPUContext& context,
                   const phi::DenseTensor& dout,
-                  const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-                  const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+                  const phi::Vector<size_t>& x_lod,   /*expand source lod*/
+                  const phi::Vector<size_t>& ref_lod, /*expand referenced lod*/
                   phi::DenseTensor* dx) {
     int dout_offset = 0;
     for (size_t i = 1; i < ref_lod.size(); ++i) {
@@ -214,8 +210,8 @@ class SequenceExpandGradKernel : public framework::OpKernel<T> {
       return;
     }
 
-    framework::Vector<size_t> ref_x_lod;
-    framework::Vector<size_t> ref_lod = y_lod[ref_level];
+    phi::Vector<size_t> ref_x_lod;
+    phi::Vector<size_t> ref_lod = y_lod[ref_level];
     if (x->lod().size() == 1) {
       ref_x_lod = x->lod()[0];
     } else {
paddle/fluid/operators/sequence_ops/sequence_reverse_op.h (view file @ 35d7d1f0)
...
@@ -139,7 +139,7 @@ class SequenceReverseOpKernel : public framework::OpKernel<T> {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (platform::is_gpu_place(ctx.GetPlace())) {
      auto xlod = x.lod()[0];
-     paddle::framework::MixVector<size_t> mixv_xlod(&xlod);
+     phi::MixVector<size_t> mixv_xlod(&xlod);
      lod = mixv_xlod.CUDAData(ctx.GetPlace());
    } else {
 #endif
...
paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu (view file @ 35d7d1f0)
...
@@ -124,7 +124,7 @@ template <typename T>
 struct SequenceSoftmaxFunctor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
                   const LoDTensor& x,
-                  const framework::Vector<size_t>& ref_lod, /*referenced lod*/
+                  const phi::Vector<size_t>& ref_lod, /*referenced lod*/
                   LoDTensor* out) {
    int height = ref_lod.size() - 1;
...
@@ -135,7 +135,7 @@ struct SequenceSoftmaxFunctor<phi::GPUContext, T> {
    dim3 block_size(thread_x);
    dim3 grid_size(max_blocks);
-   paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
+   phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
    sequence_softmax_kernel<T, kThreadsPerBlock>
        <<<grid_size, block_size, 0, context.stream()>>>(
            x.data<T>(),
...
@@ -150,7 +150,7 @@ struct SequenceSoftmaxGradFunctor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
                   const LoDTensor& dout,
                   const LoDTensor& out,
-                  const framework::Vector<size_t>& ref_lod, /*referenced lod*/
+                  const phi::Vector<size_t>& ref_lod, /*referenced lod*/
                   LoDTensor* dx) {
    size_t height = ref_lod.size() - 1;
...
@@ -162,7 +162,7 @@ struct SequenceSoftmaxGradFunctor<phi::GPUContext, T> {
    dim3 block_size(thread_x);
    dim3 grid_size(max_blocks);
-   paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
+   phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
    sequence_softmax_grad_kernel<T, kThreadsPerBlock>
        <<<grid_size, block_size, 0, context.stream()>>>(
            dout.data<T>(),
...
paddle/fluid/operators/sequence_ops/sequence_softmax_op.h (view file @ 35d7d1f0)
...
@@ -21,10 +21,9 @@ namespace operators {
 template <typename DeviceContext, typename T>
 struct SequenceSoftmaxFunctor {
-  void operator()(const DeviceContext& ctx,
+  void operator()(const DeviceContext& ctx,
                   const phi::DenseTensor& x,
-                  const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+                  const phi::Vector<size_t>& ref_lod, /*expand referenced lod*/
                   phi::DenseTensor* out);
 };
...
@@ -33,7 +32,7 @@ struct SequenceSoftmaxGradFunctor {
   void operator()(const DeviceContext& ctx,
                   const phi::DenseTensor& dout,
                   const phi::DenseTensor& out,
-                  const framework::Vector<size_t>& ref_lod, /*referenced lod*/
+                  const phi::Vector<size_t>& ref_lod, /*referenced lod*/
                   phi::DenseTensor* dx);
 };
...
@@ -41,7 +40,7 @@ template <typename T>
 struct SequenceSoftmaxFunctor<phi::CPUContext, T> {
   void operator()(const phi::CPUContext& ctx,
                   const phi::DenseTensor& x,
-                  const framework::Vector<size_t>& ref_lod, /*referenced lod*/
+                  const phi::Vector<size_t>& ref_lod, /*referenced lod*/
                   phi::DenseTensor* out) {
    size_t height = ref_lod.size() - 1;
    const T* in_data = x.data<T>();
...
@@ -64,7 +63,7 @@ struct SequenceSoftmaxGradFunctor<phi::CPUContext, T> {
   void operator()(const phi::CPUContext& ctx,
                   const phi::DenseTensor& dout,
                   const phi::DenseTensor& out,
-                  const framework::Vector<size_t>& ref_lod, /*referenced lod*/
+                  const phi::Vector<size_t>& ref_lod, /*referenced lod*/
                   phi::DenseTensor* dx) {
    size_t height = ref_lod.size() - 1;
...
paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h (view file @ 35d7d1f0)
...
@@ -116,7 +116,7 @@ class SequenceTopkAvgPoolingKernel : public framework::OpKernel<T> {
    auto pos_data = pos->mutable_data<int>(context.GetPlace());
    int offset = 0;
-   framework::Vector<size_t> vec_out_lod;
+   phi::Vector<size_t> vec_out_lod;
    vec_out_lod.reserve(batch_size + 1);
    for (int i = 0; i <= batch_size; ++i) {
      offset = row_lod[i];
...
paddle/fluid/operators/shuffle_batch_op.h (view file @ 35d7d1f0)
...
@@ -25,16 +25,16 @@
 #include "glog/logging.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/timer.h"
+#include "paddle/phi/core/mixed_vector.h"
 namespace paddle {
 namespace operators {
 template <typename T>
-using Vector = framework::Vector<T>;
+using Vector = phi::Vector<T>;
 template <typename T>
 class ShuffleBatchKernel : public framework::OpKernel<T> {
...
paddle/fluid/operators/tdm_child_op.h (view file @ 35d7d1f0)
...
@@ -22,8 +22,8 @@
 #include <vector>
 #include "gflags/gflags.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/mixed_vector.h"
 namespace paddle {
 namespace operators {
...
paddle/fluid/operators/tdm_sampler_op.h (view file @ 35d7d1f0)
...
@@ -22,9 +22,9 @@
 #include <vector>
 #include "gflags/gflags.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/sampler.h"
+#include "paddle/phi/core/mixed_vector.h"
 namespace paddle {
 namespace operators {
...
paddle/fluid/pybind/tensor.cc (view file @ 35d7d1f0)
...
@@ -1095,7 +1095,7 @@ void BindTensor(pybind11::module &m) { // NOLINT
 #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
            self.set_rows(rows);
 #else
-           Vector<int64_t> new_rows(rows);
+           std::vector<int64_t> new_rows(rows);
            self.set_rows(new_rows);
 #endif
          })
...
paddle/phi/core/CMakeLists.txt (view file @ 35d7d1f0)
...
@@ -114,6 +114,11 @@ cc_library(
   SRCS custom_kernel.cc
   DEPS kernel_factory)
+cc_library(
+  mixed_vector
+  SRCS mixed_vector.cc
+  DEPS device_context place memory)
 # Will remove once we implemented MKLDNN_Tensor
 if(WITH_MKLDNN)
   add_dependencies(dense_tensor mkldnn)
...
paddle/fluid/framework/mixed_vector.cc → paddle/phi/core/mixed_vector.cc (view file @ 35d7d1f0)
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include <algorithm>
 #include <initializer_list>
...
@@ -22,28 +22,26 @@ limitations under the License. */
 #include <vector>
 #include "glog/logging.h"
-#include "paddle/fluid/framework/details/cow_ptr.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/all_context.h"
 #include "paddle/utils/none.h"
 #include "paddle/utils/optional.h"
-namespace paddle {
-namespace framework {
+namespace phi {
 template <typename T>
 void CopyToCPUHelper(std::vector<T> *cpu_,
-                     paddle::memory::AllocationPtr *gpu_,
+                     phi::Allocator::AllocationPtr *gpu_,
                      size_t *gpu_memory_size_) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // COPY GPU Data To CPU
   auto *dev_ctx = static_cast<phi::GPUContext *>(
-      platform::DeviceContextPool::Instance().Get((*gpu_)->place()));
+      phi::DeviceContextPool::Instance().Get((*gpu_)->place()));
   auto stream = dev_ctx->stream();
   void *src = (*gpu_)->ptr();
   void *dst = cpu_->data();
-  paddle::memory::Copy(platform::CPUPlace(),
+  paddle::memory::Copy(phi::CPUPlace(),
                       dst,
                       OptionalCUDAPlace(*gpu_).get(),
                       src,
...
@@ -55,20 +53,20 @@ void CopyToCPUHelper(std::vector<T> *cpu_,
 template <typename T>
 void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
-                             paddle::memory::AllocationPtr *gpu_,
+                             phi::Allocator::AllocationPtr *gpu_,
                              size_t *gpu_memory_size_,
-                             const platform::Place &place) {
+                             const phi::Place &place) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void *src = cpu_->data();
   *gpu_memory_size_ = cpu_->size() * sizeof(T);  // sizeof(T)
-  (*gpu_) = memory::Alloc(place, *gpu_memory_size_);
+  (*gpu_) = paddle::memory::Alloc(place, *gpu_memory_size_);
   void *dst = (*gpu_)->ptr();
   auto *dev_ctx = static_cast<phi::GPUContext *>(
-      platform::DeviceContextPool::Instance().Get(place));
+      phi::DeviceContextPool::Instance().Get(place));
   auto stream = dev_ctx->stream();
   paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(),
                       dst,
-                      platform::CPUPlace(),
+                      phi::CPUPlace(),
                       src,
                       *gpu_memory_size_,
                       stream);
...
@@ -84,7 +82,7 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
                                                                             \
  template <>                                                                \
  void MixVector<__TYPE__>::VectorData::CopyCPUDataToCUDA(                   \
-     const platform::Place &place) const {                                  \
+     const phi::Place &place) const {                                       \
    CopyCPUDataToCUDAHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_, place); \
  }
...
@@ -92,5 +90,4 @@ INSTANTIATE_VECTOR_FOR_TYPE(size_t)
 INSTANTIATE_VECTOR_FOR_TYPE(int)
 INSTANTIATE_VECTOR_FOR_TYPE(int64_t)
-};  // namespace framework
-}  // namespace paddle
+};  // namespace phi
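With the implementation moved into the phi namespace, call-site migration is mostly mechanical. A hedged before/after sketch (Example and src_rows are illustrative names, not code from this commit):

#include <cstdint>
#include <vector>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/mixed_vector.h"

// Before (fluid):
//   #include "paddle/fluid/framework/mixed_vector.h"
//   paddle::framework::Vector<int64_t> rows(src_rows);
//   paddle::framework::MixVector<int64_t> mix_rows(&rows);
//   const int64_t* ptr = mix_rows.CUDAData(context.GetPlace());
//
// After (phi):
void Example(const phi::GPUContext& context,
             const std::vector<int64_t>& src_rows) {
  phi::Vector<int64_t> rows(src_rows);      // phi::Vector<T> is std::vector<T>
  phi::MixVector<int64_t> mix_rows(&rows);  // same wrapper, new namespace
  const int64_t* ptr = mix_rows.CUDAData(context.GetPlace());
  (void)ptr;
}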
paddle/fluid/framework/mixed_vector.h → paddle/phi/core/mixed_vector.h (view file @ 35d7d1f0)
...
@@ -22,20 +22,22 @@ limitations under the License. */
 #include <vector>
 #include "glog/logging.h"
-#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/allocator.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/errors.h"
 #include "paddle/utils/none.h"
 #include "paddle/utils/optional.h"
-namespace paddle {
-namespace framework {
+namespace phi {
 template <class T>
 using Vector = std::vector<T>;
-inline paddle::optional<platform::CUDAPlace> OptionalCUDAPlace(
-    const paddle::memory::allocation::AllocationPtr &gpu_) {
+inline paddle::optional<phi::GPUPlace> OptionalCUDAPlace(
+    const phi::Allocator::AllocationPtr &gpu_) {
   return gpu_ == nullptr ? paddle::none
-                         : paddle::optional<platform::CUDAPlace>(gpu_->place());
+                         : paddle::optional<phi::GPUPlace>(gpu_->place());
 }
 // Vector<T> implements the std::vector interface, and can get Data or
...
@@ -146,18 +148,18 @@ class MixVector {
   }
   // get cuda ptr. immutable
-  const T *CUDAData(platform::Place place) const {
+  const T *CUDAData(phi::Place place) const {
    PADDLE_ENFORCE_EQ(
-       platform::is_gpu_place(place),
+       place.GetType() == phi::AllocationType::GPU,
        true,
-       platform::errors::Unavailable(
+       phi::errors::Unavailable(
           "Place mismatch, CUDA Data must be on CUDA place."));
    ImmutableCUDA(place);
    return reinterpret_cast<T *>(gpu_->ptr());
   }
   // get cuda ptr. mutable
-  T *CUDAMutableData(platform::Place place) {
+  T *CUDAMutableData(phi::Place place) {
    const T *ptr = CUDAData(place);
    flag_ = kDirty | kDataInCUDA;
    return const_cast<T *>(ptr);
...
@@ -178,7 +180,7 @@ class MixVector {
   std::mutex &Mutex() const { return mtx_; }
-  paddle::optional<platform::CUDAPlace> CUDAPlace() const {
+  paddle::optional<phi::GPUPlace> CUDAPlace() const {
    return OptionalCUDAPlace(gpu_);
   }
...
@@ -199,7 +201,7 @@ class MixVector {
   void CopyToCPU() const;
-  void ImmutableCUDA(platform::Place place) const {
+  void ImmutableCUDA(phi::Place place) const {
    if (IsDirty()) {
      if (IsInCPU()) {
        CopyCPUDataToCUDA(place);
...
@@ -207,7 +209,7 @@ class MixVector {
        SetFlag(kDataInCUDA);
      } else if (IsInCUDA() && !(place == gpu_->place())) {
        PADDLE_THROW(
-           platform::errors::Unavailable("Unexpected data place mismatch."));
+           phi::errors::Unavailable("Unexpected data place mismatch."));
        // Still dirty
      } else {
        // Dirty && DataInCUDA && Device is same
...
@@ -220,7 +222,7 @@ class MixVector {
        SetFlag(kDataInCUDA);
      } else if (!(place == gpu_->place())) {
        PADDLE_THROW(
-           platform::errors::Unavailable("Unexpected data place mismatch."));
+           phi::errors::Unavailable("Unexpected data place mismatch."));
      } else {
        // Not Dirty && DataInCUDA && Device is same
        // Do nothing.
...
@@ -228,7 +230,7 @@ class MixVector {
    }
   }
-  void CopyCPUDataToCUDA(const platform::Place &place) const;
+  void CopyCPUDataToCUDA(const phi::Place &place) const;
   void ImmutableCPU() const {
    if (IsDirty() && !IsInCPU()) {
      // If data has been changed in CUDA, or
...
@@ -249,7 +251,7 @@ class MixVector {
   bool IsInCPU() const { return flag_ & kDataInCPU; }
   std::vector<T> *cpu_;
-  mutable paddle::memory::allocation::AllocationPtr gpu_;
+  mutable phi::Allocator::AllocationPtr gpu_;
   mutable size_t gpu_memory_size_{0};
   mutable int flag_;
...
@@ -332,9 +334,9 @@ class MixVector {
   }
   // get cuda ptr. immutable
-  const T *CUDAData(platform::Place place) const {
+  const T *CUDAData(phi::Place place) const {
    {
-     platform::CUDAPlace p(place.GetDeviceId());
+     phi::GPUPlace p(place.GetDeviceId());
      auto &mtx = m_->Mutex();
      std::lock_guard<std::mutex> guard(mtx);
      auto cuda_place = m_->CUDAPlace();
...
@@ -348,9 +350,9 @@ class MixVector {
   }
   // get cuda ptr. mutable
-  T *CUDAMutableData(platform::Place place) {
+  T *CUDAMutableData(phi::Place place) {
    {
-     platform::CUDAPlace p(place.GetDeviceId());
+     phi::GPUPlace p(place.GetDeviceId());
      auto &mtx = m_->Mutex();
      std::lock_guard<std::mutex> guard(mtx);
      auto cuda_place = m_->CUDAPlace();
...
@@ -372,8 +374,8 @@ class MixVector {
   void reserve(size_t size) { m_->reserve(size); }
   // the unify method to access CPU or CUDA data. immutable.
-  const T *Data(platform::Place place) const {
-    if (platform::is_gpu_place(place)) {
+  const T *Data(phi::Place place) const {
+    if (place.GetType() == phi::AllocationType::GPU) {
      return CUDAData(place);
    } else {
      return data();
...
@@ -381,8 +383,8 @@ class MixVector {
   }
   // the unify method to access CPU or CUDA data. mutable.
-  T *MutableData(platform::Place place) {
-    if (platform::is_gpu_place(place)) {
+  T *MutableData(phi::Place place) {
+    if (place.GetType() == phi::AllocationType::GPU) {
      return CUDAMutableData(place);
    } else {
      return data();
...
@@ -397,5 +399,4 @@ class MixVector {
   mutable std::unique_ptr<VectorData> m_;
 };
-};  // namespace framework
-}  // namespace paddle
+};  // namespace phi
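The header above shows Data()/MutableData() dispatching on place.GetType(), so callers do not need separate CPU and GPU code paths. A short assumed-usage sketch (UseOnEitherPlace is an illustrative name):

#include "paddle/phi/common/place.h"
#include "paddle/phi/core/mixed_vector.h"

// Data(place) yields a host pointer on CPU places and a device pointer on GPU
// places; the wrapper owns the device copy, so keep it alive while the pointer
// is in use.
void UseOnEitherPlace(const phi::Place& place, phi::Vector<size_t>* lod) {
  phi::MixVector<size_t> mix(lod);
  const size_t* ptr = mix.Data(place);
  (void)ptr;  // read directly on CPU, or pass to a kernel on GPU
}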
paddle/phi/kernels/cpu/edit_distance_kernel.cc (view file @ 35d7d1f0)
...
@@ -14,10 +14,10 @@
 #include "paddle/phi/kernels/edit_distance_kernel.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 namespace phi {
...
@@ -34,8 +34,8 @@ void EditDistanceKernel(const Context& ctx,
   int64_t* seq_num_data = ctx.template Alloc<int64_t>(sequencenum);
   auto batch_size = hyps.dims()[0];
-  paddle::framework::Vector<size_t> hyp_lod(batch_size + 1);
-  paddle::framework::Vector<size_t> ref_lod(batch_size + 1);
+  phi::Vector<size_t> hyp_lod(batch_size + 1);
+  phi::Vector<size_t> ref_lod(batch_size + 1);
   bool use_length = hypslength.get_ptr() != nullptr;
...
paddle/phi/kernels/funcs/selected_rows_functor.cc (view file @ 35d7d1f0)
...
@@ -14,8 +14,8 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/selected_rows_functor.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/phi/core/mixed_vector.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/phi/backends/onednn/axpy_handler.h"
...
@@ -200,7 +200,7 @@ struct SelectedRowsAddTo<phi::CPUContext, T> {
    auto* in2_value = input2->mutable_value();
    // concat rows
-   paddle::framework::MixVector<int64_t> mixv_in2_rows(&in2_rows);
+   phi::MixVector<int64_t> mixv_in2_rows(&in2_rows);
    mixv_in2_rows.Extend(in1_rows.begin(), in1_rows.end());
    auto in1_place = input1.place();
...
@@ -254,7 +254,7 @@ struct SelectedRowsSumTo<phi::CPUContext, T> {
    std::vector<int64_t> in2_rows;
    in2_rows.reserve(in2_rows.size() + size);
    for (auto iter = input1.begin(); iter != input1.end(); ++iter) {
-     const paddle::framework::Vector<int64_t>& in_rows = (*iter)->rows();
+     const phi::Vector<int64_t>& in_rows = (*iter)->rows();
      in2_rows.insert(in2_rows.end(), in_rows.begin(), in_rows.end());
    }
    input2->set_rows(in2_rows);
...
@@ -646,7 +646,7 @@ struct MergeAdd<phi::XPUContext, T> {
                  const phi::SelectedRows& input,
                  phi::SelectedRows* output,
                  const bool sorted_result = false) {
-   paddle::framework::Vector<int64_t> input_rows(input.rows());
+   phi::Vector<int64_t> input_rows(input.rows());
    if (input_rows.size() == 0) {
      return;
    }
...
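The SelectedRowsAddTo hunk above concatenates row indices through the wrapper's Extend(). A reduced sketch of that pattern (AppendRows, src, and dst are illustrative names, not commit code):

#include <cstdint>
#include <vector>
#include "paddle/phi/core/mixed_vector.h"

// Append the row indices of `src` onto `dst`, as the CPU SelectedRowsAddTo
// functor above does with in1_rows and in2_rows.
void AppendRows(const std::vector<int64_t>& src, phi::Vector<int64_t>* dst) {
  phi::MixVector<int64_t> mix_dst(dst);
  mix_dst.Extend(src.begin(), src.end());  // keeps host/device views in sync
}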
paddle/phi/kernels/funcs/selected_rows_functor.cu (view file @ 35d7d1f0)
...
@@ -40,7 +40,7 @@ struct SelectedRowsAdd<phi::GPUContext, T> {
                          input2.height()));
    output->set_height(in1_height);
-   paddle::framework::Vector<int64_t> in1_rows(input1.rows());
+   phi::Vector<int64_t> in1_rows(input1.rows());
    auto& in2_rows = input2.rows();
    std::vector<int64_t> out_rows;
    out_rows.reserve(in1_rows.size() + in2_rows.size());
...
@@ -189,7 +189,7 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
    const int block_size = 256;
    dim3 threads(block_size, 1);
    dim3 grid(in1_rows.size(), 1);
-   paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
+   phi::MixVector<int64_t> mixv_in1_rows(&in1_rows);
    SelectedRowsAddTensorKernel<T, block_size>
        <<<grid, threads, 0, context.stream()>>>(
            in1_data,
...
@@ -231,7 +231,7 @@ struct SelectedRowsAddTo<phi::GPUContext, T> {
    auto* in2_value = input2->mutable_value();
    // concat rows
-   paddle::framework::MixVector<int64_t> mixv_in2_rows(&in2_rows);
+   phi::MixVector<int64_t> mixv_in2_rows(&in2_rows);
    if (in1_rows.size()) {
      mixv_in2_rows.Extend(in1_rows.begin(), in1_rows.end());
    }
...
@@ -318,7 +318,7 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
    const int block_size = 256;
    dim3 threads(block_size, 1);
    dim3 grid(in1_rows.size(), 1);
-   paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
+   phi::MixVector<int64_t> mixv_in1_rows(&in1_rows);
    SelectedRowsAddToTensorKernel<T, block_size>
        <<<grid, threads, 0, context.stream()>>>(
            in1_data,
...
@@ -378,7 +378,7 @@ struct MergeAddImpl {
                  const phi::SelectedRows& input,
                  phi::SelectedRows* output,
                  const bool sorted_result = false) {
-   paddle::framework::Vector<int64_t> input_rows(input.rows());
+   phi::Vector<int64_t> input_rows(input.rows());
    if (input_rows.size() == 0) {
      return;
    }
...
@@ -386,7 +386,7 @@ struct MergeAddImpl {
    phi::SelectedRows& out = *output;
    std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
    std::vector<int64_t> merge_rows_cpu(row_set.begin(), row_set.end());
-   paddle::framework::Vector<int64_t> merge_rows(merge_rows_cpu);
+   phi::Vector<int64_t> merge_rows(merge_rows_cpu);
    auto input_width = input.value().dims()[1];
...
@@ -407,8 +407,8 @@ struct MergeAddImpl {
    dim3 threads(block_size, 1);
    dim3 grid1(input_rows.size(), 1);
-   paddle::framework::MixVector<int64_t> mix_vector_input(&input_rows);
-   paddle::framework::MixVector<int64_t> mix_vector_out(out.mutable_rows());
+   phi::MixVector<int64_t> mix_vector_input(&input_rows);
+   phi::MixVector<int64_t> mix_vector_out(out.mutable_rows());
    MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
        input_data,
        mix_vector_input.CUDAData(context.GetPlace()),
...
@@ -459,7 +459,7 @@ struct MergeAddImpl {
    }
    std::vector<int64_t> merge_rows_cpu(merged_row_set.begin(),
                                        merged_row_set.end());
-   paddle::framework::Vector<int64_t> merge_rows(merge_rows_cpu);
+   phi::Vector<int64_t> merge_rows(merge_rows_cpu);
    out.set_rows(merge_rows);
    out.set_height(input_height);
...
@@ -485,8 +485,8 @@ struct MergeAddImpl {
    auto& input_rows = input->rows();
    dim3 grid1(input_rows.size(), 1);
-   paddle::framework::MixVector<int64_t> mix_vector_input(&input_rows);
-   paddle::framework::MixVector<int64_t> mix_vector_out(out.mutable_rows());
+   phi::MixVector<int64_t> mix_vector_input(&input_rows);
+   phi::MixVector<int64_t> mix_vector_out(out.mutable_rows());
    MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
        input_data,
        mix_vector_input.CUDAData(context.GetPlace()),
...
paddle/phi/kernels/funcs/sequence2batch.cc (view file @ 35d7d1f0)
...
@@ -22,7 +22,7 @@ class CopyMatrixRowsFunctor<phi::CPUContext, T> {
  public:
   void operator()(const phi::CPUContext& context,
                   const phi::DenseTensor& src,
-                  paddle::framework::Vector<size_t> index_lod,
+                  phi::Vector<size_t> index_lod,
                   phi::DenseTensor* dst,
                   bool is_src_index) {
    size_t* index = index_lod.data();
...
paddle/phi/kernels/funcs/sequence2batch.cu (view file @ 35d7d1f0)
...
@@ -43,7 +43,7 @@ class CopyMatrixRowsFunctor<phi::GPUContext, T> {
  public:
   void operator()(const phi::GPUContext& context,
                   const phi::DenseTensor& src,
-                  paddle::framework::Vector<size_t> index_lod,
+                  phi::Vector<size_t> index_lod,
                   phi::DenseTensor* dst,
                   bool is_src_index) {
    auto src_dims = src.dims();
...
@@ -79,7 +79,7 @@ class CopyMatrixRowsFunctor<phi::GPUContext, T> {
    dim3 threads(128, 8);
    dim3 grid(8, 1);
    auto stream = context.stream();
-   paddle::framework::MixVector<size_t> mix_index_lod(&index_lod);
+   phi::MixVector<size_t> mix_index_lod(&index_lod);
    CopyMatrixRowsKernel<T, 128, 8, 8><<<grid, threads, 0, stream>>>(
        src_data,
        dst_data,
...
paddle/phi/kernels/funcs/sequence2batch.h (view file @ 35d7d1f0)
...
@@ -38,7 +38,7 @@ class CopyMatrixRowsFunctor {
   // The indexed rows are based on the input index.
   void operator()(const DeviceContext& context,
                   const phi::DenseTensor& src,
-                  paddle::framework::Vector<size_t> index_lod,
+                  phi::Vector<size_t> index_lod,
                   phi::DenseTensor* dst,
                   bool is_src_index);
 };
...
paddle/phi/kernels/funcs/sequence_scale.cu (view file @ 35d7d1f0)
...
@@ -46,7 +46,7 @@ class ScaleLoDTensorFunctor<phi::GPUContext, T> {
    const size_t seq_width = seq->numel() / seq->dims()[0];
    auto abs_offset_lod = paddle::framework::ToAbsOffset(lod);
    T* seq_data = context.template Alloc<T>(seq);
-   paddle::framework::MixVector<size_t> mix_vector(&(abs_offset_lod[level]));
+   phi::MixVector<size_t> mix_vector(&(abs_offset_lod[level]));
 #ifdef PADDLE_WITH_HIP
    hipLaunchKernelGGL(
...
paddle/phi/kernels/gpu/adagrad_kernel.cu (view file @ 35d7d1f0)
...
@@ -88,7 +88,7 @@ struct SparseAdagradFunctor<phi::GPUContext, T> {
    phi::funcs::scatter::MergeAdd<phi::GPUContext, T> merge_func;
    auto grad_merge = merge_func(context, grad);
    auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
-   paddle::framework::Vector<int64_t> merge_rows(grad_merge.rows());
+   phi::Vector<int64_t> merge_rows(grad_merge.rows());
    // 2. m += g_m * g_m
    auto grad_square =
        SquareSelectedRows<phi::GPUContext, T>(context, grad_merge);
...
@@ -104,7 +104,7 @@ struct SparseAdagradFunctor<phi::GPUContext, T> {
    const int block_size = 256;
    dim3 threads(block_size, 1);
    dim3 grid2(1, merge_rows.size());
-   paddle::framework::MixVector<int64_t> mixv_merge_rows(&merge_rows);
+   phi::MixVector<int64_t> mixv_merge_rows(&merge_rows);
    SparseAdagradFunctorKernel<T, 256>
        <<<grid2,
           threads,
...
paddle/phi/kernels/gpu/edit_distance_kernel.cu (view file @ 35d7d1f0)
...
@@ -87,8 +87,8 @@ void EditDistanceKernel(const Context& ctx,
   auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
-  paddle::framework::Vector<size_t> hyp_lod(batch_size + 1);
-  paddle::framework::Vector<size_t> ref_lod(batch_size + 1);
+  phi::Vector<size_t> hyp_lod(batch_size + 1);
+  phi::Vector<size_t> ref_lod(batch_size + 1);
   bool use_length = hypslength.get_ptr() != nullptr;
...
paddle/phi/kernels/gpu/embedding_grad_kernel.cu (view file @ 35d7d1f0)
...
@@ -14,12 +14,12 @@
 #include "paddle/phi/kernels/embedding_grad_kernel.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/embedding_util.h"
...
@@ -173,11 +173,11 @@ struct EmbeddingSparseGradCUDAFunctor {
    dim3 threads(128, 8);
    dim3 grids(8, 1);
    auto stream = dev_ctx_.stream();
-   paddle::framework::Vector<int64_t> new_rows;
+   phi::Vector<int64_t> new_rows;
    new_rows.resize(ids_num);
    auto gpu_place = dev_ctx_.GetPlace();
-   paddle::framework::MixVector<int64_t> mixv_new_rows(&new_rows);
+   phi::MixVector<int64_t> mixv_new_rows(&new_rows);
    if (!std::is_same<IdT, int64_t>::value) {
      InputTypeConvert<<<grids, threads, 0, stream>>>(
          ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
...
paddle/phi/kernels/gpu/sgd_kernel.cu (view file @ 35d7d1f0)
...
@@ -14,12 +14,12 @@
 #include "paddle/phi/kernels/sgd_kernel.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_helper.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/mixed_vector.h"
 namespace phi {
...
@@ -156,7 +156,7 @@ void SGDDenseParamSparseGradKernel(
   int thread_x = kThreadsPerBlock;
   int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
   int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
-  paddle::framework::MixVector<int64_t> mixv_in_rows(&in_rows);
+  phi::MixVector<int64_t> mixv_in_rows(&in_rows);
   SparseSGDFunctorKernel<<<max_blocks, thread_x, 0, dev_ctx.stream()>>>(
      in_data,
      mixv_in_rows.CUDAData(dev_ctx.GetPlace()),
...
paddle/phi/kernels/impl/momentum_kernel_impl.h (view file @ 35d7d1f0)
...
@@ -551,7 +551,7 @@ void MomentumSparseImpl(const Context& ctx,
   merge_func(ctx, grad, merged_grad);
   auto* grad_merge_rows = merged_grad->mutable_rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
+  phi::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
   const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
   int64_t row_numel = merged_grad->value().numel() / merged_grad->rows().size();
   funcs::ForRange<Context> for_range(ctx, param.numel());
...
paddle/phi/kernels/impl/rmsprop_kernel_impl.h (view file @ 35d7d1f0)
...
@@ -309,7 +309,7 @@ void RmspropSparseKernel(const Context &ctx,
   funcs::ForRange<Context> for_range(ctx, limit);
   auto &grad_merge_rows = merged_grad->rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(&grad_merge_rows);
+  phi::MixVector<int64_t> mixv_grad_merge_rows(&grad_merge_rows);
   const int64_t *rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
   auto &merged_tensor = merged_grad->value();
...
paddle/phi/kernels/impl/warpctc_kernel_impl.h (view file @ 35d7d1f0)
...
@@ -236,8 +236,8 @@ void WarpctcKernel(const Context& dev_ctx,
                   DenseTensor* loss,
                   DenseTensor* warpctcgrad) {
   size_t num_sequences, sequence_width, max_sequence_length;
-  paddle::framework::Vector<size_t> logits_lod;
-  paddle::framework::Vector<size_t> label_lod;
+  phi::Vector<size_t> logits_lod;
+  phi::Vector<size_t> label_lod;
   if (logits_length.is_initialized() && labels_length.is_initialized()) {
    num_sequences = logits.dims()[1];
    sequence_width = logits.dims()[2];
...
@@ -397,7 +397,7 @@ void WarpctcKernel(const Context& dev_ctx,
          paddle::operators::math::TotalSequenceLength(label_lod)),
      1});
   dev_ctx.template HostAlloc<int>(&warpctc_label);
-  std::vector<paddle::framework::Vector<size_t>> lod;
+  std::vector<phi::Vector<size_t>> lod;
   lod.push_back(label_lod);
   warpctc_label.set_lod(lod);
...
paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc (view file @ 35d7d1f0)
...
@@ -126,7 +126,7 @@ void AdamDenseParamSparseGradKernel(
   auto& grad_tensor = grad_merge.value();
   const T* grad_data = grad_tensor.template data<T>();
   auto* grad_merge_rows = &grad_merge.rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
+  phi::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
   const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
   auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
...
paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu (view file @ 35d7d1f0)
...
@@ -198,7 +198,7 @@ void AdamDenseParamSparseGradKernel(
   auto& grad_tensor = grad_merge.value();
   const T* grad_data = grad_tensor.template data<T>();
   auto* grad_merge_rows = &grad_merge.rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
+  phi::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
   const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
   auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
...
paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu (view file @ 35d7d1f0)
...
@@ -222,7 +222,7 @@ void AdamwDenseParamSparseGradKernel(
   auto& grad_tensor = grad_merge.value();
   const T* grad_data = grad_tensor.template data<T>();
   auto* grad_merge_rows = &grad_merge.rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
+  phi::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
   const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
   auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
...
paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc (view file @ 35d7d1f0)
...
@@ -14,9 +14,9 @@
 #include "paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/kernels/cpu/hsigmoid_loss_grad.h"
 namespace phi {
...
@@ -54,7 +54,7 @@ void HSigmoidLossGradKernel(const Context& ctx,
   PADDLE_ENFORCE_NOT_NULL(
      path.get_ptr(),
      errors::NotFound("Custom tree must be set for sparse mode!"));
-  paddle::framework::Vector<int64_t> real_rows = PathToRows(*path);
+  phi::Vector<int64_t> real_rows = PathToRows(*path);
   w_grad->set_rows(real_rows);
   // Build a map of id -> row_index to speed up finding the index of one id
   w_grad->set_height(w.dims()[0]);
...
paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h (view file @ 35d7d1f0)
...
@@ -221,7 +221,7 @@ void ComputeRowImpl(const Context& dev_ctx,
   auto& grad_tensor = grad_merge.value();
   const T* grad_data = grad_tensor.template data<T>();
   auto* grad_merge_rows = &grad_merge.rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
+  phi::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
   const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
   auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
   if (paddle::platform::is_gpu_place(dev_ctx.GetPlace()) &&
...
paddle/phi/tests/core/CMakeLists.txt (view file @ 35d7d1f0)
...
@@ -70,3 +70,20 @@ cc_test(
   test_tensor_array
   SRCS test_tensor_array.cc
   DEPS tensor_array)
+if(WITH_GPU)
+  nv_test(
+    test_mixed_vector
+    SRCS test_mixed_vector.cc test_mixed_vector.cu
+    DEPS mixed_vector place memory device_context tensor)
+elseif(WITH_ROCM)
+  hip_test(
+    test_mixed_vector
+    SRCS test_mixed_vector.cc test_mixed_vector.cu
+    DEPS mixed_vector place memory device_context tensor)
+else()
+  cc_test(
+    test_mixed_vector
+    SRCS test_mixed_vector.cc
+    DEPS mixed_vector place memory device_context tensor)
+endif()
paddle/fluid/framework/mixed_vector_test.cc → paddle/phi/tests/core/test_mixed_vector.cc (view file @ 35d7d1f0)
...
@@ -12,7 +12,7 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include "glog/logging.h"
 #include "gtest/gtest-message.h"
...
@@ -21,7 +21,7 @@
 #include "gtest/gtest_pred_impl.h"
 template <typename T>
-using vec = paddle::framework::Vector<T>;
+using vec = phi::Vector<T>;
 TEST(mixed_vector, CPU_VECTOR) {
   vec<int> tmp;
...
@@ -44,7 +44,7 @@ TEST(mixed_vector, CPU_VECTOR) {
 }
 TEST(mixed_vector, InitWithCount) {
-  paddle::framework::Vector<int> vec(10, 10);
+  phi::Vector<int> vec(10, 10);
   for (int i = 0; i < 10; ++i) {
    ASSERT_EQ(vec[i], 10);
   }
...
@@ -58,7 +58,7 @@ TEST(mixed_vector, ForEach) {
 }
 TEST(mixed_vector, Reserve) {
-  paddle::framework::Vector<int> vec;
+  phi::Vector<int> vec;
   vec.reserve(1);
   vec.push_back(0);
   vec.push_back(0);
...
@@ -66,7 +66,7 @@ TEST(mixed_vector, Resize) {
 }
 TEST(mixed_vector, Resize) {
-  paddle::framework::Vector<int> vec;
+  phi::Vector<int> vec;
   vec.resize(1);
   vec.push_back(0);
   vec.push_back(0);
...
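In the same spirit as the renamed tests above, an extra check one could add against phi::Vector; this is a sketch, not part of the commit:

#include "gtest/gtest.h"
#include "paddle/phi/core/mixed_vector.h"

TEST(mixed_vector, SketchPushAndRead) {
  phi::Vector<int> v;  // alias of std::vector<int>
  for (int i = 0; i < 4; ++i) v.push_back(i * 2);
  ASSERT_EQ(v.size(), 4UL);
  EXPECT_EQ(v[3], 6);
}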
paddle/fluid/framework/mixed_vector_test.cu → paddle/phi/tests/core/test_mixed_vector.cu (view file @ 35d7d1f0)
...
@@ -23,13 +23,14 @@
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/mixed_vector.h"
 template <typename T>
-using vec = paddle::framework::MixVector<T>;
-using gpuStream_t = paddle::gpuStream_t;
+using vec = phi::MixVector<T>;
+using gpuStream_t = phi::gpuStream_t;
 static __global__ void multiply_10(int* ptr) {
   for (int i = 0; i < 10; ++i) {
...
@@ -37,9 +38,9 @@ static __global__ void multiply_10(int* ptr) {
   }
 }
-gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
+gpuStream_t GetCUDAStream(phi::GPUPlace place) {
   return reinterpret_cast<const phi::GPUContext*>(
-             paddle::platform::DeviceContextPool::Instance().Get(place))
+             phi::DeviceContextPool::Instance().Get(place))
      ->stream();
 }
...
@@ -50,7 +51,7 @@ TEST(mixed_vector, GPU_VECTOR) {
   }
   vec<int> tmp(&x);
   ASSERT_EQ(tmp.size(), 10UL);
-  paddle::platform::CUDAPlace gpu(0);
+  phi::GPUPlace gpu(0);
 #ifdef PADDLE_WITH_HIP
   hipLaunchKernelGGL(multiply_10,
...
@@ -69,7 +70,7 @@ TEST(mixed_vector, GPU_VECTOR) {
 }
 TEST(mixed_vector, MultiGPU) {
-  if (paddle::platform::GetGPUDeviceCount() < 2) {
+  if (phi::backends::gpu::GetGPUDeviceCount() < 2) {
    LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple "
                    "GPUs in your machine.";
    return;
...
@@ -81,8 +82,8 @@ TEST(mixed_vector, MultiGPU) {
   }
   vec<int> tmp(&x);
   ASSERT_EQ(tmp.size(), 10UL);
-  paddle::platform::CUDAPlace gpu0(0);
-  paddle::platform::SetDeviceId(0);
+  phi::GPUPlace gpu0(0);
+  phi::backends::gpu::SetDeviceId(0);
 #ifdef PADDLE_WITH_HIP
   hipLaunchKernelGGL(multiply_10,
...
@@ -94,9 +95,9 @@ TEST(mixed_vector, MultiGPU) {
 #else
   multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
 #endif
-  paddle::platform::CUDAPlace gpu1(1);
+  phi::GPUPlace gpu1(1);
   auto* gpu1_ptr = tmp.MutableData(gpu1);
-  paddle::platform::SetDeviceId(1);
+  phi::backends::gpu::SetDeviceId(1);
 #ifdef PADDLE_WITH_HIP
   hipLaunchKernelGGL(
...
tools/parallel_UT_rule.py (view file @ 35d7d1f0)
...
@@ -913,7 +913,7 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [
     'test_mix_precision_all_reduce_fuse',
     'test_spp_op',
     'test_op_converter',
-    'mixed_vector_test',
+    'test_mixed_vector',
     'test_roi_align_op',
     'test_pad_constant_like',
     'test_mul_op',
...
@@ -2288,7 +2288,7 @@ TETRAD_PARALLEL_JOB = [
     'device_context_test',
     'test_reference_count_pass_last_lived_ops',
     'copy_same_tensor_test',
-    'mixed_vector_test',
+    'test_mixed_vector',
     'op_registry_test',
     'test_prepare_op',
     'data_device_transform_test',
...