Commit 35d7d1f0 (unverified)

move mixed_vector (#50282)

Authored by Huang Jiyi on Feb 08, 2023; committed via GitHub on Feb 08, 2023.
Parent commit: e92e3aab
Showing 83 changed files with 299 additions and 305 deletions.
paddle/fluid/distributed/collective/reducer.cc (+3, -3)
paddle/fluid/distributed/ps/service/brpc_utils.cc (+1, -1)
paddle/fluid/distributed/test/brpc_utils_test.cc (+4, -4)
paddle/fluid/framework/CMakeLists.txt (+0, -20)
paddle/fluid/framework/data_feed.cc (+1, -1)
paddle/fluid/framework/data_type_test.cc (+1, -0)
paddle/fluid/framework/dlpack_tensor.cc (+1, -0)
paddle/fluid/framework/dlpack_tensor_test.cc (+1, -0)
paddle/fluid/framework/eigen_test.cc (+3, -2)
paddle/fluid/framework/fleet/heter_wrapper.cc (+2, -2)
paddle/fluid/framework/lod_tensor.h (+2, -2)
paddle/fluid/framework/lod_tensor_test.cu (+2, -2)
paddle/fluid/framework/tensor.h (+2, -2)
paddle/fluid/imperative/all_reduce.cc (+4, -4)
paddle/fluid/imperative/gloo_context.cc (+2, -2)
paddle/fluid/operators/assign_op_test.cc (+1, -1)
paddle/fluid/operators/ctc_align_op.cu (+1, -1)
paddle/fluid/operators/cvm_op.cu (+1, -1)
paddle/fluid/operators/detection/box_clip_op.cu (+1, -1)
paddle/fluid/operators/detection/collect_fpn_proposals_op.cu (+1, -1)
paddle/fluid/operators/detection/generate_proposals_op.cu (+1, -1)
paddle/fluid/operators/detection/target_assign_op.h (+2, -2)
paddle/fluid/operators/filter_by_instag_op.cu (+8, -8)
paddle/fluid/operators/filter_by_instag_op.h (+2, -2)
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h (+1, -1)
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu (+6, -6)
paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc (+2, -2)
paddle/fluid/operators/gru_op.cc (+1, -1)
paddle/fluid/operators/gru_op.cu.cc (+1, -1)
paddle/fluid/operators/gru_op.h (+2, -2)
paddle/fluid/operators/lookup_table_op.cu (+2, -2)
paddle/fluid/operators/lookup_table_v2_op.cu (+2, -2)
paddle/fluid/operators/lstm_op.h (+3, -3)
paddle/fluid/operators/lstmp_op.h (+3, -3)
paddle/fluid/operators/math/beam_search.cu (+2, -2)
paddle/fluid/operators/math/sequence_padding.cc (+1, -1)
paddle/fluid/operators/math/sequence_padding.cu (+2, -2)
paddle/fluid/operators/math/sequence_padding.h (+3, -3)
paddle/fluid/operators/math/sequence_pooling.cu (+2, -2)
paddle/fluid/operators/optimizers/ftrl_op.h (+1, -1)
paddle/fluid/operators/optimizers/sgd_op.cu (+1, -1)
paddle/fluid/operators/row_conv_op.cc (+2, -2)
paddle/fluid/operators/row_conv_op.cu (+4, -4)
paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu (+1, -1)
paddle/fluid/operators/sequence_ops/sequence_erase_op.cu (+1, -1)
paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu (+7, -8)
paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h (+16, -20)
paddle/fluid/operators/sequence_ops/sequence_expand_op.cu (+18, -19)
paddle/fluid/operators/sequence_ops/sequence_expand_op.h (+24, -28)
paddle/fluid/operators/sequence_ops/sequence_reverse_op.h (+1, -1)
paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu (+4, -4)
paddle/fluid/operators/sequence_ops/sequence_softmax_op.h (+7, -8)
paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h (+1, -1)
paddle/fluid/operators/shuffle_batch_op.h (+2, -2)
paddle/fluid/operators/tdm_child_op.h (+1, -1)
paddle/fluid/operators/tdm_sampler_op.h (+1, -1)
paddle/fluid/pybind/tensor.cc (+1, -1)
paddle/phi/core/CMakeLists.txt (+5, -0)
paddle/phi/core/mixed_vector.cc (+13, -16)
paddle/phi/core/mixed_vector.h (+27, -26)
paddle/phi/kernels/cpu/edit_distance_kernel.cc (+3, -3)
paddle/phi/kernels/funcs/selected_rows_functor.cc (+4, -4)
paddle/phi/kernels/funcs/selected_rows_functor.cu (+11, -11)
paddle/phi/kernels/funcs/sequence2batch.cc (+1, -1)
paddle/phi/kernels/funcs/sequence2batch.cu (+2, -2)
paddle/phi/kernels/funcs/sequence2batch.h (+1, -1)
paddle/phi/kernels/funcs/sequence_scale.cu (+1, -1)
paddle/phi/kernels/gpu/adagrad_kernel.cu (+2, -2)
paddle/phi/kernels/gpu/edit_distance_kernel.cu (+2, -2)
paddle/phi/kernels/gpu/embedding_grad_kernel.cu (+3, -3)
paddle/phi/kernels/gpu/sgd_kernel.cu (+2, -2)
paddle/phi/kernels/impl/momentum_kernel_impl.h (+1, -1)
paddle/phi/kernels/impl/rmsprop_kernel_impl.h (+1, -1)
paddle/phi/kernels/impl/warpctc_kernel_impl.h (+3, -3)
paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc (+1, -1)
paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu (+1, -1)
paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu (+1, -1)
paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc (+2, -2)
paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h (+1, -1)
paddle/phi/tests/core/CMakeLists.txt (+17, -0)
paddle/phi/tests/core/test_mixed_vector.cc (+5, -5)
paddle/phi/tests/core/test_mixed_vector.cu (+14, -13)
tools/parallel_UT_rule.py (+2, -2)
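Almost every hunk below applies the same mechanical substitution: the include "paddle/fluid/framework/mixed_vector.h" becomes "paddle/phi/core/mixed_vector.h", paddle::framework::Vector becomes phi::Vector, and paddle::framework::MixVector becomes phi::MixVector. A minimal before/after sketch of the pattern; the surrounding function, its name, and the place parameter type are illustrative and not taken from the diff, while the types, constructors, and the CUDAMutableData call are the ones shown in the hunks below.

// Before this commit (fluid spelling):
//   #include "paddle/fluid/framework/mixed_vector.h"
//   paddle::framework::Vector<size_t> offsets(n + 1, 0);
//   paddle::framework::MixVector<size_t> mix(&offsets);
//   size_t *dev_ptr = mix.CUDAMutableData(place);

// After this commit (phi spelling):
#include "paddle/phi/core/mixed_vector.h"

void ExampleUse(const phi::Place &place, size_t n) {  // hypothetical helper
  phi::Vector<size_t> offsets(n + 1, 0);              // host-visible LoD offsets
  phi::MixVector<size_t> mix(&offsets);               // CPU/GPU mirror view over the vector
  size_t *dev_ptr = mix.CUDAMutableData(place);       // device pointer handed to kernels
  (void)dev_ptr;
}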
paddle/fluid/distributed/collective/reducer.cc
@@ -1113,7 +1113,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
   const auto &rank_ = process_group_->GetRank();
   const auto &size_ = process_group_->GetSize();
-  framework::Vector<int64_t> rows_num_vector(size_);
+  phi::Vector<int64_t> rows_num_vector(size_);
   rows_num_vector[rank_] = static_cast<int64_t>(src_rows.size());
   Tensor rows_num_tensor = paddle::experimental::empty(
@@ -1183,7 +1183,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
   }
   process_group_->AllGather(in, out)->Synchronize();
-  framework::Vector<int64_t> dst_rows_vector(rows_num, 0);
+  phi::Vector<int64_t> dst_rows_vector(rows_num, 0);
   auto *dst_rows_dense_tensor =
       std::dynamic_pointer_cast<phi::DenseTensor>(dst_rows_tensor.impl()).get();
@@ -1262,7 +1262,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
   Tensor dst_rows_tensor =
       paddle::experimental::concat(rows_tensors, phi::Scalar(0));
-  framework::Vector<int64_t> dst_rows_vector(rows_num, 0);
+  phi::Vector<int64_t> dst_rows_vector(rows_num, 0);
   auto *dst_rows_dense_tensor =
       std::dynamic_pointer_cast<phi::DenseTensor>(dst_rows_tensor.impl()).get();

paddle/fluid/distributed/ps/service/brpc_utils.cc
@@ -236,7 +236,7 @@ void DeserializeLodTensor(framework::Variable* var,
   framework::LoD lod;
   for (int i = 0; i < msg.lod_level(); ++i) {
-    framework::Vector<size_t> v;
+    phi::Vector<size_t> v;
     for (int j = 0; j < msg.lod(i).lod_data_size(); ++j) {
       v.push_back(msg.lod(i).lod_data(j));
     }

paddle/fluid/distributed/test/brpc_utils_test.cc
@@ -39,7 +39,7 @@ void CreateVarsOnScope(framework::Scope* scope,
   auto* tensor1 = var1->GetMutable<phi::DenseTensor>();
   tensor1->Resize(phi::make_ddim({512, 8, 4, 2}));
   framework::LoD lod1;
-  lod1.push_back(framework::Vector<size_t>({1, 3, 8}));
+  lod1.push_back(phi::Vector<size_t>({1, 3, 8}));
   tensor1->set_lod(lod1);
   tensor1->mutable_data<float>(*place);
   phi::funcs::set_constant(ctx, tensor1, 31.9);
@@ -49,7 +49,7 @@ void CreateVarsOnScope(framework::Scope* scope,
   auto* tensor2 = var2->GetMutable<phi::DenseTensor>();
   tensor2->Resize(phi::make_ddim({1000, 64}));
   framework::LoD lod2;
-  lod2.push_back(framework::Vector<size_t>({1, 1}));
+  lod2.push_back(phi::Vector<size_t>({1, 1}));
   tensor2->set_lod(lod2);
   tensor2->mutable_data<int>(*place);
   phi::funcs::set_constant(ctx, tensor2, 100);
@@ -98,7 +98,7 @@ void RunMultiVarMsg(platform::Place place) {
   framework::Variable* var1 = scope_recv.FindVar("x1");
   auto* tensor1 = var1->GetMutable<phi::DenseTensor>();
   EXPECT_EQ(tensor1->dims(), phi::make_ddim({512, 8, 4, 2}));
-  // EXPECT_EQ(tensor1->lod(), framework::Vector<size_t>({1, 3, 8}));
+  // EXPECT_EQ(tensor1->lod(), phi::Vector<size_t>({1, 3, 8}));
   auto* tensor_data1 = const_cast<float*>(tensor1->data<float>());
   int tensor_numel1 = 512 * 8 * 4 * 2;
   for (int i = 0; i < tensor_numel1; ++i)
@@ -108,7 +108,7 @@ void RunMultiVarMsg(platform::Place place) {
   framework::Variable* var2 = scope_recv.FindVar("x2");
   auto* tensor2 = var2->GetMutable<phi::DenseTensor>();
   EXPECT_EQ(tensor2->dims(), phi::make_ddim({1000, 64}));
-  // EXPECT_EQ(tensor2->lod(), framework::Vector<size_t>({1, 1}));
+  // EXPECT_EQ(tensor2->lod(), phi::Vector<size_t>({1, 1}));
   auto* tensor_data2 = const_cast<int*>(tensor2->data<int>());
   int tensor_numel2 = 1000 * 64;
   for (int i = 0; i < tensor_numel2; ++i)
     EXPECT_EQ(tensor_data2[i], 100);

paddle/fluid/framework/CMakeLists.txt
@@ -162,27 +162,7 @@ cc_test(
   eigen_test
   SRCS eigen_test.cc
   DEPS tensor)

-cc_library(
-  mixed_vector
-  SRCS mixed_vector.cc
-  DEPS device_context place memory)
-if(WITH_GPU)
-  nv_test(
-    mixed_vector_test
-    SRCS mixed_vector_test.cc mixed_vector_test.cu
-    DEPS mixed_vector place memory device_context tensor)
-elseif(WITH_ROCM)
-  hip_test(
-    mixed_vector_test
-    SRCS mixed_vector_test.cc mixed_vector_test.cu
-    DEPS mixed_vector place memory device_context tensor)
-else()
-  cc_test(
-    mixed_vector_test
-    SRCS mixed_vector_test.cc
-    DEPS mixed_vector place memory device_context tensor)
-endif()
 cc_library(
   lod_tensor
   SRCS lod_tensor.cc

paddle/fluid/framework/data_feed.cc
@@ -2815,7 +2815,7 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) {
     LoD& lod = (*feed->mutable_lod());
     lod.resize(1);
     lod[0].resize(offset_cols_size);
-    paddle::framework::MixVector<size_t> mixv_lod(&lod[0]);
+    phi::MixVector<size_t> mixv_lod(&lod[0]);
     memcpy(mixv_lod.MutableData(platform::CPUPlace()),
            off_start_ptr,
            offset_cols_size * sizeof(size_t));

paddle/fluid/framework/data_type_test.cc
@@ -18,6 +18,7 @@
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/place.h"

 TEST(DataType, float16) {
   using paddle::platform::CPUPlace;

paddle/fluid/framework/dlpack_tensor.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/platform/place.h"

 namespace paddle {
 namespace framework {

paddle/fluid/framework/dlpack_tensor_test.cc
@@ -18,6 +18,7 @@
 #include <gtest/gtest.h>
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/place.h"

 namespace paddle {
 namespace framework {

paddle/fluid/framework/eigen_test.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/eigen.h"
 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/phi/core/ddim.h"

 namespace paddle {

paddle/fluid/framework/fleet/heter_wrapper.cc
@@ -158,7 +158,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
   LoD lod;
   for (int i = 0; i < req_var.lod_level(); ++i) {
-    framework::Vector<size_t> v;
+    phi::Vector<size_t> v;
     for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
       v.push_back(req_var.lod(i).lod_data(j));
     }
@@ -203,7 +203,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
   LoD lod;
   for (int i = 0; i < req_var.lod_level(); ++i) {
-    framework::Vector<size_t> v;
+    phi::Vector<size_t> v;
     for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
       v.push_back(req_var.lod(i).lod_data(j));
     }

paddle/fluid/framework/lod_tensor.h
@@ -21,12 +21,12 @@ limitations under the License. */
 #include <utility>
 #include <vector>

-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/mixed_vector.h"

 namespace paddle {
 namespace framework {
@@ -54,7 +54,7 @@ void MergeLoDTensor(phi::DenseTensor* target,
  * 0 2 4 7
  * 0 2 5 7 10 12 15 20
  */
-using LoD = std::vector<Vector<size_t>>;
+using LoD = std::vector<phi::Vector<size_t>>;

 std::string LoDToString(const LoD& lod);

paddle/fluid/framework/lod_tensor_test.cu
@@ -31,7 +31,7 @@ TEST(LoD, data) {
   lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));

   auto& v = lod[0];
-  paddle::framework::MixVector<size_t> mix_vector_v(&v);
+  phi::MixVector<size_t> mix_vector_v(&v);
   paddle::platform::CUDAPlace gpu(0);
 #ifdef PADDLE_WITH_HIP
   hipLaunchKernelGGL(test,
@@ -69,7 +69,7 @@ TEST(DenseTensor, LoDInGPU) {
   EXPECT_EQ(lod_tensor.lod_element(0, 4).first, 8UL);

   auto lod = lod_tensor.lod();
-  paddle::framework::MixVector<size_t> mix_vector(&(lod[0]));
+  phi::MixVector<size_t> mix_vector(&(lod[0]));
 #ifdef PADDLE_WITH_HIP
   hipLaunchKernelGGL(test,

paddle/fluid/framework/tensor.h
@@ -15,15 +15,15 @@ limitations under the License. */
 #pragma once

 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/core/sparse_csr_tensor.h"

 namespace paddle {
 namespace framework {

-using LoD = std::vector<paddle::framework::Vector<size_t>>;
+using LoD = std::vector<phi::Vector<size_t>>;

 }  // namespace framework
 }  // namespace paddle
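Since framework::LoD is now a vector of phi::Vector<size_t>, code that assembles level-of-detail (LoD) information by hand changes only in the element type. A small sketch mirroring the usage shown in brpc_utils_test.cc above; the helper name and include choices are illustrative, the individual calls are the ones in that test.

#include "paddle/fluid/framework/tensor.h"       // framework::LoD typedef shown above
#include "paddle/phi/core/mixed_vector.h"

void AttachExampleLoD(phi::DenseTensor *tensor) {  // hypothetical helper
  paddle::framework::LoD lod;                      // std::vector<phi::Vector<size_t>>
  lod.push_back(phi::Vector<size_t>({1, 3, 8}));   // one LoD level, as in the test
  tensor->set_lod(lod);
}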
paddle/fluid/imperative/all_reduce.cc
@@ -104,10 +104,10 @@ static void AllReduce(const phi::SelectedRows &src,
   // 1. Gather rows number from all workers. Here use ncclAllGather to do this,
   // but we can use other ways to implement is in the future
   const auto &src_rows = src.rows();
-  framework::Vector<int64_t> rows_num_vector(strategy.nranks_);
+  phi::Vector<int64_t> rows_num_vector(strategy.nranks_);
   rows_num_vector[strategy.local_rank_] = static_cast<int64_t>(src_rows.size());
   // CUDAMutableData use CalStream
-  paddle::framework::MixVector<int64_t> mixv_rows_num_vector(&rows_num_vector);
+  phi::MixVector<int64_t> mixv_rows_num_vector(&rows_num_vector);
   auto *gpu_rows_num_ptr = mixv_rows_num_vector.CUDAMutableData(place);
   VLOG(4) << "start dev_ctx->wait";
   if (!use_calc_stream) {
@@ -138,9 +138,9 @@ static void AllReduce(const phi::SelectedRows &src,
   auto *dst_rows = dst->mutable_rows();
   dst_rows->resize(rows_num);
-  paddle::framework::MixVector<int64_t> mixv_dst_rows(dst_rows);
+  phi::MixVector<int64_t> mixv_dst_rows(dst_rows);
   auto *dst_rows_ptr = mixv_dst_rows.CUDAMutableData(place);
-  paddle::framework::MixVector<int64_t> mixv_src_rows(&src_rows);
+  phi::MixVector<int64_t> mixv_src_rows(&src_rows);
   const auto *src_rows_ptr = mixv_src_rows.CUDAData(place);

   auto *dst_tensor = dst->mutable_value();

paddle/fluid/imperative/gloo_context.cc
@@ -158,9 +158,9 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src,
           << ", height: " << src.height();
   auto *dst_rows = dst->mutable_rows();
   dst_rows->resize(rows_num);
-  paddle::framework::MixVector<int64_t> mixv_dst_rows(dst_rows);
+  phi::MixVector<int64_t> mixv_dst_rows(dst_rows);
   auto *dst_rows_ptr = mixv_dst_rows.MutableData(place);
-  paddle::framework::MixVector<int64_t> mixv_src_rows(&src_rows);
+  phi::MixVector<int64_t> mixv_src_rows(&src_rows);
   const int64_t *src_rows_ptr = mixv_src_rows.Data(place);

   auto *dst_tensor = dst->mutable_value();

paddle/fluid/operators/assign_op_test.cc
@@ -98,7 +98,7 @@ TEST(AssignOp, AssignSelectedRows) {
   assign_functor(input);

   auto &out_selected_row = output.Get<phi::SelectedRows>();
-  const paddle::framework::Vector<int64_t>& out_rows = out_selected_row.rows();
+  const phi::Vector<int64_t>& out_rows = out_selected_row.rows();
   EXPECT_EQ(rows.size(), out_rows.size());
   for (size_t i = 0; i < rows.size(); ++i) {
     EXPECT_EQ(rows[i], out_rows[i]);

paddle/fluid/operators/ctc_align_op.cu
@@ -129,7 +129,7 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
       // merge elements and delete blank
       T* output_data = output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
-      paddle::framework::MixVector<size_t> mixv_input_lod(&input_lod[level]);
+      phi::MixVector<size_t> mixv_input_lod(&input_lod[level]);
       MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(num_tokens,
                                                     tokens,

paddle/fluid/operators/cvm_op.cu
@@ -166,7 +166,7 @@ class CVMGradCUDAKernel : public framework::OpKernel<T> {
         lod[lod.size() - 1],
         platform::errors::PreconditionNotMet(
             "Output(X@GRAD)'s dim[0] must be equal to last element of lod"));
-    paddle::framework::MixVector<size_t> mixv_lod(&lod);
+    phi::MixVector<size_t> mixv_lod(&lod);
     CvmGradComputeKernel<<<(dx_numel + PADDLE_CUDA_NUM_THREADS - 1) /
                                PADDLE_CUDA_NUM_THREADS,
                            PADDLE_CUDA_NUM_THREADS,

paddle/fluid/operators/detection/box_clip_op.cu
@@ -59,7 +59,7 @@ class GPUBoxClipKernel : public framework::OpKernel<T> {
     auto stream = dev_ctx.stream();
     const size_t batch_size = lod.back().size() - 1;
     T *output_data = output->mutable_data<T>(dev_ctx.GetPlace());
-    paddle::framework::MixVector<size_t> mix_vector(&abs_offset_lod[0]);
+    phi::MixVector<size_t> mix_vector(&abs_offset_lod[0]);
     GPUBoxClip<T, 512><<<batch_size, 512, 0, stream>>>(
         input->data<T>(),
         mix_vector.CUDAMutableData(dev_ctx.GetPlace()),

paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -19,7 +19,6 @@ namespace cub = hipcub;
 #include <paddle/fluid/memory/allocation/allocator.h>

-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
@@ -28,6 +27,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"

 namespace paddle {

paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -18,10 +18,10 @@ limitations under the License. */
 #include <string>
 #include <vector>

-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/detection/bbox_util.cu.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

paddle/fluid/operators/detection/target_assign_op.h
@@ -121,7 +121,7 @@ class TargetAssignKernel : public framework::OpKernel<T> {
     auto x_lod = x->lod().back();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    paddle::framework::MixVector<size_t> mixv_x_lod(&x_lod);
+    phi::MixVector<size_t> mixv_x_lod(&x_lod);
     size_t* x_lod_data = mixv_x_lod.MutableData(ctx.GetPlace());
 #else
     size_t* x_lod_data = x_lod.data();
@@ -155,7 +155,7 @@ class TargetAssignKernel : public framework::OpKernel<T> {
       const int* neg_idx_data = neg_indices->data<int>();
       auto neg_lod = neg_indices->lod().back();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      paddle::framework::MixVector<size_t> mixv_neg_lod(&neg_lod);
+      phi::MixVector<size_t> mixv_neg_lod(&neg_lod);
       size_t* neg_lod_data = mixv_neg_lod.MutableData(ctx.GetPlace());
 #else
       size_t* neg_lod_data = neg_lod.data();

paddle/fluid/operators/filter_by_instag_op.cu
@@ -30,11 +30,11 @@
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/core/mixed_vector.h"

 #if defined(PADDLE_WITH_CUDA)
 namespace cg = cooperative_groups;
@@ -46,7 +46,7 @@ namespace operators {
 using SelectedRows = phi::SelectedRows;

 template <typename T>
-using Vector = framework::Vector<T>;
+using Vector = phi::Vector<T>;

 #define WARP_SIZE 32
 #define MAX_WARP_NUM 32
@@ -376,7 +376,7 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
     }

     const size_t x2_lods_size = x2_lods.size() - 1;
-    paddle::framework::MixVector<size_t> mixv_x2_lods(&x2_lods);
+    phi::MixVector<size_t> mixv_x2_lods(&x2_lods);
     size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place);
@@ -401,7 +401,7 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
       }
     }
-    paddle::framework::MixVector<size_t> mixv_x1_lods(&x1_lods);
+    phi::MixVector<size_t> mixv_x1_lods(&x1_lods);
     size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place);
     auto* x1_data = x1->data<T>();
@@ -433,12 +433,12 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
     Vector<size_t> out_lods(x2_lods_size + 1, 0);
     Vector<size_t> map_lods(x2_lods_size + 1, 0);
-    paddle::framework::MixVector<size_t> mixv_out_lods(&out_lods);
-    paddle::framework::MixVector<size_t> mixv_map_lods(&map_lods);
+    phi::MixVector<size_t> mixv_out_lods(&out_lods);
+    phi::MixVector<size_t> mixv_map_lods(&map_lods);
     // thrust::device_vector<size_t> out_idx(1);
     Vector<size_t> out_idx(1, 0);
-    paddle::framework::MixVector<size_t> mixv_out_idx(&out_idx);
+    phi::MixVector<size_t> mixv_out_idx(&out_idx);
     size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place);
     size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place);
@@ -500,7 +500,7 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
     } else {
       Vector<size_t> map_lods(2, 0);
-      paddle::framework::MixVector<size_t> mixv_map_lods(&map_lods);
+      phi::MixVector<size_t> mixv_map_lods(&map_lods);
       thrust::device_ptr<int64_t> map_data_ptr(map_data);
       map_data_ptr[0] = 0;

paddle/fluid/operators/filter_by_instag_op.h
@@ -23,16 +23,16 @@
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/core/mixed_vector.h"

 namespace paddle {
 namespace operators {

 using SelectedRows = phi::SelectedRows;

 template <typename T>
-using Vector = framework::Vector<T>;
+using Vector = phi::Vector<T>;

 template <typename T>
 class FilterByInstagKernel : public framework::OpKernel<T> {

paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@@ -256,7 +256,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
       auto lod = ids->lod()[0];
       int64_t out_width = d_output->dims()[1];

-      framework::Vector<int64_t> *new_rows = d_table->mutable_rows();
+      phi::Vector<int64_t> *new_rows = d_table->mutable_rows();
       new_rows->resize(ids_num);
       std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t));

paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
@@ -14,16 +14,16 @@
 #include <string>

-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/operators/fused/fused_seqpool_cvm_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/phi/core/mixed_vector.h"

 namespace paddle {
 namespace operators {

 template <typename T>
-using Vector = framework::Vector<T>;
+using Vector = phi::Vector<T>;

 #define CUDA_KERNEL_LOOP(i, n)                                  \
   for (auto i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
@@ -441,7 +441,7 @@ class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel<T> {
     int embedding_size = inputs[0]->numel() / inputs[0]->dims()[0];
     int batch_size = -1;
-    std::vector<paddle::framework::MixVector<size_t> *> mix_lods_v(slot_size);
+    std::vector<phi::MixVector<size_t> *> mix_lods_v(slot_size);

     for (size_t i = 0; i < slot_size; ++i) {
       const auto *input = inputs[i];
@@ -480,7 +480,7 @@ class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel<T> {
       }
       output_data[i] = reinterpret_cast<T *>(
           dev_ctx.Alloc<T>(output, output->numel() * sizeof(T)));
-      mix_lods_v[i] = new paddle::framework::MixVector<size_t>(&lods);
+      mix_lods_v[i] = new phi::MixVector<size_t>(&lods);
       lods_data[i] = mix_lods_v[i]->CUDAData(ctx.GetPlace());
       seqpool_outputs[i].Resize({batch_size, embedding_size});
       seqpool_output_data[i] = reinterpret_cast<T *>(dev_ctx.Alloc<T>(
@@ -527,7 +527,7 @@ class FusedSeqpoolCVMGradCUDAKernel : public framework::OpKernel<T> {
     int embedding_size = in_grads[0]->numel() / in_grads[0]->dims()[0];
     int batch_size = -1;
-    std::vector<paddle::framework::MixVector<size_t> *> mix_lods_v(slot_size);
+    std::vector<phi::MixVector<size_t> *> mix_lods_v(slot_size);

     for (size_t i = 0; i < slot_size; ++i) {
       auto *in_grad = in_grads[i];
@@ -563,7 +563,7 @@ class FusedSeqpoolCVMGradCUDAKernel : public framework::OpKernel<T> {
       in_grads_data[i] = reinterpret_cast<T *>(
           dev_ctx.Alloc<T>(in_grad, in_grad->numel() * sizeof(T)));
-      mix_lods_v[i] = new paddle::framework::MixVector<size_t>(&lods);
+      mix_lods_v[i] = new phi::MixVector<size_t>(&lods);
       lods_data[i] = mix_lods_v[i]->CUDAData(ctx.GetPlace());
       cvm_data[i] = reinterpret_cast<const T *>(cvm->data<T>());
     }

paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc
@@ -17,11 +17,11 @@ limitations under the License. */
 #include <memory>

 #include "dnnl.hpp"  // NOLINT
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/fused/multi_gru_op.h"
 #include "paddle/phi/backends/onednn/onednn_reuse.h"
+#include "paddle/phi/core/mixed_vector.h"

 namespace paddle {
 namespace operators {
@@ -678,7 +678,7 @@ class MultiGRUHandler {
   const std::vector<const phi::DenseTensor*> biases_;
   phi::DenseTensor* hidden_;
   std::vector<dnnl::primitive_attr> attrs_;
-  const paddle::framework::Vector<size_t>& x_lod_;
+  const phi::Vector<size_t>& x_lod_;
 };

 template <typename T>

paddle/fluid/operators/gru_op.cc
@@ -372,7 +372,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
     phi::DenseTensor ordered_h0;

-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);

     if (h0) {
       // Since the batch computing for GRU reorders the input sequences

paddle/fluid/operators/gru_op.cu.cc
@@ -75,7 +75,7 @@ class GRUKernel : public framework::OpKernel<T> {
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
     phi::DenseTensor ordered_h0;

-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);

     if (h0) {
       // Since the batch computing for GRU reorders the input sequences

paddle/fluid/operators/gru_op.h
@@ -28,7 +28,7 @@ namespace operators {
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
                              const phi::DenseTensor& src,
-                             framework::Vector<size_t> index_lod,
+                             phi::Vector<size_t> index_lod,
                              phi::DenseTensor* dst,
                              bool indexed_src) {
   phi::funcs::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
@@ -79,7 +79,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
     phi::DenseTensor ordered_h0, ordered_h0_grad;

-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);

     if (h0) {
       ReorderInitState<DeviceContext, T>(

paddle/fluid/operators/lookup_table_op.cu
@@ -169,12 +169,12 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
       auto stream = dev_ctx.stream();
       // copy GPU memory to CPU pinned memory
-      framework::Vector<int64_t> new_rows;
+      phi::Vector<int64_t> new_rows;
       new_rows.resize(ids_num);
       auto gpu_place = context.GetPlace();

       // TODO(yuyang18): Strange code here.
-      paddle::framework::MixVector<int64_t> mixv_new_rows(&new_rows);
+      phi::MixVector<int64_t> mixv_new_rows(&new_rows);
       memory::Copy(gpu_place,
                    mixv_new_rows.CUDAMutableData(context.GetPlace()),
                    gpu_place,

paddle/fluid/operators/lookup_table_v2_op.cu
@@ -159,11 +159,11 @@ struct LookupTableV2GradCUDAFunctor {
       dim3 threads(128, 8);
       dim3 grids(8, 1);
       auto stream = dev_ctx.stream();
-      framework::Vector<int64_t> new_rows;
+      phi::Vector<int64_t> new_rows;
       new_rows.resize(ids_num);
       auto gpu_place = context_.GetPlace();

-      paddle::framework::MixVector<int64_t> mixv_new_rows(&new_rows);
+      phi::MixVector<int64_t> mixv_new_rows(&new_rows);
       if (!std::is_same<IdT, int64_t>::value) {
         InputTypeConvert<<<grids, threads, 0, stream>>>(
             ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));

paddle/fluid/operators/lstm_op.h
@@ -27,7 +27,7 @@ namespace operators {
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
                              const phi::DenseTensor& src,
-                             framework::Vector<size_t> index_lod,
+                             phi::Vector<size_t> index_lod,
                              phi::DenseTensor* dst,
                              bool indexed_src) {
   phi::funcs::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
@@ -95,7 +95,7 @@ class LSTMKernel : public framework::OpKernel<T> {
     lstm_value.prev_state_value = nullptr;
     phi::DenseTensor ordered_c0;

-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);

     if (cell_t0) {
       // Since the batch computing for LSTM reorders the input sequence
@@ -236,7 +236,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
     // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
     // initialization.
     phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;

-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);

     if (c0) {
       ReorderInitState<DeviceContext, T>(

paddle/fluid/operators/lstmp_op.h
@@ -70,7 +70,7 @@ class _ClipGradFunctor {
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
                              const phi::DenseTensor& src,
-                             framework::Vector<size_t> index,
+                             phi::Vector<size_t> index,
                              phi::DenseTensor* dst,
                              bool indexed_src) {
   phi::funcs::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
@@ -158,7 +158,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
     phi::DenseTensor ordered_c0;
     phi::DenseTensor ordered_h0;

-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);

     if (cell_t0) {
       // Since the batch computing for LSTMP reorders the input sequence
@@ -350,7 +350,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     // initialization.
     phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;

-    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    phi::Vector<size_t> order(batch_gate->lod()[2]);

     if (c0) {
       ReorderInitState<DeviceContext, T>(

paddle/fluid/operators/math/beam_search.cu
@@ -446,8 +446,8 @@ class BeamSearchFunctor<phi::GPUContext, T> {
     framework::LoD selected_lod(2);
     selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end());
     selected_lod[1].resize(scores->dims()[0] + 1);
-    paddle::framework::MixVector<size_t> mix_vector(&selected_lod[1]);
-    paddle::framework::MixVector<size_t> mixv_abs(&abs_lod[level]);
+    phi::MixVector<size_t> mix_vector(&selected_lod[1]);
+    phi::MixVector<size_t> mixv_abs(&abs_lod[level]);
     size_t* selected_offsets = mix_vector.CUDAMutableData(context.GetPlace());

     if (num_seqs == 1) {

paddle/fluid/operators/math/sequence_padding.cc
@@ -28,7 +28,7 @@ namespace math {
 template <typename T>
 void CopyValidData(phi::DenseTensor* dst_tensor,
                    const phi::DenseTensor* src_tensor,
-                   const framework::Vector<size_t>& seq_offsets,
+                   const phi::Vector<size_t>& seq_offsets,
                    int pad_seq_len,
                    int step_width,
                    bool norm_by_len,

paddle/fluid/operators/math/sequence_padding.cu
@@ -124,7 +124,7 @@ class PaddingLoDTensorFunctor<phi::GPUContext, T> {
     T* pad_data = pad_tensor->data<T>();
     const T* pad_value_data = pad_value.data<T>();

-    paddle::framework::MixVector<size_t> mix_vector_seq_offsets(&seq_offsets);
+    phi::MixVector<size_t> mix_vector_seq_offsets(&seq_offsets);
     SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
         pad_data,
         seq_data,
@@ -191,7 +191,7 @@ class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
     const T* pad_data = pad_tensor.data<T>();
     T* seq_data = seq_tensor->data<T>();

-    paddle::framework::MixVector<size_t> mixv_seq_offsets(&seq_offsets);
+    phi::MixVector<size_t> mixv_seq_offsets(&seq_offsets);
     SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
         seq_data,
         pad_data,

paddle/fluid/operators/math/sequence_padding.h
@@ -29,7 +29,7 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth };
 enum CopyType { kSeqToPad, kPadToSeq };

 inline static size_t MaximumSequenceLength(
-    const framework::Vector<size_t>& seq_offset) {
+    const phi::Vector<size_t>& seq_offset) {
   size_t seq_num = seq_offset.size() - 1;
   size_t max_seq_len = 0;
   for (size_t i = 0; i < seq_num; ++i) {
@@ -39,7 +39,7 @@ inline static size_t MaximumSequenceLength(
 }

 inline static size_t TotalSequenceLength(
-    const framework::Vector<size_t>& seq_offset) {
+    const phi::Vector<size_t>& seq_offset) {
   size_t seq_num = seq_offset.size() - 1;
   size_t total_seq_len = 0;
   for (size_t i = 0; i < seq_num; ++i) {
@@ -50,7 +50,7 @@ inline static size_t TotalSequenceLength(
 inline static void CheckDims(const framework::DDim& seq_tensor_dims,
                              const framework::DDim& pad_tensor_dims,
-                             const framework::Vector<size_t>& seq_offset,
+                             const phi::Vector<size_t>& seq_offset,
                              int64_t padded_seq_len,
                              int64_t step_width,
                              const PadLayout& layout) {

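MaximumSequenceLength and TotalSequenceLength read an offset-style LoD vector (entry i is the cumulative end offset of sequence i, starting at 0), so switching the parameter type to phi::Vector<size_t> leaves their behavior unchanged. A small usage sketch, assuming the helpers live in paddle::operators::math as the hunk headers suggest; the function and values below are illustrative only.

#include "paddle/fluid/operators/math/sequence_padding.h"

void OffsetExample() {  // hypothetical usage
  // Three sequences of lengths 2, 3 and 4, stored as cumulative offsets.
  phi::Vector<size_t> seq_offset({0, 2, 5, 9});
  size_t longest = paddle::operators::math::MaximumSequenceLength(seq_offset);  // 4
  size_t total = paddle::operators::math::TotalSequenceLength(seq_offset);      // 9
  (void)longest;
  (void)total;
}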
paddle/fluid/operators/math/sequence_pooling.cu
@@ -203,7 +203,7 @@ class SequencePoolFunctor<phi::GPUContext, T> {
     const size_t item_dim = output->numel() / output->dims()[0];
     dim3 threads(1024, 1);
     dim3 grid(std::max(static_cast<int>(lod.size()) - 1, 1), 1);
-    paddle::framework::MixVector<size_t> mix_vector(&lod);
+    phi::MixVector<size_t> mix_vector(&lod);
     if (pooltype == "MAX") {
       sequence_pool_kernel<T, MaxPoolFunctor<T>>
           <<<grid, threads, 0, context.stream()>>>(
@@ -421,7 +421,7 @@ class SequencePoolGradFunctor<phi::GPUContext, T> {
     const size_t item_dim = in_grad->numel() / in_grad->dims()[0];
     dim3 threads(1024, 1);
     dim3 grid(std::max(static_cast<int>(lod.size()) - 1, 1), 1);
-    paddle::framework::MixVector<size_t> mix_vector(&lod);
+    phi::MixVector<size_t> mix_vector(&lod);
     if (pooltype == "MAX") {
       sequence_pool_grad_kernel<T, MaxPoolGradFunctor<T>>
           <<<grid, threads, 0, context.stream()>>>(

paddle/fluid/operators/optimizers/ftrl_op.h
@@ -197,7 +197,7 @@ class FTRLOpKernel : public framework::OpKernel<T> {
           ctx.template device_context<DeviceContext>(), *grad, merged_grad);
       auto* merged_rows = merged_grad->mutable_rows();
-      paddle::framework::MixVector<int64_t> mixv_merged_rows(merged_rows);
+      phi::MixVector<int64_t> mixv_merged_rows(merged_rows);
       const int64_t* rows = mixv_merged_rows.Data(ctx.GetPlace());
       auto row_numel = static_cast<int64_t>(merged_grad->value().dims()[1]);
       auto row_height = static_cast<int64_t>(merged_grad->rows().size());

paddle/fluid/operators/optimizers/sgd_op.cu
@@ -164,7 +164,7 @@ class SGDOpKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
       int thread_x = kThreadsPerBlock;
       int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount();
       int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
-      paddle::framework::MixVector<int64_t> mixv_in_rows(&in_rows);
+      phi::MixVector<int64_t> mixv_in_rows(&in_rows);
       SparseSGDFunctorKernel<<<max_blocks,
                                thread_x,
                                0,

paddle/fluid/operators/row_conv_op.cc
@@ -153,7 +153,7 @@ class RowConvKernel<phi::CPUContext, T> : public framework::OpKernel<T> {
     } else {
       batch_size = x->lod()[0].size() - 1;
     }
-    framework::Vector<size_t> batch_indices(batch_size + 1);
+    phi::Vector<size_t> batch_indices(batch_size + 1);
     int input_dim = 0;
     int timesteps = 0;
     if (is_tensor) {
@@ -231,7 +231,7 @@ class RowConvGradKernel<phi::CPUContext, T> : public framework::OpKernel<T> {
     } else {
       batch_size = x->lod()[0].size() - 1;
     }
-    framework::Vector<size_t> batch_indices(batch_size + 1);
+    phi::Vector<size_t> batch_indices(batch_size + 1);
     int timesteps = 0;
     int input_dim = 0;
     if (is_tensor) {

paddle/fluid/operators/row_conv_op.cu
@@ -338,7 +338,7 @@ class RowConvKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
       batch_size = X->lod()[0].size() - 1;
     }
     int input_dim = 0;
-    framework::Vector<size_t> batch_indices(batch_size + 1);
+    phi::Vector<size_t> batch_indices(batch_size + 1);
     int timesteps = X->dims()[1];
     if (is_tensor) {
       for (int i = 0; i < batch_size + 1; i++) {
@@ -352,7 +352,7 @@ class RowConvKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
-    paddle::framework::MixVector<size_t> mix_vector(&batch_indices);
+    phi::MixVector<size_t> mix_vector(&batch_indices);
     size_t *idx = mix_vector.CUDAMutableData(context.GetPlace());
     auto stream = context.cuda_device_context().stream();
@@ -397,7 +397,7 @@ class RowConvGradKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
     }
     int input_dim = 0;
-    framework::Vector<size_t> batch_indices(batch_size + 1);
+    phi::Vector<size_t> batch_indices(batch_size + 1);
     int timesteps = X->dims()[1];
     if (is_tensor) {
       for (int i = 0; i < batch_size + 1; i++) {
@@ -411,7 +411,7 @@ class RowConvGradKernel<phi::GPUContext, T> : public framework::OpKernel<T> {
     // int input_dim = X->dims()[1];
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
-    paddle::framework::MixVector<size_t> mixv_batch_indices(&batch_indices);
+    phi::MixVector<size_t> mixv_batch_indices(&batch_indices);
     size_t *idx = mixv_batch_indices.CUDAMutableData(context.GetPlace());
     auto &device_ctx = context.cuda_device_context();

paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
@@ -76,7 +76,7 @@ class SequenceEnumerateOpCUDAKernel : public framework::OpKernel<T> {
     out->Resize({in_dims[0], win_size});
     auto out_data = out->mutable_data<T>(context.GetPlace());
     // Copy LoD to GPU
-    paddle::framework::MixVector<size_t> mixv_lod0(&lod0);
+    phi::MixVector<size_t> mixv_lod0(&lod0);
     const size_t* dev_in_lod_ptr = mixv_lod0.CUDAData(context.GetPlace());
     // Calc output tensor
     CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,

paddle/fluid/operators/sequence_ops/sequence_erase_op.cu
@@ -97,7 +97,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
     // Copy LoD to GPU
     auto last_lod = lod[lod.size() - 1];
     auto lod_len = last_lod.size();
-    paddle::framework::MixVector<size_t> mixv_last_lod(&last_lod);
+    phi::MixVector<size_t> mixv_last_lod(&last_lod);
     const size_t* dev_in_lod_ptr = mixv_last_lod.CUDAData(ctx.GetPlace());
     // Calc output LoD
     thrust::device_vector<size_t> dev_out_lod(lod_len);

paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
@@ -65,11 +65,10 @@ static __global__ void sequence_expand_as_grad_kernel(
 template <typename T>
 struct SequenceExpandAsFunctor<phi::GPUContext, T> {
-  void operator()(const phi::GPUContext &context,
-                  const phi::DenseTensor &x,
-                  const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
-                  phi::DenseTensor *out) {
+  void operator()(const phi::GPUContext &context,
+                  const phi::DenseTensor &x,
+                  const phi::Vector<size_t> &ref_lod, /*expand referenced lod*/
+                  phi::DenseTensor *out) {
     int height = x.dims()[0];
     int width = phi::product(x.dims()) / height;
@@ -84,7 +83,7 @@ struct SequenceExpandAsFunctor<phi::GPUContext, T> {
     dim3 block_size(thread_x);
     dim3 grid_size(block_x);
-    paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
+    phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
     sequence_expand_as_kernel<<<grid_size, block_size, 0, context.stream()>>>(
         x.data<T>(),
         mixv_ref_lod.CUDAData(context.GetPlace()),
@@ -98,7 +97,7 @@ template <typename T>
 struct SequenceExpandAsGradFunctor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext &context,
                   const phi::DenseTensor &dout,
-                  const framework::Vector<size_t> &ref_lod, /*expand based lod*/
+                  const phi::Vector<size_t> &ref_lod, /*expand based lod*/
                   phi::DenseTensor *dx) {
     int height = dx->dims()[0];
     int width = phi::product(dx->dims()) / height;
@@ -114,7 +113,7 @@ struct SequenceExpandAsGradFunctor<phi::GPUContext, T> {
     dim3 block_size(thread_x);
     dim3 grid_size(block_x);
-    paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
+    phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
     sequence_expand_as_grad_kernel<<<grid_size,
                                      block_size,
                                      0,

paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
@@ -26,29 +26,26 @@ namespace operators {
 template <typename DeviceContext, typename T>
 struct SequenceExpandAsFunctor {
-  void operator()(const DeviceContext &ctx,
-                  const phi::DenseTensor &x,
-                  const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
-                  phi::DenseTensor *out);
+  void operator()(const DeviceContext &ctx,
+                  const phi::DenseTensor &x,
+                  const phi::Vector<size_t> &ref_lod, /*expand referenced lod*/
+                  phi::DenseTensor *out);
 };

 template <typename DeviceContext, typename T>
 struct SequenceExpandAsGradFunctor {
-  void operator()(const DeviceContext &ctx,
-                  const phi::DenseTensor &dout,
-                  const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
-                  phi::DenseTensor *dx);
+  void operator()(const DeviceContext &ctx,
+                  const phi::DenseTensor &dout,
+                  const phi::Vector<size_t> &ref_lod, /*expand referenced lod*/
+                  phi::DenseTensor *dx);
 };

 template <typename T>
 struct SequenceExpandAsFunctor<phi::CPUContext, T> {
-  void operator()(const phi::CPUContext &context,
-                  const phi::DenseTensor &x,
-                  const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
-                  phi::DenseTensor *out) {
+  void operator()(const phi::CPUContext &context,
+                  const phi::DenseTensor &x,
+                  const phi::Vector<size_t> &ref_lod, /*expand referenced lod*/
+                  phi::DenseTensor *out) {
     int64_t height = x.dims()[0];
     int64_t width = phi::product(x.dims()) / height;
@@ -122,11 +119,10 @@ class SequenceExpandAsKernel : public framework::OpKernel<T> {
  * */
 template <typename T>
 struct SequenceExpandAsGradFunctor<phi::CPUContext, T> {
-  void operator()(const phi::CPUContext &context,
-                  const phi::DenseTensor &dout,
-                  const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
-                  phi::DenseTensor *dx) {
+  void operator()(const phi::CPUContext &context,
+                  const phi::DenseTensor &dout,
+                  const phi::Vector<size_t> &ref_lod, /*expand referenced lod*/
+                  phi::DenseTensor *dx) {
     int64_t height = dx->dims()[0];
     int64_t width = phi::product(dx->dims()) / height;

paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
@@ -82,9 +82,9 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data,
   }
 }

-void GetOutputOffset(const framework::Vector<size_t>& x_lod,
-                     const framework::Vector<size_t>& ref_lod,
-                     framework::Vector<size_t>* out_offset) {
+void GetOutputOffset(const phi::Vector<size_t>& x_lod,
+                     const phi::Vector<size_t>& ref_lod,
+                     phi::Vector<size_t>* out_offset) {
   size_t offset = 0;
   int lod_size = static_cast<int>(x_lod.size());
   for (int i = 0; i < static_cast<int>(x_lod.size()); ++i) {
@@ -99,8 +99,8 @@ template <typename T>
 static int ExpandByMemoryCopy(const phi::GPUContext& context,
                               const LoDTensor& x,
                               LoDTensor* out,
-                              const framework::Vector<size_t>& x_lod,
-                              const framework::Vector<size_t>& ref_lod,
+                              const phi::Vector<size_t>& x_lod,
+                              const phi::Vector<size_t>& ref_lod,
                               bool do_copy) {
   auto out_data = out->data<T>();
   auto x_data = x.data<T>();
@@ -143,12 +143,11 @@ static int ExpandByMemoryCopy(const phi::GPUContext& context,
 template <typename T>
 struct SequenceExpandFunctor<phi::GPUContext, T> {
-  void operator()(const phi::GPUContext& context,
-                  const LoDTensor& x,
-                  const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-                  const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
-                  LoDTensor* out) {
+  void operator()(const phi::GPUContext& context,
+                  const LoDTensor& x,
+                  const phi::Vector<size_t>& x_lod,   /*expand source lod*/
+                  const phi::Vector<size_t>& ref_lod, /*expand referenced lod*/
+                  LoDTensor* out) {
     int num_copys = ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, false);
     // Sometimes direct copies will be faster, this maybe need deeply analysis.
@@ -157,7 +156,7 @@ struct SequenceExpandFunctor<phi::GPUContext, T> {
     } else {
       int x_item_length = x.numel() / x.dims()[0];
       size_t x_lod_size = x_lod.size();
-      framework::Vector<size_t> out_offset(x_lod_size * 2 + ref_lod.size());
+      phi::Vector<size_t> out_offset(x_lod_size * 2 + ref_lod.size());
       GetOutputOffset(x_lod, ref_lod, &out_offset);

       for (size_t i = 0; i < x_lod_size; ++i) {
@@ -167,7 +166,7 @@ struct SequenceExpandFunctor<phi::GPUContext, T> {
         out_offset[2 * x_lod_size + i] = ref_lod[i];
       }

-      paddle::framework::MixVector<size_t> mixv_out_offset(&out_offset);
+      phi::MixVector<size_t> mixv_out_offset(&out_offset);
       const size_t* out_offset_data = mixv_out_offset.CUDAData(context.GetPlace());
       const size_t* x_lod_data = out_offset_data + x_lod_size;
@@ -197,11 +196,11 @@ template <typename T>
 struct SequenceExpandGradFunctor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
                   const LoDTensor& dout,
-                  const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-                  const framework::Vector<size_t>& ref_lod, /*expand based lod*/
+                  const phi::Vector<size_t>& x_lod,   /*expand source lod*/
+                  const phi::Vector<size_t>& ref_lod, /*expand based lod*/
                   LoDTensor* dx) {
     int x_item_length = phi::product(dx->dims()) / dx->dims()[0];
-    framework::Vector<size_t> out_offset(x_lod.size());
+    phi::Vector<size_t> out_offset(x_lod.size());
     GetOutputOffset(x_lod, ref_lod, &out_offset);

     int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
@@ -210,9 +209,9 @@ struct SequenceExpandGradFunctor<phi::GPUContext, T> {
     int block_x = static_cast<int>(ref_lod.size());
     dim3 block_size(thread_x, thread_y, thread_z);
     dim3 grid_size(block_x, 1);
-    paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
-    paddle::framework::MixVector<size_t> mixv_x_lod(&x_lod);
-    paddle::framework::MixVector<size_t> mixv_out_offset(&out_offset);
+    phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
+    phi::MixVector<size_t> mixv_x_lod(&x_lod);
+    phi::MixVector<size_t> mixv_out_offset(&out_offset);
     sequence_expand_grad_kernel<<<grid_size, block_size, 0, context.stream()>>>(
         dout.data<T>(),
         mixv_ref_lod.CUDAData(context.GetPlace()),

paddle/fluid/operators/sequence_ops/sequence_expand_op.h
浏览文件 @
35d7d1f0
...
...
@@ -29,32 +29,29 @@ using EigenMatrix = phi::EigenMatrix<T, MajorType, IndexType>;
template
<
typename
DeviceContext
,
typename
T
>
struct
SequenceExpandFunctor
{
void
operator
()(
const
DeviceContext
&
ctx
,
const
phi
::
DenseTensor
&
x
,
const
framework
::
Vector
<
size_t
>&
x_lod
,
/*expand source lod*/
const
framework
::
Vector
<
size_t
>&
ref_lod
,
/*expand referenced lod*/
phi
::
DenseTensor
*
out
);
void
operator
()(
const
DeviceContext
&
ctx
,
const
phi
::
DenseTensor
&
x
,
const
phi
::
Vector
<
size_t
>&
x_lod
,
/*expand source lod*/
const
phi
::
Vector
<
size_t
>&
ref_lod
,
/*expand referenced lod*/
phi
::
DenseTensor
*
out
);
};
template
<
typename
DeviceContext
,
typename
T
>
struct
SequenceExpandGradFunctor
{
void
operator
()(
const
DeviceContext
&
ctx
,
const
phi
::
DenseTensor
&
dout
,
const
framework
::
Vector
<
size_t
>&
x_lod
,
/*expand source lod*/
const
framework
::
Vector
<
size_t
>&
ref_lod
,
/*expand referenced lod*/
phi
::
DenseTensor
*
dx
);
void
operator
()(
const
DeviceContext
&
ctx
,
const
phi
::
DenseTensor
&
dout
,
const
phi
::
Vector
<
size_t
>&
x_lod
,
/*expand source lod*/
const
phi
::
Vector
<
size_t
>&
ref_lod
,
/*expand referenced lod*/
phi
::
DenseTensor
*
dx
);
};
template
<
typename
T
>
struct
SequenceExpandFunctor
<
phi
::
CPUContext
,
T
>
{
void
operator
()(
const
phi
::
CPUContext
&
context
,
const
phi
::
DenseTensor
&
x
,
const
framework
::
Vector
<
size_t
>&
x_lod
,
/*expand source lod*/
const
framework
::
Vector
<
size_t
>&
ref_lod
,
/*expand referenced lod*/
phi
::
DenseTensor
*
out
)
{
void
operator
()(
const
phi
::
CPUContext
&
context
,
const
phi
::
DenseTensor
&
x
,
const
phi
::
Vector
<
size_t
>&
x_lod
,
/*expand source lod*/
const
phi
::
Vector
<
size_t
>&
ref_lod
,
/*expand referenced lod*/
phi
::
DenseTensor
*
out
)
{
int
out_offset
=
0
;
int
x_item_length
=
x
.
numel
()
/
x
.
dims
()[
0
];
auto
out_data
=
out
->
data
<
T
>
();
...
...
@@ -112,7 +109,7 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
    }
    // x lod level is at most 1.
-   framework::Vector<size_t> out_lod;
+   phi::Vector<size_t> out_lod;
    if (x_lod.size() == 1) {
      out_lod.push_back(0);
      int out_offset = 0;
...
...
@@ -130,7 +127,7 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
      auto& ref_lod = *out->mutable_lod();
      ref_lod[0] = out_lod;
    }
-   framework::Vector<size_t> ref_x_lod;
+   phi::Vector<size_t> ref_x_lod;
    if (x->lod().size() == 1) {
      ref_x_lod = x->lod()[0];
    } else {
...
...
@@ -161,12 +158,11 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
  * */
 template <typename T>
 struct SequenceExpandGradFunctor<phi::CPUContext, T> {
-  void operator()(const phi::CPUContext& context,
-                  const phi::DenseTensor& dout,
-                  const framework::Vector<size_t>& x_lod,   /*expand source lod*/
-                  const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
-                  phi::DenseTensor* dx) {
+  void operator()(const phi::CPUContext& context,
+                  const phi::DenseTensor& dout,
+                  const phi::Vector<size_t>& x_lod,   /*expand source lod*/
+                  const phi::Vector<size_t>& ref_lod, /*expand referenced lod*/
+                  phi::DenseTensor* dx) {
    int dout_offset = 0;
    for (size_t i = 1; i < ref_lod.size(); ++i) {
      int repeat_num = ref_lod[i] - ref_lod[i - 1];
...
...
@@ -214,8 +210,8 @@ class SequenceExpandGradKernel : public framework::OpKernel<T> {
      return;
    }
-   framework::Vector<size_t> ref_x_lod;
-   framework::Vector<size_t> ref_lod = y_lod[ref_level];
+   phi::Vector<size_t> ref_x_lod;
+   phi::Vector<size_t> ref_lod = y_lod[ref_level];
    if (x->lod().size() == 1) {
      ref_x_lod = x->lod()[0];
    } else {
...
...
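The two hunks above only change the type of the lod arguments; callers keep the same call shape. A minimal sketch of a call site after the change (the context, tensors and lod values below are illustrative, not taken from the patch):

    // Hypothetical usage of the CPU specialization declared above.
    phi::Vector<size_t> x_lod = {0, 2, 5};    // expand source lod (made-up offsets)
    phi::Vector<size_t> ref_lod = {0, 3, 7};  // expand referenced lod (made-up offsets)
    paddle::operators::SequenceExpandFunctor<phi::CPUContext, float> expand;
    expand(cpu_ctx, x, x_lod, ref_lod, &out);  // cpu_ctx, x and out prepared elsewhere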
paddle/fluid/operators/sequence_ops/sequence_reverse_op.h
...
...
@@ -139,7 +139,7 @@ class SequenceReverseOpKernel : public framework::OpKernel<T> {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (platform::is_gpu_place(ctx.GetPlace())) {
      auto xlod = x.lod()[0];
-     paddle::framework::MixVector<size_t> mixv_xlod(&xlod);
+     phi::MixVector<size_t> mixv_xlod(&xlod);
      lod = mixv_xlod.CUDAData(ctx.GetPlace());
    } else {
#endif
...
...
paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
...
...
@@ -124,7 +124,7 @@ template <typename T>
 struct SequenceSoftmaxFunctor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
                   const LoDTensor& x,
-                  const framework::Vector<size_t>& ref_lod, /*referenced lod*/
+                  const phi::Vector<size_t>& ref_lod, /*referenced lod*/
                   LoDTensor* out) {
    int height = ref_lod.size() - 1;
...
...
@@ -135,7 +135,7 @@ struct SequenceSoftmaxFunctor<phi::GPUContext, T> {
    dim3 block_size(thread_x);
    dim3 grid_size(max_blocks);
-   paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
+   phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
    sequence_softmax_kernel<T, kThreadsPerBlock>
        <<<grid_size, block_size, 0, context.stream()>>>(x.data<T>(),
...
...
@@ -150,7 +150,7 @@ struct SequenceSoftmaxGradFunctor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
                   const LoDTensor& dout,
                   const LoDTensor& out,
-                  const framework::Vector<size_t>& ref_lod, /*referenced lod*/
+                  const phi::Vector<size_t>& ref_lod, /*referenced lod*/
                   LoDTensor* dx) {
    size_t height = ref_lod.size() - 1;
...
...
@@ -162,7 +162,7 @@ struct SequenceSoftmaxGradFunctor<phi::GPUContext, T> {
    dim3 block_size(thread_x);
    dim3 grid_size(max_blocks);
-   paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
+   phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
    sequence_softmax_grad_kernel<T, kThreadsPerBlock>
        <<<grid_size, block_size, 0, context.stream()>>>(dout.data<T>(),
...
...
paddle/fluid/operators/sequence_ops/sequence_softmax_op.h
...
...
@@ -21,11 +21,10 @@ namespace operators {
 template <typename DeviceContext, typename T>
 struct SequenceSoftmaxFunctor {
-  void operator()(const DeviceContext& ctx,
-                  const phi::DenseTensor& x,
-                  const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
-                  phi::DenseTensor* out);
+  void operator()(const DeviceContext& ctx,
+                  const phi::DenseTensor& x,
+                  const phi::Vector<size_t>& ref_lod, /*expand referenced lod*/
+                  phi::DenseTensor* out);
 };

 template <typename DeviceContext, typename T>
...
...
@@ -33,7 +32,7 @@ struct SequenceSoftmaxGradFunctor {
   void operator()(const DeviceContext& ctx,
                   const phi::DenseTensor& dout,
                   const phi::DenseTensor& out,
-                  const framework::Vector<size_t>& ref_lod, /*referenced lod*/
+                  const phi::Vector<size_t>& ref_lod, /*referenced lod*/
                   phi::DenseTensor* dx);
 };
...
...
@@ -41,7 +40,7 @@ template <typename T>
 struct SequenceSoftmaxFunctor<phi::CPUContext, T> {
   void operator()(const phi::CPUContext& ctx,
                   const phi::DenseTensor& x,
-                  const framework::Vector<size_t>& ref_lod, /*referenced lod*/
+                  const phi::Vector<size_t>& ref_lod, /*referenced lod*/
                   phi::DenseTensor* out) {
    size_t height = ref_lod.size() - 1;
    const T* in_data = x.data<T>();
...
...
@@ -64,7 +63,7 @@ struct SequenceSoftmaxGradFunctor<phi::CPUContext, T> {
   void operator()(const phi::CPUContext& ctx,
                   const phi::DenseTensor& dout,
                   const phi::DenseTensor& out,
-                  const framework::Vector<size_t>& ref_lod, /*referenced lod*/
+                  const phi::Vector<size_t>& ref_lod, /*referenced lod*/
                   phi::DenseTensor* dx) {
    size_t height = ref_lod.size() - 1;
...
...
paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h
...
...
@@ -116,7 +116,7 @@ class SequenceTopkAvgPoolingKernel : public framework::OpKernel<T> {
    auto pos_data = pos->mutable_data<int>(context.GetPlace());
    int offset = 0;
-   framework::Vector<size_t> vec_out_lod;
+   phi::Vector<size_t> vec_out_lod;
    vec_out_lod.reserve(batch_size + 1);
    for (int i = 0; i <= batch_size; ++i) {
      offset = row_lod[i];
...
...
paddle/fluid/operators/shuffle_batch_op.h
...
...
@@ -25,16 +25,16 @@
 #include "glog/logging.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/timer.h"
+#include "paddle/phi/core/mixed_vector.h"

 namespace paddle {
 namespace operators {

 template <typename T>
-using Vector = framework::Vector<T>;
+using Vector = phi::Vector<T>;

 template <typename T>
 class ShuffleBatchKernel : public framework::OpKernel<T> {
...
...
paddle/fluid/operators/tdm_child_op.h
...
...
@@ -22,8 +22,8 @@
 #include <vector>

 #include "gflags/gflags.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/mixed_vector.h"

 namespace paddle {
 namespace operators {
...
...
paddle/fluid/operators/tdm_sampler_op.h
...
...
@@ -22,9 +22,9 @@
 #include <vector>

 #include "gflags/gflags.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/sampler.h"
+#include "paddle/phi/core/mixed_vector.h"

 namespace paddle {
 namespace operators {
...
...
paddle/fluid/pybind/tensor.cc
...
...
@@ -1095,7 +1095,7 @@ void BindTensor(pybind11::module &m) { // NOLINT
 #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
             self.set_rows(rows);
 #else
-            Vector<int64_t> new_rows(rows);
+            std::vector<int64_t> new_rows(rows);
             self.set_rows(new_rows);
 #endif
           })
...
...
paddle/phi/core/CMakeLists.txt
...
...
@@ -114,6 +114,11 @@ cc_library(
   SRCS custom_kernel.cc
   DEPS kernel_factory)

+cc_library(
+  mixed_vector
+  SRCS mixed_vector.cc
+  DEPS device_context place memory)
+
 # Will remove once we implemented MKLDNN_Tensor
 if(WITH_MKLDNN)
   add_dependencies(dense_tensor mkldnn)
...
...
paddle/fluid/framework/mixed_vector.cc → paddle/phi/core/mixed_vector.cc
...
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/phi/core/mixed_vector.h"
#include <algorithm>
#include <initializer_list>
...
...
@@ -22,28 +22,26 @@ limitations under the License. */
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/all_context.h"
 #include "paddle/utils/none.h"
 #include "paddle/utils/optional.h"

-namespace paddle {
-namespace framework {
+namespace phi {
 template <typename T>
 void CopyToCPUHelper(std::vector<T> *cpu_,
-                     paddle::memory::AllocationPtr *gpu_,
+                     phi::Allocator::AllocationPtr *gpu_,
                      size_t *gpu_memory_size_) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // COPY GPU Data To CPU
   auto *dev_ctx = static_cast<phi::GPUContext *>(
-      platform::DeviceContextPool::Instance().Get((*gpu_)->place()));
+      phi::DeviceContextPool::Instance().Get((*gpu_)->place()));
   auto stream = dev_ctx->stream();
   void *src = (*gpu_)->ptr();
   void *dst = cpu_->data();
-  paddle::memory::Copy(platform::CPUPlace(),
+  paddle::memory::Copy(phi::CPUPlace(),
                        dst,
                        OptionalCUDAPlace(*gpu_).get(),
                        src,
...
...
@@ -55,20 +53,20 @@ void CopyToCPUHelper(std::vector<T> *cpu_,
 template <typename T>
 void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
-                             paddle::memory::AllocationPtr *gpu_,
+                             phi::Allocator::AllocationPtr *gpu_,
                              size_t *gpu_memory_size_,
-                             const platform::Place &place) {
+                             const phi::Place &place) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void *src = cpu_->data();
   *gpu_memory_size_ = cpu_->size() * sizeof(T);  // sizeof(T)
-  (*gpu_) = memory::Alloc(place, *gpu_memory_size_);
+  (*gpu_) = paddle::memory::Alloc(place, *gpu_memory_size_);
   void *dst = (*gpu_)->ptr();
   auto *dev_ctx = static_cast<phi::GPUContext *>(
-      platform::DeviceContextPool::Instance().Get(place));
+      phi::DeviceContextPool::Instance().Get(place));
   auto stream = dev_ctx->stream();
   paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(),
                        dst,
-                       platform::CPUPlace(),
+                       phi::CPUPlace(),
                        src,
                        *gpu_memory_size_,
                        stream);
...
...
@@ -84,7 +82,7 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
                                                                               \
   template <>                                                                 \
   void MixVector<__TYPE__>::VectorData::CopyCPUDataToCUDA(                    \
-      const platform::Place &place) const {                                   \
+      const phi::Place &place) const {                                        \
     CopyCPUDataToCUDAHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_, place); \
   }
...
...
@@ -92,5 +90,4 @@ INSTANTIATE_VECTOR_FOR_TYPE(size_t)
 INSTANTIATE_VECTOR_FOR_TYPE(int)
 INSTANTIATE_VECTOR_FOR_TYPE(int64_t)

-};  // namespace framework
-}  // namespace paddle
+};  // namespace phi
paddle/fluid/framework/mixed_vector.h → paddle/phi/core/mixed_vector.h
...
...
@@ -22,20 +22,22 @@ limitations under the License. */
#include <vector>
#include "glog/logging.h"
-#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/allocator.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/errors.h"
 #include "paddle/utils/none.h"
 #include "paddle/utils/optional.h"

-namespace paddle {
-namespace framework {
+namespace phi {
 template <class T>
 using Vector = std::vector<T>;

-inline paddle::optional<platform::CUDAPlace> OptionalCUDAPlace(
-    const paddle::memory::allocation::AllocationPtr &gpu_) {
+inline paddle::optional<phi::GPUPlace> OptionalCUDAPlace(
+    const phi::Allocator::AllocationPtr &gpu_) {
   return gpu_ == nullptr
              ? paddle::none
-             : paddle::optional<platform::CUDAPlace>(gpu_->place());
+             : paddle::optional<phi::GPUPlace>(gpu_->place());
 }
// Vector<T> implements the std::vector interface, and can get Data or
...
...
@@ -146,18 +148,18 @@ class MixVector {
}
// get cuda ptr. immutable
-    const T *CUDAData(platform::Place place) const {
+    const T *CUDAData(phi::Place place) const {
       PADDLE_ENFORCE_EQ(
-          platform::is_gpu_place(place),
+          place.GetType() == phi::AllocationType::GPU,
           true,
-          platform::errors::Unavailable(
+          phi::errors::Unavailable(
               "Place mismatch, CUDA Data must be on CUDA place."));
       ImmutableCUDA(place);
       return reinterpret_cast<T *>(gpu_->ptr());
     }

     // get cuda ptr. mutable
-    T *CUDAMutableData(platform::Place place) {
+    T *CUDAMutableData(phi::Place place) {
       const T *ptr = CUDAData(place);
       flag_ = kDirty | kDataInCUDA;
       return const_cast<T *>(ptr);
...
...
@@ -178,7 +180,7 @@ class MixVector {
     std::mutex &Mutex() const { return mtx_; }

-    paddle::optional<platform::CUDAPlace> CUDAPlace() const {
+    paddle::optional<phi::GPUPlace> CUDAPlace() const {
       return OptionalCUDAPlace(gpu_);
     }
...
...
@@ -199,7 +201,7 @@ class MixVector {
     void CopyToCPU() const;

-    void ImmutableCUDA(platform::Place place) const {
+    void ImmutableCUDA(phi::Place place) const {
       if (IsDirty()) {
         if (IsInCPU()) {
           CopyCPUDataToCUDA(place);
...
...
@@ -207,7 +209,7 @@ class MixVector {
           SetFlag(kDataInCUDA);
         } else if (IsInCUDA() && !(place == gpu_->place())) {
           PADDLE_THROW(
-              platform::errors::Unavailable("Unexpected data place mismatch."));
+              phi::errors::Unavailable("Unexpected data place mismatch."));
           // Still dirty
         } else {
           // Dirty && DataInCUDA && Device is same
...
...
@@ -220,7 +222,7 @@ class MixVector {
           SetFlag(kDataInCUDA);
         } else if (!(place == gpu_->place())) {
           PADDLE_THROW(
-              platform::errors::Unavailable("Unexpected data place mismatch."));
+              phi::errors::Unavailable("Unexpected data place mismatch."));
         } else {
           // Not Dirty && DataInCUDA && Device is same
           // Do nothing.
...
...
@@ -228,7 +230,7 @@ class MixVector {
       }
     }

-    void CopyCPUDataToCUDA(const platform::Place &place) const;
+    void CopyCPUDataToCUDA(const phi::Place &place) const;

     void ImmutableCPU() const {
       if (IsDirty() && !IsInCPU()) {  // If data has been changed in CUDA, or
...
...
@@ -249,7 +251,7 @@ class MixVector {
     bool IsInCPU() const { return flag_ & kDataInCPU; }

     std::vector<T> *cpu_;
-    mutable paddle::memory::allocation::AllocationPtr gpu_;
+    mutable phi::Allocator::AllocationPtr gpu_;
     mutable size_t gpu_memory_size_{0};
     mutable int flag_;
...
...
@@ -332,9 +334,9 @@ class MixVector {
}
// get cuda ptr. immutable
-  const T *CUDAData(platform::Place place) const {
+  const T *CUDAData(phi::Place place) const {
     {
-      platform::CUDAPlace p(place.GetDeviceId());
+      phi::GPUPlace p(place.GetDeviceId());
       auto &mtx = m_->Mutex();
       std::lock_guard<std::mutex> guard(mtx);
       auto cuda_place = m_->CUDAPlace();
...
...
@@ -348,9 +350,9 @@ class MixVector {
}
// get cuda ptr. mutable
-  T *CUDAMutableData(platform::Place place) {
+  T *CUDAMutableData(phi::Place place) {
     {
-      platform::CUDAPlace p(place.GetDeviceId());
+      phi::GPUPlace p(place.GetDeviceId());
       auto &mtx = m_->Mutex();
       std::lock_guard<std::mutex> guard(mtx);
       auto cuda_place = m_->CUDAPlace();
...
...
@@ -372,8 +374,8 @@ class MixVector {
   void reserve(size_t size) { m_->reserve(size); }

   // the unify method to access CPU or CUDA data. immutable.
-  const T *Data(platform::Place place) const {
-    if (platform::is_gpu_place(place)) {
+  const T *Data(phi::Place place) const {
+    if (place.GetType() == phi::AllocationType::GPU) {
       return CUDAData(place);
     } else {
       return data();
...
...
@@ -381,8 +383,8 @@ class MixVector {
   }

   // the unify method to access CPU or CUDA data. mutable.
-  T *MutableData(platform::Place place) {
-    if (platform::is_gpu_place(place)) {
+  T *MutableData(phi::Place place) {
+    if (place.GetType() == phi::AllocationType::GPU) {
       return CUDAMutableData(place);
     } else {
       return data();
...
...
@@ -397,5 +399,4 @@ class MixVector {
   mutable std::unique_ptr<VectorData> m_;
 };

-};  // namespace framework
-}  // namespace paddle
+};  // namespace phi
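The call sites patched below all follow the same pattern: keep the lod or rows in a phi::Vector on the host, then wrap it in a phi::MixVector when a device pointer is needed for a kernel launch. A minimal sketch, assuming a CUDA build, an existing phi::GPUContext named ctx, and a hypothetical kernel example_kernel (none of these names come from the patch):

    phi::Vector<size_t> lod = {0, 4, 9};                        // host-side offsets (illustrative)
    phi::MixVector<size_t> mix_lod(&lod);                       // non-owning wrapper over the host vector
    const size_t* dev_lod = mix_lod.CUDAData(ctx.GetPlace());   // data is copied to the GPU on demand
    example_kernel<<<1, 128, 0, ctx.stream()>>>(dev_lod, lod.size());  // hypothetical kernel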
paddle/phi/kernels/cpu/edit_distance_kernel.cc
...
...
@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/edit_distance_kernel.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"

 namespace phi {
...
...
@@ -34,8 +34,8 @@ void EditDistanceKernel(const Context& ctx,
   int64_t* seq_num_data = ctx.template Alloc<int64_t>(sequencenum);

   auto batch_size = hyps.dims()[0];
-  paddle::framework::Vector<size_t> hyp_lod(batch_size + 1);
-  paddle::framework::Vector<size_t> ref_lod(batch_size + 1);
+  phi::Vector<size_t> hyp_lod(batch_size + 1);
+  phi::Vector<size_t> ref_lod(batch_size + 1);
   bool use_length = hypslength.get_ptr() != nullptr;
...
...
paddle/phi/kernels/funcs/selected_rows_functor.cc
...
...
@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/selected_rows_functor.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/phi/core/mixed_vector.h"

 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/phi/backends/onednn/axpy_handler.h"
...
...
@@ -200,7 +200,7 @@ struct SelectedRowsAddTo<phi::CPUContext, T> {
     auto* in2_value = input2->mutable_value();

     // concat rows
-    paddle::framework::MixVector<int64_t> mixv_in2_rows(&in2_rows);
+    phi::MixVector<int64_t> mixv_in2_rows(&in2_rows);
     mixv_in2_rows.Extend(in1_rows.begin(), in1_rows.end());

     auto in1_place = input1.place();
...
...
@@ -254,7 +254,7 @@ struct SelectedRowsSumTo<phi::CPUContext, T> {
     std::vector<int64_t> in2_rows;
     in2_rows.reserve(in2_rows.size() + size);
     for (auto iter = input1.begin(); iter != input1.end(); ++iter) {
-      const paddle::framework::Vector<int64_t>& in_rows = (*iter)->rows();
+      const phi::Vector<int64_t>& in_rows = (*iter)->rows();
       in2_rows.insert(in2_rows.end(), in_rows.begin(), in_rows.end());
     }
     input2->set_rows(in2_rows);
...
...
@@ -646,7 +646,7 @@ struct MergeAdd<phi::XPUContext, T> {
                   const phi::SelectedRows& input,
                   phi::SelectedRows* output,
                   const bool sorted_result = false) {
-    paddle::framework::Vector<int64_t> input_rows(input.rows());
+    phi::Vector<int64_t> input_rows(input.rows());
     if (input_rows.size() == 0) {
       return;
     }
...
...
paddle/phi/kernels/funcs/selected_rows_functor.cu
...
...
@@ -40,7 +40,7 @@ struct SelectedRowsAdd<phi::GPUContext, T> {
                           input2.height()));
     output->set_height(in1_height);

-    paddle::framework::Vector<int64_t> in1_rows(input1.rows());
+    phi::Vector<int64_t> in1_rows(input1.rows());
     auto& in2_rows = input2.rows();
     std::vector<int64_t> out_rows;
     out_rows.reserve(in1_rows.size() + in2_rows.size());
...
...
@@ -189,7 +189,7 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
     const int block_size = 256;
     dim3 threads(block_size, 1);
     dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
+    phi::MixVector<int64_t> mixv_in1_rows(&in1_rows);
     SelectedRowsAddTensorKernel<T, block_size>
         <<<grid, threads, 0, context.stream()>>>(in1_data,
...
@@ -231,7 +231,7 @@ struct SelectedRowsAddTo<phi::GPUContext, T> {
auto
*
in2_value
=
input2
->
mutable_value
();
// concat rows
p
addle
::
framework
::
MixVector
<
int64_t
>
mixv_in2_rows
(
&
in2_rows
);
p
hi
::
MixVector
<
int64_t
>
mixv_in2_rows
(
&
in2_rows
);
if
(
in1_rows
.
size
())
{
mixv_in2_rows
.
Extend
(
in1_rows
.
begin
(),
in1_rows
.
end
());
}
...
...
@@ -318,7 +318,7 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
     const int block_size = 256;
     dim3 threads(block_size, 1);
     dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
+    phi::MixVector<int64_t> mixv_in1_rows(&in1_rows);
     SelectedRowsAddToTensorKernel<T, block_size>
         <<<grid, threads, 0, context.stream()>>>(in1_data,
...
@@ -378,7 +378,7 @@ struct MergeAddImpl {
const
phi
::
SelectedRows
&
input
,
phi
::
SelectedRows
*
output
,
const
bool
sorted_result
=
false
)
{
p
addle
::
framework
::
Vector
<
int64_t
>
input_rows
(
input
.
rows
());
p
hi
::
Vector
<
int64_t
>
input_rows
(
input
.
rows
());
if
(
input_rows
.
size
()
==
0
)
{
return
;
}
...
...
@@ -386,7 +386,7 @@ struct MergeAddImpl {
     phi::SelectedRows& out = *output;
     std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
     std::vector<int64_t> merge_rows_cpu(row_set.begin(), row_set.end());
-    paddle::framework::Vector<int64_t> merge_rows(merge_rows_cpu);
+    phi::Vector<int64_t> merge_rows(merge_rows_cpu);

     auto input_width = input.value().dims()[1];
...
...
@@ -407,8 +407,8 @@ struct MergeAddImpl {
     dim3 threads(block_size, 1);
     dim3 grid1(input_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mix_vector_input(&input_rows);
-    paddle::framework::MixVector<int64_t> mix_vector_out(out.mutable_rows());
+    phi::MixVector<int64_t> mix_vector_input(&input_rows);
+    phi::MixVector<int64_t> mix_vector_out(out.mutable_rows());
     MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
         input_data,
         mix_vector_input.CUDAData(context.GetPlace()),
...
...
@@ -459,7 +459,7 @@ struct MergeAddImpl {
     }
     std::vector<int64_t> merge_rows_cpu(merged_row_set.begin(),
                                         merged_row_set.end());
-    paddle::framework::Vector<int64_t> merge_rows(merge_rows_cpu);
+    phi::Vector<int64_t> merge_rows(merge_rows_cpu);
     out.set_rows(merge_rows);
     out.set_height(input_height);
...
...
@@ -485,8 +485,8 @@ struct MergeAddImpl {
       auto& input_rows = input->rows();
       dim3 grid1(input_rows.size(), 1);
-      paddle::framework::MixVector<int64_t> mix_vector_input(&input_rows);
-      paddle::framework::MixVector<int64_t> mix_vector_out(out.mutable_rows());
+      phi::MixVector<int64_t> mix_vector_input(&input_rows);
+      phi::MixVector<int64_t> mix_vector_out(out.mutable_rows());
       MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
           input_data,
           mix_vector_input.CUDAData(context.GetPlace()),
...
...
paddle/phi/kernels/funcs/sequence2batch.cc
...
...
@@ -22,7 +22,7 @@ class CopyMatrixRowsFunctor<phi::CPUContext, T> {
public:
   void operator()(const phi::CPUContext& context,
                   const phi::DenseTensor& src,
-                  paddle::framework::Vector<size_t> index_lod,
+                  phi::Vector<size_t> index_lod,
                   phi::DenseTensor* dst,
                   bool is_src_index) {
     size_t* index = index_lod.data();
...
...
paddle/phi/kernels/funcs/sequence2batch.cu
...
...
@@ -43,7 +43,7 @@ class CopyMatrixRowsFunctor<phi::GPUContext, T> {
public:
   void operator()(const phi::GPUContext& context,
                   const phi::DenseTensor& src,
-                  paddle::framework::Vector<size_t> index_lod,
+                  phi::Vector<size_t> index_lod,
                   phi::DenseTensor* dst,
                   bool is_src_index) {
     auto src_dims = src.dims();
...
...
@@ -79,7 +79,7 @@ class CopyMatrixRowsFunctor<phi::GPUContext, T> {
     dim3 threads(128, 8);
     dim3 grid(8, 1);
     auto stream = context.stream();
-    paddle::framework::MixVector<size_t> mix_index_lod(&index_lod);
+    phi::MixVector<size_t> mix_index_lod(&index_lod);
     CopyMatrixRowsKernel<T, 128, 8, 8><<<grid, threads, 0, stream>>>(
         src_data,
         dst_data,
...
...
paddle/phi/kernels/funcs/sequence2batch.h
...
...
@@ -38,7 +38,7 @@ class CopyMatrixRowsFunctor {
// The indexed rows are based on the input index.
   void operator()(const DeviceContext& context,
                   const phi::DenseTensor& src,
-                  paddle::framework::Vector<size_t> index_lod,
+                  phi::Vector<size_t> index_lod,
                   phi::DenseTensor* dst,
                   bool is_src_index);
 };
...
...
paddle/phi/kernels/funcs/sequence_scale.cu
...
...
@@ -46,7 +46,7 @@ class ScaleLoDTensorFunctor<phi::GPUContext, T> {
     const size_t seq_width = seq->numel() / seq->dims()[0];
     auto abs_offset_lod = paddle::framework::ToAbsOffset(lod);
     T* seq_data = context.template Alloc<T>(seq);
-    paddle::framework::MixVector<size_t> mix_vector(&(abs_offset_lod[level]));
+    phi::MixVector<size_t> mix_vector(&(abs_offset_lod[level]));
 #ifdef PADDLE_WITH_HIP
     hipLaunchKernelGGL(
...
...
paddle/phi/kernels/gpu/adagrad_kernel.cu
...
...
@@ -88,7 +88,7 @@ struct SparseAdagradFunctor<phi::GPUContext, T> {
     phi::funcs::scatter::MergeAdd<phi::GPUContext, T> merge_func;
     auto grad_merge = merge_func(context, grad);
     auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
-    paddle::framework::Vector<int64_t> merge_rows(grad_merge.rows());
+    phi::Vector<int64_t> merge_rows(grad_merge.rows());
     // 2. m += g_m * g_m
     auto grad_square =
         SquareSelectedRows<phi::GPUContext, T>(context, grad_merge);
...
...
@@ -104,7 +104,7 @@ struct SparseAdagradFunctor<phi::GPUContext, T> {
     const int block_size = 256;
     dim3 threads(block_size, 1);
     dim3 grid2(1, merge_rows.size());
-    paddle::framework::MixVector<int64_t> mixv_merge_rows(&merge_rows);
+    phi::MixVector<int64_t> mixv_merge_rows(&merge_rows);
     SparseAdagradFunctorKernel<T, 256>
         <<<grid2,
            threads,
...
...
paddle/phi/kernels/gpu/edit_distance_kernel.cu
...
...
@@ -87,8 +87,8 @@ void EditDistanceKernel(const Context& ctx,
   auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();

-  paddle::framework::Vector<size_t> hyp_lod(batch_size + 1);
-  paddle::framework::Vector<size_t> ref_lod(batch_size + 1);
+  phi::Vector<size_t> hyp_lod(batch_size + 1);
+  phi::Vector<size_t> ref_lod(batch_size + 1);
   bool use_length = hypslength.get_ptr() != nullptr;
...
...
paddle/phi/kernels/gpu/embedding_grad_kernel.cu
...
...
@@ -14,12 +14,12 @@
#include "paddle/phi/kernels/embedding_grad_kernel.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/embedding_util.h"
...
...
@@ -173,11 +173,11 @@ struct EmbeddingSparseGradCUDAFunctor {
     dim3 threads(128, 8);
     dim3 grids(8, 1);
     auto stream = dev_ctx_.stream();
-    paddle::framework::Vector<int64_t> new_rows;
+    phi::Vector<int64_t> new_rows;
     new_rows.resize(ids_num);
     auto gpu_place = dev_ctx_.GetPlace();

-    paddle::framework::MixVector<int64_t> mixv_new_rows(&new_rows);
+    phi::MixVector<int64_t> mixv_new_rows(&new_rows);
     if (!std::is_same<IdT, int64_t>::value) {
       InputTypeConvert<<<grids, threads, 0, stream>>>(
           ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
...
...
paddle/phi/kernels/gpu/sgd_kernel.cu
...
...
@@ -14,12 +14,12 @@
#include "paddle/phi/kernels/sgd_kernel.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_helper.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/mixed_vector.h"

 namespace phi {
...
...
@@ -156,7 +156,7 @@ void SGDDenseParamSparseGradKernel(
   int thread_x = kThreadsPerBlock;
   int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
   int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
-  paddle::framework::MixVector<int64_t> mixv_in_rows(&in_rows);
+  phi::MixVector<int64_t> mixv_in_rows(&in_rows);
   SparseSGDFunctorKernel<<<max_blocks, thread_x, 0, dev_ctx.stream()>>>(
       in_data,
       mixv_in_rows.CUDAData(dev_ctx.GetPlace()),
...
...
paddle/phi/kernels/impl/momentum_kernel_impl.h
...
...
@@ -551,7 +551,7 @@ void MomentumSparseImpl(const Context& ctx,
   merge_func(ctx, grad, merged_grad);

   auto* grad_merge_rows = merged_grad->mutable_rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
+  phi::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
   const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
   int64_t row_numel = merged_grad->value().numel() / merged_grad->rows().size();

   funcs::ForRange<Context> for_range(ctx, param.numel());
...
...
paddle/phi/kernels/impl/rmsprop_kernel_impl.h
...
...
@@ -309,7 +309,7 @@ void RmspropSparseKernel(const Context &ctx,
   funcs::ForRange<Context> for_range(ctx, limit);
   auto& grad_merge_rows = merged_grad->rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(&grad_merge_rows);
+  phi::MixVector<int64_t> mixv_grad_merge_rows(&grad_merge_rows);
   const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace());

   auto& merged_tensor = merged_grad->value();
...
...
paddle/phi/kernels/impl/warpctc_kernel_impl.h
...
...
@@ -236,8 +236,8 @@ void WarpctcKernel(const Context& dev_ctx,
                    DenseTensor* loss,
                    DenseTensor* warpctcgrad) {
   size_t num_sequences, sequence_width, max_sequence_length;
-  paddle::framework::Vector<size_t> logits_lod;
-  paddle::framework::Vector<size_t> label_lod;
+  phi::Vector<size_t> logits_lod;
+  phi::Vector<size_t> label_lod;
   if (logits_length.is_initialized() && labels_length.is_initialized()) {
     num_sequences = logits.dims()[1];
     sequence_width = logits.dims()[2];
...
...
@@ -397,7 +397,7 @@ void WarpctcKernel(const Context& dev_ctx,
           paddle::operators::math::TotalSequenceLength(label_lod)),
       1});
   dev_ctx.template HostAlloc<int>(&warpctc_label);
-  std::vector<paddle::framework::Vector<size_t>> lod;
+  std::vector<phi::Vector<size_t>> lod;
   lod.push_back(label_lod);
   warpctc_label.set_lod(lod);
...
...
paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc
...
...
@@ -126,7 +126,7 @@ void AdamDenseParamSparseGradKernel(
   auto& grad_tensor = grad_merge.value();
   const T* grad_data = grad_tensor.template data<T>();
   auto* grad_merge_rows = &grad_merge.rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
+  phi::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
   const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
   auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
...
...
paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu
...
...
@@ -198,7 +198,7 @@ void AdamDenseParamSparseGradKernel(
   auto& grad_tensor = grad_merge.value();
   const T* grad_data = grad_tensor.template data<T>();
   auto* grad_merge_rows = &grad_merge.rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
+  phi::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
   const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
   auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
...
...
paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu
...
...
@@ -222,7 +222,7 @@ void AdamwDenseParamSparseGradKernel(
   auto& grad_tensor = grad_merge.value();
   const T* grad_data = grad_tensor.template data<T>();
   auto* grad_merge_rows = &grad_merge.rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
+  phi::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
   const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
   auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
...
...
paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc
...
...
@@ -14,9 +14,9 @@
 #include "paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h"

-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/kernels/cpu/hsigmoid_loss_grad.h"

 namespace phi {
...
...
@@ -54,7 +54,7 @@ void HSigmoidLossGradKernel(const Context& ctx,
   PADDLE_ENFORCE_NOT_NULL(
       path.get_ptr(),
       errors::NotFound("Custom tree must be set for sparse mode!"));
-  paddle::framework::Vector<int64_t> real_rows = PathToRows(*path);
+  phi::Vector<int64_t> real_rows = PathToRows(*path);
   w_grad->set_rows(real_rows);
   // Build a map of id -> row_index to speed up finding the index of one id
   w_grad->set_height(w.dims()[0]);
...
...
paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h
...
...
@@ -221,7 +221,7 @@ void ComputeRowImpl(const Context& dev_ctx,
   auto& grad_tensor = grad_merge.value();
   const T* grad_data = grad_tensor.template data<T>();
   auto* grad_merge_rows = &grad_merge.rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
+  phi::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
   const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
   auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
   if (paddle::platform::is_gpu_place(dev_ctx.GetPlace()) &&
...
...
paddle/phi/tests/core/CMakeLists.txt
...
...
@@ -70,3 +70,20 @@ cc_test(
   test_tensor_array
   SRCS test_tensor_array.cc
   DEPS tensor_array)
+
+if(WITH_GPU)
+  nv_test(
+    test_mixed_vector
+    SRCS test_mixed_vector.cc test_mixed_vector.cu
+    DEPS mixed_vector place memory device_context tensor)
+elseif(WITH_ROCM)
+  hip_test(
+    test_mixed_vector
+    SRCS test_mixed_vector.cc test_mixed_vector.cu
+    DEPS mixed_vector place memory device_context tensor)
+else()
+  cc_test(
+    test_mixed_vector
+    SRCS test_mixed_vector.cc
+    DEPS mixed_vector place memory device_context tensor)
+endif()
paddle/fluid/framework/mixed_vector_test.cc → paddle/phi/tests/core/test_mixed_vector.cc
...
...
@@ -12,7 +12,7 @@
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/phi/core/mixed_vector.h"
#include "glog/logging.h"
#include "gtest/gtest-message.h"
...
...
@@ -21,7 +21,7 @@
#include "gtest/gtest_pred_impl.h"
 template <typename T>
-using vec = paddle::framework::Vector<T>;
+using vec = phi::Vector<T>;

 TEST(mixed_vector, CPU_VECTOR) {
   vec<int> tmp;
...
...
@@ -44,7 +44,7 @@ TEST(mixed_vector, CPU_VECTOR) {
 }

 TEST(mixed_vector, InitWithCount) {
-  paddle::framework::Vector<int> vec(10, 10);
+  phi::Vector<int> vec(10, 10);
   for (int i = 0; i < 10; ++i) {
     ASSERT_EQ(vec[i], 10);
   }
...
...
@@ -58,7 +58,7 @@ TEST(mixed_vector, ForEach) {
 }

 TEST(mixed_vector, Reserve) {
-  paddle::framework::Vector<int> vec;
+  phi::Vector<int> vec;
   vec.reserve(1);
   vec.push_back(0);
   vec.push_back(0);
...
...
@@ -66,7 +66,7 @@ TEST(mixed_vector, Reserve) {
 }

 TEST(mixed_vector, Resize) {
-  paddle::framework::Vector<int> vec;
+  phi::Vector<int> vec;
   vec.resize(1);
   vec.push_back(0);
   vec.push_back(0);
...
...
paddle/fluid/framework/mixed_vector_test.cu → paddle/phi/tests/core/test_mixed_vector.cu
...
...
@@ -23,13 +23,14 @@
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/mixed_vector.h"

 template <typename T>
-using vec = paddle::framework::MixVector<T>;
-using gpuStream_t = paddle::gpuStream_t;
+using vec = phi::MixVector<T>;
+using gpuStream_t = phi::gpuStream_t;

 static __global__ void multiply_10(int* ptr) {
   for (int i = 0; i < 10; ++i) {
...
...
@@ -37,9 +38,9 @@ static __global__ void multiply_10(int* ptr) {
   }
 }

-gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
+gpuStream_t GetCUDAStream(phi::GPUPlace place) {
   return reinterpret_cast<const phi::GPUContext*>(
-             paddle::platform::DeviceContextPool::Instance().Get(place))
+             phi::DeviceContextPool::Instance().Get(place))
       ->stream();
 }
...
...
@@ -50,7 +51,7 @@ TEST(mixed_vector, GPU_VECTOR) {
   }
   vec<int> tmp(&x);
   ASSERT_EQ(tmp.size(), 10UL);
-  paddle::platform::CUDAPlace gpu(0);
+  phi::GPUPlace gpu(0);
 #ifdef PADDLE_WITH_HIP
   hipLaunchKernelGGL(multiply_10,
...
...
@@ -69,7 +70,7 @@ TEST(mixed_vector, GPU_VECTOR) {
 }

 TEST(mixed_vector, MultiGPU) {
-  if (paddle::platform::GetGPUDeviceCount() < 2) {
+  if (phi::backends::gpu::GetGPUDeviceCount() < 2) {
     LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple "
                     "GPUs in your machine.";
     return;
...
...
@@ -81,8 +82,8 @@ TEST(mixed_vector, MultiGPU) {
   }
   vec<int> tmp(&x);
   ASSERT_EQ(tmp.size(), 10UL);
-  paddle::platform::CUDAPlace gpu0(0);
-  paddle::platform::SetDeviceId(0);
+  phi::GPUPlace gpu0(0);
+  phi::backends::gpu::SetDeviceId(0);
 #ifdef PADDLE_WITH_HIP
   hipLaunchKernelGGL(multiply_10,
...
...
@@ -94,9 +95,9 @@ TEST(mixed_vector, MultiGPU) {
 #else
   multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
 #endif
-  paddle::platform::CUDAPlace gpu1(1);
+  phi::GPUPlace gpu1(1);
   auto* gpu1_ptr = tmp.MutableData(gpu1);
-  paddle::platform::SetDeviceId(1);
+  phi::backends::gpu::SetDeviceId(1);
 #ifdef PADDLE_WITH_HIP
   hipLaunchKernelGGL(
...
...
tools/parallel_UT_rule.py
...
...
@@ -913,7 +913,7 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [
     'test_mix_precision_all_reduce_fuse',
     'test_spp_op',
     'test_op_converter',
-    'mixed_vector_test',
+    'test_mixed_vector',
     'test_roi_align_op',
     'test_pad_constant_like',
     'test_mul_op',
...
...
@@ -2288,7 +2288,7 @@ TETRAD_PARALLEL_JOB = [
     'device_context_test',
     'test_reference_count_pass_last_lived_ops',
     'copy_same_tensor_test',
-    'mixed_vector_test',
+    'test_mixed_vector',
     'op_registry_test',
     'test_prepare_op',
     'data_device_transform_test',
...
...