PaddlePaddle / Paddle · Commit d7ccd6bf
Commit d7ccd6bf (unverified)
Author: xiongkun — authored Mar 18, 2022
Committer: GitHub — committed Mar 18, 2022
Parent: 8c713223

[phi] tranfer kthvalue from fluid to phi (#40676)

* tranfer kthvalue from fluid to phi
* transfer infershape
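For orientation (not part of the diff): kthvalue returns the k-th smallest value along an axis, together with the index it came from. A minimal standalone C++ sketch of that selection rule, using the same std::nth_element call and NaN-last comparator as the CPU kernels in this commit; the input row and k here are illustrative only:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

// Select the k-th smallest element (1-based k) of one row, NaNs ordered
// last, mirroring the comparator used by getKthvalue below.
static std::pair<float, int> kthvalue_row(std::vector<float> row, int k) {
  std::vector<std::pair<float, int>> col_vec;
  for (int j = 0; j < static_cast<int>(row.size()); ++j)
    col_vec.emplace_back(row[j], j);
  std::nth_element(col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
                   [](const std::pair<float, int>& l,
                      const std::pair<float, int>& r) {
                     return (!std::isnan(static_cast<double>(l.first)) &&
                             std::isnan(static_cast<double>(r.first))) ||
                            (l.first < r.first);
                   });
  return col_vec[k - 1];
}

int main() {
  // Illustrative data: the 2nd smallest of {3, 1, 4, 1.5, 9} is 1.5 at index 3.
  auto res = kthvalue_row({3.f, 1.f, 4.f, 1.5f, 9.f}, 2);
  std::printf("value=%g index=%d\n", res.first, res.second);
  return 0;
}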
Showing 12 changed files with 822 additions and 620 deletions (+822 −620):

- paddle/fluid/operators/kthvalue_op.cc  +7 −61
- paddle/fluid/operators/kthvalue_op.cu  +0 −278
- paddle/fluid/operators/kthvalue_op.h  +0 −281
- paddle/phi/infermeta/unary.cc  +61 −0
- paddle/phi/infermeta/unary.h  +8 −0
- paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc  +168 −0
- paddle/phi/kernels/cpu/kthvalue_kernel.cc  +167 −0
- paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu  +70 −0
- paddle/phi/kernels/gpu/kthvalue_kernel.cu  +252 −0
- paddle/phi/kernels/kthvalue_grad_kernel.h  +30 −0
- paddle/phi/kernels/kthvalue_kernel.h  +30 −0
- paddle/phi/ops/compat/kthvalue_sig.cc  +29 −0
paddle/fluid/operators/kthvalue_op.cc  +7 −61

@@ -12,11 +12,12 @@
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/kthvalue_op.h"
 #include <memory>
 #include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/phi/infermeta/unary.h"

 namespace paddle {
 namespace operators {
@@ -25,54 +26,6 @@ class KthvalueOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kthvalue");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kthvalue");
-    OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "kthvalue");
-    auto input_dims = ctx->GetInputDim("X");
-    const int& dim_size = input_dims.size();
-    int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
-    PADDLE_ENFORCE_LT(axis, dim_size,
-                      paddle::platform::errors::InvalidArgument(
-                          "the axis must be [-%d, %d), but received %d .",
-                          dim_size, dim_size, axis));
-    PADDLE_ENFORCE_GE(axis, -dim_size,
-                      paddle::platform::errors::InvalidArgument(
-                          "the axis must be [-%d, %d), but received %d .",
-                          dim_size, dim_size, axis));
-    if (axis < 0) axis += dim_size;
-    int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
-    PADDLE_ENFORCE_GE(
-        k, 1, paddle::platform::errors::InvalidArgument(
-                  "the k in the kthvalue must >= 1, but received %d .", k));
-    PADDLE_ENFORCE_GE(input_dims.size(), 1,
-                      paddle::platform::errors::InvalidArgument(
-                          "input of kthvalue must have >= 1d shape"));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_GE(
-          input_dims[axis], k,
-          paddle::platform::errors::InvalidArgument(
-              "input of kthvalue must have >= %d columns in axis of %d", k,
-              axis));
-    }
-    bool keepdim = ctx->Attrs().Get<bool>("keepdim");
-    std::vector<int64_t> dimvec;
-    for (int64_t i = 0; i < axis; i++) {
-      dimvec.emplace_back(input_dims[i]);
-    }
-    if (keepdim) {
-      dimvec.emplace_back(static_cast<int64_t>(1));
-    }
-    for (int64_t i = axis + 1; i < dim_size; i++) {
-      dimvec.emplace_back(input_dims[i]);
-    }
-    framework::DDim dims = phi::make_ddim(dimvec);
-    ctx->SetOutputDim("Out", dims);
-    ctx->SetOutputDim("Indices", dims);
-    ctx->ShareLoD("X", "Out");
-    ctx->ShareLoD("X", "Indices");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -155,20 +108,13 @@ class KthvalueGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace operators
 }  // namespace paddle

+DECLARE_INFER_SHAPE_FUNCTOR(kthvalue, KthvalueInferShapeFunctor,
+                            PD_INFER_META(phi::KthvalueInferMeta));
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(kthvalue, ops::KthvalueOp, ops::KthvalueOpMaker,
                   ops::KthvalueGradOpMaker<paddle::framework::OpDesc>,
-                  ops::KthvalueGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(
-    kthvalue, ops::KthvalueCPUKernel<paddle::platform::CPUPlace, float>,
-    ops::KthvalueCPUKernel<paddle::platform::CPUPlace, double>,
-    ops::KthvalueCPUKernel<paddle::platform::CPUPlace, int32_t>,
-    ops::KthvalueCPUKernel<paddle::platform::CPUPlace, int64_t>);
+                  ops::KthvalueGradOpMaker<paddle::imperative::OpBase>,
+                  KthvalueInferShapeFunctor);
 REGISTER_OPERATOR(kthvalue_grad, ops::KthvalueOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    kthvalue_grad,
-    ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, float>,
-    ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, double>,
-    ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, int32_t>,
-    ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, int64_t>);
paddle/fluid/operators/kthvalue_op.cu  (deleted file, mode 100644 → 0, −278)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/kthvalue_op.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
#endif

namespace paddle {
namespace operators {

int getBlockSize(int col) {
  if (col > 512)
    return 1024;
  else if (col > 256 && col <= 512)
    return 512;
  else if (col > 128 && col <= 256)
    return 256;
  else if (col > 64 && col <= 128)
    return 128;
  else
    return 64;
}

template <typename T>
bool SortKthvalue(const platform::CUDADeviceContext& ctx,
                  const framework::Tensor* input_tensor,
                  const int64_t num_cols, const int64_t num_rows, const int k,
                  framework::Tensor* out_tensor,
                  framework::Tensor* indices_tensor) {
  auto cu_stream = ctx.stream();
  framework::Tensor input_indices;
  const std::vector<int64_t> dims = {num_rows, num_cols};
  auto dim = phi::make_ddim(dims);
  input_indices.Resize(dim);
  input_indices.mutable_data<int64_t>(ctx.GetPlace());
  size_t temp_storage_bytes = -1;
  int block_size = getBlockSize(num_cols);
  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
  unsigned int grid_size =
      num_rows < maxGridDimX ? static_cast<unsigned int>(num_rows)
                             : maxGridDimX;
  InitIndex<int64_t><<<grid_size, block_size, 0, cu_stream>>>(
      input_indices.data<int64_t>(), num_rows, num_cols);
  cub::CountingInputIterator<int64_t> counting_iter(0);
  cub::TransformInputIterator<int64_t, SegmentOffsetIter,
                              cub::CountingInputIterator<int64_t>>
      segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
  T* sorted_values_ptr;
  int64_t* sorted_indices_ptr;
  framework::Tensor temp_values, temp_indices;
  const T* input = input_tensor->data<T>();
  T* values = out_tensor->data<T>();
  int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
  temp_values.Resize(dim);
  temp_indices.Resize(dim);
  sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
  sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
  auto err = cub::DeviceSegmentedRadixSort::SortPairs(
      nullptr, temp_storage_bytes, input, sorted_values_ptr,
      input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
      num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
      cu_stream);
#ifdef __HIPCC__
  if (err != hipSuccess) {
    LOG(ERROR) << "KthvalueOP failed as could not launch "
                  "hipcub::DeviceSegmentedRadixSort::SortPairs, status: "
               << hipGetErrorString(err);
    return false;
  }
#else
  if (err != cudaSuccess) {
    LOG(ERROR) << "KthvalueOP failed as could not launch "
                  "cub::DeviceSegmentedRadixSort::SortPairs, status: "
               << cudaGetErrorString(err);
    return false;
  }
#endif
  framework::Tensor temp_storage;
  temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
  err = cub::DeviceSegmentedRadixSort::SortPairs(
      temp_storage.data<uint8_t>(), temp_storage_bytes, input,
      sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
      num_cols * num_rows, num_rows, segment_offsets_t,
      segment_offsets_t + 1, 0, sizeof(T) * 8, cu_stream);
#ifdef __HIPCC__
  if (err != hipSuccess) {
    LOG(ERROR) << "KthvalueOP failed as could not launch "
                  "hipcub::DeviceSegmentedRadixSort::SortPairs, "
               << temp_storage_bytes << ", status: "
               << hipGetErrorString(err);
    return false;
  }
#else
  if (err != cudaSuccess) {
    LOG(ERROR) << "KthvalueOP failed as could not launch "
                  "cub::DeviceSegmentedRadixSort::SortPairs, "
               << temp_storage_bytes << ", status: "
               << cudaGetErrorString(err);
    return false;
  }
#endif
  auto& dev = *ctx.eigen_device();
  const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, k - 1};
  const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, 1};
  auto e_indices = framework::EigenMatrix<int64_t>::From(*indices_tensor, dim);
  auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(
      static_cast<const framework::Tensor>(temp_indices));
  std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(1)};
  dim = phi::make_ddim(odims);
  auto e_values = framework::EigenMatrix<T>::From(*out_tensor, dim);
  auto e_tmp_values = framework::EigenMatrix<T>::From(
      static_cast<const framework::Tensor>(temp_values));
  EigenSlice<std::decay_t<decltype(dev)>, int64_t, 2>::Eval(
      dev, e_indices, e_tmp_indices, slice_indices, slice_sizes);
  EigenSlice<std::decay_t<decltype(dev)>, T, 2>::Eval(
      dev, e_values, e_tmp_values, slice_indices, slice_sizes);
  return true;
}

template <typename DeviceContext, typename T>
class KthvalueOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx.GetPlace()), true,
        platform::errors::InvalidArgument(
            "It must use CUDAPlace, you must check your device set."));
    auto* input = ctx.Input<framework::Tensor>("X");
    auto* output = ctx.Output<framework::Tensor>("Out");
    auto* indices = ctx.Output<framework::Tensor>("Indices");
    int k = static_cast<int>(ctx.Attr<int>("k"));
    int axis = static_cast<int>(ctx.Attr<int>("axis"));
    bool keepdim = static_cast<bool>(ctx.Attr<bool>("keepdim"));
    const auto& in_dims = input->dims();
    if (axis < 0) axis += in_dims.size();
    auto out_dims = output->dims();
    const T* input_data = input->data<T>();
    T* output_data = output->mutable_data<T>(ctx.GetPlace());
    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
    if (axis == in_dims.size() - 1) {
      const int64_t& input_height =
          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
      const int64_t& input_width = in_dims[in_dims.size() - 1];
      const auto& dev_ctx = ctx.cuda_device_context();
      PADDLE_ENFORCE_EQ(
          SortKthvalue<T>(dev_ctx, input, input_width, input_height, k,
                          output, indices),
          true,
          platform::errors::External("KthvalueOP: Error when use cub sorting"));
      return;
    } else {
      std::vector<int> trans;
      for (int i = 0; i < axis; i++) {
        trans.emplace_back(i);
      }
      trans.emplace_back(in_dims.size() - 1);
      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
        trans.emplace_back(i);
      }
      trans.emplace_back(axis);
      if (!keepdim) {
        std::vector<int> tmp_out_shape;
        for (int i = 0; i < axis; i++) {
          tmp_out_shape.emplace_back(in_dims[i]);
        }
        tmp_out_shape.emplace_back(1);
        for (int i = axis + 1; i < in_dims.size(); i++) {
          tmp_out_shape.emplace_back(in_dims[i]);
        }
        framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
        output->Resize(tmp_out_dims);
        indices->Resize(tmp_out_dims);
      }
      framework::DDim trans_dims(in_dims);
      framework::DDim trans_out_dims(in_dims);
      for (int i = 0; i < trans.size(); i++) {
        trans_dims[i] = in_dims[trans[i]];
        trans_out_dims[i] = in_dims[trans[i]];
      }
      trans_out_dims[in_dims.size() - 1] = 1;
      framework::Tensor trans_input;
      trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
      int ndims = trans.size();
      const auto& dev_ctx = ctx.cuda_device_context();
      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
                                                   &trans_input, trans);
      framework::Tensor trans_ind, trans_out;
      trans_ind.mutable_data<int64_t>(trans_out_dims, ctx.GetPlace());
      trans_out.mutable_data<T>(trans_out_dims, ctx.GetPlace());
      const int64_t input_height =
          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
      const int64_t input_width = trans_dims[trans_dims.size() - 1];
      PADDLE_ENFORCE_EQ(
          SortKthvalue<T>(dev_ctx, &trans_input, input_width, input_height,
                          k, &trans_out, &trans_ind),
          true,
          platform::errors::External("KthvalueOP: Error when use cub sorting"));
      TransCompute<platform::CUDADeviceContext, int64_t>(
          ndims, dev_ctx, trans_ind, indices, trans);
      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
                                                   output, trans);
      if (!keepdim) {
        output->Resize(out_dims);
        indices->Resize(out_dims);
      }
    }
  }
};

template <typename DeviceContext, typename T>
class KthvalueOpGradCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(context.GetPlace()), true,
        platform::errors::InvalidArgument(
            "It must use CUDAPlace, you must check your device set."));
    auto* x = context.Input<framework::Tensor>("X");
    auto* out_grad =
        context.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* indices = context.Input<framework::Tensor>("Indices");
    auto* x_grad =
        context.Output<framework::Tensor>(framework::GradVarName("X"));
    int axis = context.Attr<int>("axis");
    int k = static_cast<int>(context.Attr<int>("k"));
    const auto& in_dims = x->dims();
    auto out_dims = indices->dims();
    if (axis < 0) axis += in_dims.size();
    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
    const T* out_grad_data = out_grad->data<T>();
    const int64_t* indices_data = indices->data<int64_t>();
    int pre, n, post;
    GetDims(in_dims, axis, &pre, &n, &post);
    auto& dev_ctx = context.cuda_device_context();
    int block_size = getBlockSize(post * k);
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
    const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
    int grid_size = std::min(max_blocks, pre);
    AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
        out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    kthvalue,
    ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
    ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, double>,
    ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, int>,
    ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
    kthvalue_grad,
    ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
    ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
    ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
    ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext,
                                  int64_t>);
paddle/fluid/operators/kthvalue_op.h  (deleted file, mode 100644 → 0, −281)

/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/transpose_op.h"

namespace paddle {
namespace operators {

template <typename T, typename Type>
static void getKthvalue(Type input_height, Type input_width, int input_dim,
                        const framework::Tensor* input, T* t_out,
                        Type* t_indices, const int& k) {
  bool partial_sort_flag = (k * 64) < input_width;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
  for (Type i = 0; i < input_height; ++i) {
    std::vector<std::pair<T, Type>> col_vec;
    col_vec.reserve(input_width);
    if (input_dim == 1) {
      auto e_input = framework::EigenVector<T>::Flatten(*input);
      for (Type j = 0; j < input_width; ++j) {
        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
      }
    } else {
      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
      for (Type j = 0; j < input_width; ++j) {
        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
      }
    }
    if (partial_sort_flag) {
      std::partial_sort(
          col_vec.begin(), col_vec.begin() + k, col_vec.end(),
          [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
            return (!std::isnan(static_cast<double>(l.first)) &&
                    std::isnan(static_cast<double>(r.first))) ||
                   (l.first < r.first);
          });
    } else {
      std::nth_element(
          col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
          [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
            return (!std::isnan(static_cast<double>(l.first)) &&
                    std::isnan(static_cast<double>(r.first))) ||
                   (l.first < r.first);
          });
    }
    t_out[i] = col_vec[k - 1].first;
    t_indices[i] = col_vec[k - 1].second;
  }
}

template <typename T, typename Type>
static void kthvalueAssign(const Type& input_height, const Type& input_width,
                           const int& input_dim,
                           const framework::Tensor* input,
                           const framework::Tensor* indices, T* output_data) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
  for (Type i = 0; i < input_height; ++i) {
    if (input_dim == 1) {
      auto e_input = framework::EigenVector<T>::Flatten(*input);
      auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
      output_data[i * input_width + e_indices(0)] = e_input(0);
    } else {
      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
      auto e_indices =
          framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
      output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
    }
  }
}

template <typename DeviceContext, typename T>
class KthvalueCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* input = context.Input<framework::Tensor>("X");
    auto* output = context.Output<framework::Tensor>("Out");
    auto* indices = context.Output<framework::Tensor>("Indices");
    const auto& in_dims = input->dims();
    int k = static_cast<int>(context.Attr<int>("k"));
    bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
    int axis = static_cast<int>(context.Attr<int>("axis"));
    if (axis < 0) axis += in_dims.size();
    T* output_data = output->mutable_data<T>(context.GetPlace());
    int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
    auto out_dims = output->dims();
    if (axis == in_dims.size() - 1) {
      const int64_t& input_height =
          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
      const int64_t& input_width = in_dims[in_dims.size() - 1];
      getKthvalue<T, int64_t>(input_height, input_width, in_dims.size(),
                              input, output_data, indices_data, k);
    } else {
      std::vector<int> trans;
      for (int i = 0; i < axis; i++) {
        trans.emplace_back(i);
      }
      trans.emplace_back(in_dims.size() - 1);
      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
        trans.emplace_back(i);
      }
      trans.emplace_back(axis);
      if (!keepdim) {
        std::vector<int> tmp_out_shape;
        for (int i = 0; i < axis; i++) {
          tmp_out_shape.emplace_back(in_dims[i]);
        }
        tmp_out_shape.emplace_back(1);
        for (int i = axis + 1; i < in_dims.size(); i++) {
          tmp_out_shape.emplace_back(in_dims[i]);
        }
        framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
        output->Resize(tmp_out_dims);
        indices->Resize(tmp_out_dims);
      }
      framework::DDim trans_dims(in_dims);
      framework::DDim trans_out_dims(in_dims);
      for (size_t i = 0; i < trans.size(); i++) {
        trans_dims[i] = in_dims[trans[i]];
        trans_out_dims[i] = in_dims[trans[i]];
      }
      trans_out_dims[in_dims.size() - 1] = 1;
      framework::Tensor trans_inp;
      trans_inp.mutable_data<T>(trans_dims, context.GetPlace());
      int ndims = trans.size();
      auto& dev_context =
          context.template device_context<platform::CPUDeviceContext>();
      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
                                                  &trans_inp, trans);
      const int64_t input_height =
          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
      const int64_t input_width = trans_dims[trans_dims.size() - 1];
      framework::Tensor tmp_out, tmp_indices;
      T* t_out = tmp_out.mutable_data<T>(trans_out_dims, context.GetPlace());
      auto* t_ind = tmp_indices.mutable_data<int64_t>(trans_out_dims,
                                                      context.GetPlace());
      getKthvalue<T, int64_t>(input_height, input_width, in_dims.size(),
                              &trans_inp, t_out, t_ind, k);
      TransCompute<platform::CPUDeviceContext, int64_t>(
          ndims, dev_context, tmp_indices, indices, trans);
      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
                                                  output, trans);
      if (!keepdim) {
        output->Resize(out_dims);
        indices->Resize(out_dims);
      }
    }
  }
};

template <typename DeviceContext, typename T>
class KthvalueGradCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<framework::Tensor>("X");
    auto* out_grad =
        context.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* indices = context.Input<framework::Tensor>("Indices");
    auto* x_grad =
        context.Output<framework::Tensor>(framework::GradVarName("X"));
    int axis = static_cast<int>(context.Attr<int>("axis"));
    bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
    auto in_dims = x->dims();
    auto out_dims = indices->dims();
    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
    if (!keepdim) {
      std::vector<int> tmp_out_shape;
      for (int i = 0; i < axis; i++) {
        tmp_out_shape.emplace_back(out_dims[i]);
      }
      tmp_out_shape.emplace_back(1);
      for (int i = axis + 1; i < in_dims.size(); i++) {
        tmp_out_shape.emplace_back(out_dims[i - 1]);
      }
      out_dims = phi::make_ddim(tmp_out_shape);
    }
    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
    if (axis == in_dims.size() - 1) {
      const int64_t input_height =
          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
      const int64_t input_width = in_dims[in_dims.size() - 1];
      memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
      if (keepdim) {
        kthvalueAssign(input_height, input_width, in_dims.size(), out_grad,
                       indices, x_grad_data);
      } else {
        auto& dev_context =
            context.template device_context<platform::CPUDeviceContext>();
        framework::Tensor out_grad_tmp, indices_tmp;
        out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
        indices_tmp.mutable_data<int64_t>(indices->dims(),
                                          dev_context.GetPlace());
        framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
                              &out_grad_tmp);
        framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
                              &indices_tmp);
        out_grad_tmp.Resize(out_dims);
        indices_tmp.Resize(out_dims);
        kthvalueAssign(input_height, input_width, in_dims.size(),
                       &out_grad_tmp, &indices_tmp, x_grad_data);
      }
    } else {
      std::vector<int> trans;
      for (int i = 0; i < axis; i++) {
        trans.emplace_back(i);
      }
      trans.emplace_back(out_dims.size() - 1);
      for (int i = axis + 1; i < out_dims.size() - 1; i++) {
        trans.emplace_back(i);
      }
      trans.emplace_back(axis);
      framework::DDim trans_dims(out_dims);
      framework::DDim trans_in_dims(in_dims);
      for (size_t i = 0; i < trans.size(); i++) {
        trans_dims[i] = out_dims[trans[i]];
        trans_in_dims[i] = in_dims[trans[i]];
      }
      framework::Tensor trans_dO, trans_ind;
      trans_dO.mutable_data<T>(trans_dims, context.GetPlace());
      trans_ind.mutable_data<int64_t>(trans_dims, context.GetPlace());
      int ndims = trans.size();
      auto& dev_context =
          context.template device_context<platform::CPUDeviceContext>();
      if (keepdim) {
        TransCompute<platform::CPUDeviceContext, T>(
            ndims, dev_context, *out_grad, &trans_dO, trans);
        TransCompute<platform::CPUDeviceContext, int64_t>(
            ndims, dev_context, *indices, &trans_ind, trans);
      } else {
        framework::Tensor out_grad_tmp, indices_tmp;
        out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
        indices_tmp.mutable_data<int64_t>(indices->dims(),
                                          dev_context.GetPlace());
        framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
                              &out_grad_tmp);
        framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
                              &indices_tmp);
        out_grad_tmp.Resize(out_dims);
        indices_tmp.Resize(out_dims);
        TransCompute<platform::CPUDeviceContext, T>(
            ndims, dev_context, out_grad_tmp, &trans_dO, trans);
        TransCompute<platform::CPUDeviceContext, int64_t>(
            ndims, dev_context, indices_tmp, &trans_ind, trans);
      }
      const int64_t input_height = phi::product(
          phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
      const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
      framework::Tensor tmp_out;
      T* t_out = tmp_out.mutable_data<T>(trans_in_dims, context.GetPlace());
      memset(t_out, 0, x_grad->numel() * sizeof(T));
      kthvalueAssign<T, int64_t>(input_height, input_width, in_dims.size(),
                                 &trans_dO, &trans_ind, t_out);
      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
                                                  x_grad, trans);
    }
  }
};

}  // namespace operators
}  // namespace paddle
paddle/phi/infermeta/unary.cc  +61 −0

@@ -554,6 +554,67 @@ void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) {
   out->set_dtype(DataType::BOOL);
 }

+void KthvalueInferMeta(const MetaTensor& x,
+                       int k,
+                       int axis,
+                       bool keepdim,
+                       MetaTensor* out,
+                       MetaTensor* indices,
+                       MetaConfig config) {
+  auto input_dims = x.dims();
+  const int& dim_size = input_dims.size();
+  PADDLE_ENFORCE_LT(axis, dim_size,
+                    phi::errors::InvalidArgument(
+                        "the axis must be [-%d, %d), but received %d .",
+                        dim_size, dim_size, axis));
+  PADDLE_ENFORCE_GE(axis, -dim_size,
+                    phi::errors::InvalidArgument(
+                        "the axis must be [-%d, %d), but received %d .",
+                        dim_size, dim_size, axis));
+  if (axis < 0) axis += dim_size;
+  PADDLE_ENFORCE_GE(
+      k, 1,
+      phi::errors::InvalidArgument(
+          "the k in the kthvalue must >= 1, but received %d .", k));
+  PADDLE_ENFORCE_GE(input_dims.size(), 1,
+                    phi::errors::InvalidArgument(
+                        "input of kthvalue must have >= 1d shape"));
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_GE(
+        input_dims[axis], k,
+        phi::errors::InvalidArgument(
+            "input of kthvalue must have >= %d columns in axis of %d", k,
+            axis));
+  }
+  std::vector<int64_t> dimvec;
+  for (int64_t i = 0; i < axis; i++) {
+    dimvec.emplace_back(input_dims[i]);
+  }
+  if (keepdim) {
+    dimvec.emplace_back(static_cast<int64_t>(1));
+  }
+  for (int64_t i = axis + 1; i < dim_size; i++) {
+    dimvec.emplace_back(input_dims[i]);
+  }
+  DDim dims = phi::make_ddim(dimvec);
+  out->set_dims(dims);
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+  indices->set_dims(dims);
+  indices->share_lod(x);
+  indices->set_dtype(x.dtype());
+}
+
 void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) {
   auto dims = x.dims();
   auto n_dim = dims.size();
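A quick check of the shape rule KthvalueInferMeta implements (illustrative, not part of the diff): the reduced axis is dropped unless keepdim is set, in which case it is kept as 1. A small sketch that mirrors the dimvec loops on plain vectors:

#include <cstdio>
#include <vector>

// Mirror of the output-dim computation in KthvalueInferMeta.
std::vector<int64_t> kthvalue_out_dims(const std::vector<int64_t>& in,
                                       int axis, bool keepdim) {
  std::vector<int64_t> dimvec;
  for (int64_t i = 0; i < axis; i++) dimvec.emplace_back(in[i]);
  if (keepdim) dimvec.emplace_back(1);
  for (int64_t i = axis + 1; i < static_cast<int64_t>(in.size()); i++)
    dimvec.emplace_back(in[i]);
  return dimvec;
}

int main() {
  auto d1 = kthvalue_out_dims({2, 3, 4}, 1, false);  // -> {2, 4}
  auto d2 = kthvalue_out_dims({2, 3, 4}, 1, true);   // -> {2, 1, 4}
  std::printf("%zu %zu\n", d1.size(), d2.size());    // prints: 2 3
  return 0;
}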
paddle/phi/infermeta/unary.h  +8 −0

@@ -100,6 +100,14 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out);

 void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out);

+void KthvalueInferMeta(const MetaTensor& x,
+                       int k,
+                       int axis,
+                       bool keepdim,
+                       MetaTensor* out,
+                       MetaTensor* indices,
+                       MetaConfig = MetaConfig());
+
 void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out);

 void MaxPoolWithIndexInferMeta(const MetaTensor& x,
paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc  (new file, mode 0 → 100644, +168)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/kthvalue_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace phi {

template <typename T, typename Type>
static void kthvalueAssign(const Type& input_height, const Type& input_width,
                           const int& input_dim, const DenseTensor* input,
                           const DenseTensor* indices, T* output_data) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
  for (Type i = 0; i < input_height; ++i) {
    if (input_dim == 1) {
      auto e_input = EigenVector<T>::Flatten(*input);
      auto e_indices = EigenVector<Type>::Flatten(*indices);
      output_data[i * input_width + e_indices(0)] = e_input(0);
    } else {
      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
      auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
      output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
    }
  }
}

template <typename T, typename Context>
void KthvalueGradKernel(const Context& dev_ctx,
                        const DenseTensor& d_out,
                        const DenseTensor& x,
                        const DenseTensor& indices,
                        int k,
                        int axis,
                        bool keepdim,
                        DenseTensor* d_x) {
  auto in_dims = x.dims();
  auto out_dims = indices.dims();
  axis = (axis < 0) ? (in_dims.size() + axis) : axis;
  if (!keepdim) {
    std::vector<int> tmp_out_shape;
    for (int i = 0; i < axis; i++) {
      tmp_out_shape.emplace_back(out_dims[i]);
    }
    tmp_out_shape.emplace_back(1);
    for (int i = axis + 1; i < in_dims.size(); i++) {
      tmp_out_shape.emplace_back(out_dims[i - 1]);
    }
    out_dims = phi::make_ddim(tmp_out_shape);
  }
  T* x_grad_data = dev_ctx.template Alloc<T>(d_x);
  if (axis == in_dims.size() - 1) {
    const int64_t input_height =
        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
    const int64_t input_width = in_dims[in_dims.size() - 1];
    memset(x_grad_data, 0, d_x->numel() * sizeof(T));
    if (keepdim) {
      kthvalueAssign(input_height, input_width, in_dims.size(), &d_out,
                     &indices, x_grad_data);
    } else {
      DenseTensor out_grad_tmp, indices_tmp;
      out_grad_tmp.Resize(d_out.dims());
      indices_tmp.Resize(indices.dims());
      dev_ctx.template Alloc<T>(&out_grad_tmp);
      dev_ctx.template Alloc<int64_t>(&indices_tmp);
      Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp);
      Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
      out_grad_tmp.Resize(out_dims);
      indices_tmp.Resize(out_dims);
      kthvalueAssign(input_height, input_width, in_dims.size(), &out_grad_tmp,
                     &indices_tmp, x_grad_data);
    }
  } else {
    std::vector<int> trans;
    for (int i = 0; i < axis; i++) {
      trans.emplace_back(i);
    }
    trans.emplace_back(out_dims.size() - 1);
    for (int i = axis + 1; i < out_dims.size() - 1; i++) {
      trans.emplace_back(i);
    }
    trans.emplace_back(axis);
    DDim trans_dims(out_dims);
    DDim trans_in_dims(in_dims);
    for (size_t i = 0; i < trans.size(); i++) {
      trans_dims[i] = out_dims[trans[i]];
      trans_in_dims[i] = in_dims[trans[i]];
    }
    DenseTensor trans_dO, trans_ind;
    trans_dO.Resize(trans_dims);
    trans_ind.Resize(trans_dims);
    dev_ctx.template Alloc<T>(&trans_dO);
    dev_ctx.template Alloc<int64_t>(&trans_ind);
    int ndims = trans.size();
    if (keepdim) {
      funcs::TransCompute<phi::CPUContext, T>(ndims, dev_ctx, d_out,
                                              &trans_dO, trans);
      funcs::TransCompute<phi::CPUContext, int64_t>(ndims, dev_ctx, indices,
                                                    &trans_ind, trans);
    } else {
      DenseTensor out_grad_tmp, indices_tmp;
      out_grad_tmp.Resize(d_out.dims());
      indices_tmp.Resize(indices.dims());
      dev_ctx.template Alloc<T>(&out_grad_tmp);
      dev_ctx.template Alloc<int64_t>(&indices_tmp);
      Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp);
      Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
      out_grad_tmp.Resize(out_dims);
      indices_tmp.Resize(out_dims);
      funcs::TransCompute<phi::CPUContext, T>(ndims, dev_ctx, out_grad_tmp,
                                              &trans_dO, trans);
      funcs::TransCompute<phi::CPUContext, int64_t>(
          ndims, dev_ctx, indices_tmp, &trans_ind, trans);
    }
    const int64_t input_height = phi::product(
        phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
    const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
    DenseTensor tmp_out;
    tmp_out.Resize(trans_in_dims);
    T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
    memset(t_out, 0, d_x->numel() * sizeof(T));
    kthvalueAssign<T, int64_t>(input_height, input_width, in_dims.size(),
                               &trans_dO, &trans_ind, t_out);
    funcs::TransCompute<phi::CPUContext, T>(ndims, dev_ctx, tmp_out, d_x,
                                            trans);
  }
}

}  // namespace phi

PD_REGISTER_KERNEL(kthvalue_grad,
                   CPU,
                   ALL_LAYOUT,
                   phi::KthvalueGradKernel,
                   float,
                   double,
                   int,
                   int64_t) {}
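The backward rule this kernel implements is a scatter: d_x is zeroed, and each row's single gradient entry is routed back to the column recorded in Indices. A hypothetical flat-array version of the kthvalueAssign inner loop for the 2-D, last-axis case, just to make the indexing explicit:

#include <cstring>

// Hypothetical plain-array sketch of the scatter in kthvalueAssign: for
// each row i, the one incoming gradient d_out[i] goes to the column that
// produced the k-th value; everything else stays zero.
void kthvalue_grad_rows(const float* d_out, const long* indices, float* d_x,
                        long rows, long cols) {
  std::memset(d_x, 0, sizeof(float) * rows * cols);
  for (long i = 0; i < rows; ++i) {
    d_x[i * cols + indices[i]] = d_out[i];
  }
}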
paddle/phi/kernels/cpu/kthvalue_kernel.cc  (new file, mode 0 → 100644, +167)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/kthvalue_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace phi {

template <typename T, typename Type>
static void getKthvalue(Type input_height, Type input_width, int input_dim,
                        const DenseTensor* input, T* t_out, Type* t_indices,
                        const int& k) {
  bool partial_sort_flag = (k * 64) < input_width;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
  for (Type i = 0; i < input_height; ++i) {
    std::vector<std::pair<T, Type>> col_vec;
    col_vec.reserve(input_width);
    if (input_dim == 1) {
      auto e_input = EigenVector<T>::Flatten(*input);
      for (Type j = 0; j < input_width; ++j) {
        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
      }
    } else {
      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
      for (Type j = 0; j < input_width; ++j) {
        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
      }
    }
    if (partial_sort_flag) {
      std::partial_sort(
          col_vec.begin(), col_vec.begin() + k, col_vec.end(),
          [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
            return (!std::isnan(static_cast<double>(l.first)) &&
                    std::isnan(static_cast<double>(r.first))) ||
                   (l.first < r.first);
          });
    } else {
      std::nth_element(
          col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
          [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
            return (!std::isnan(static_cast<double>(l.first)) &&
                    std::isnan(static_cast<double>(r.first))) ||
                   (l.first < r.first);
          });
    }
    t_out[i] = col_vec[k - 1].first;
    t_indices[i] = col_vec[k - 1].second;
  }
}

template <typename T, typename Context>
void KthvalueKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    int k,
                    int axis,
                    bool keepdim,
                    DenseTensor* output,
                    DenseTensor* indices) {
  const auto& in_dims = x.dims();
  if (axis < 0) axis += in_dims.size();
  T* output_data = dev_ctx.template Alloc<T>(output);
  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
  auto out_dims = output->dims();
  if (axis == in_dims.size() - 1) {
    const int64_t& input_height =
        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
    const int64_t& input_width = in_dims[in_dims.size() - 1];
    getKthvalue<T, int64_t>(input_height, input_width, in_dims.size(), &x,
                            output_data, indices_data, k);
  } else {
    std::vector<int> trans;
    for (int i = 0; i < axis; i++) {
      trans.emplace_back(i);
    }
    trans.emplace_back(in_dims.size() - 1);
    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
      trans.emplace_back(i);
    }
    trans.emplace_back(axis);
    if (!keepdim) {
      std::vector<int> tmp_out_shape;
      for (int i = 0; i < axis; i++) {
        tmp_out_shape.emplace_back(in_dims[i]);
      }
      tmp_out_shape.emplace_back(1);
      for (int i = axis + 1; i < in_dims.size(); i++) {
        tmp_out_shape.emplace_back(in_dims[i]);
      }
      DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
      output->Resize(tmp_out_dims);
      indices->Resize(tmp_out_dims);
    }
    DDim trans_dims(in_dims);
    DDim trans_out_dims(in_dims);
    for (size_t i = 0; i < trans.size(); i++) {
      trans_dims[i] = in_dims[trans[i]];
      trans_out_dims[i] = in_dims[trans[i]];
    }
    trans_out_dims[in_dims.size() - 1] = 1;
    DenseTensor trans_inp;
    trans_inp.Resize(trans_dims);
    dev_ctx.template Alloc<T>(&trans_inp);
    int ndims = trans.size();
    funcs::TransCompute<phi::CPUContext, T>(ndims, dev_ctx, x, &trans_inp,
                                            trans);
    const int64_t input_height =
        phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
    const int64_t input_width = trans_dims[trans_dims.size() - 1];
    DenseTensor tmp_out, tmp_indices;
    tmp_out.Resize(trans_out_dims);
    T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
    tmp_indices.Resize(trans_out_dims);
    int64_t* t_ind = dev_ctx.template Alloc<int64_t>(&tmp_indices);
    getKthvalue<T, int64_t>(input_height, input_width, in_dims.size(),
                            &trans_inp, t_out, t_ind, k);
    funcs::TransCompute<phi::CPUContext, int64_t>(ndims, dev_ctx, tmp_indices,
                                                  indices, trans);
    funcs::TransCompute<phi::CPUContext, T>(ndims, dev_ctx, tmp_out, output,
                                            trans);
    if (!keepdim) {
      output->Resize(out_dims);
      indices->Resize(out_dims);
    }
  }
}

}  // namespace phi

PD_REGISTER_KERNEL(
    kthvalue, CPU, ALL_LAYOUT, phi::KthvalueKernel, float, double, int, int64_t) {}
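As in the fluid kernels it replaces, a non-trailing axis is handled by transposing it to the last position, reducing there, and applying the same trans permutation again on the way back (the permutation only swaps axis with the last dimension, so it is its own inverse). A small sketch of the permutation the loops above build; the rank-4 example is illustrative:

#include <cstdio>
#include <vector>

// Build the same axis permutation the kernel builds: move `axis` to the
// end by swapping it with the last dimension, keeping everything else.
std::vector<int> build_trans(int rank, int axis) {
  std::vector<int> trans;
  for (int i = 0; i < axis; i++) trans.emplace_back(i);
  trans.emplace_back(rank - 1);
  for (int i = axis + 1; i < rank - 1; i++) trans.emplace_back(i);
  trans.emplace_back(axis);
  return trans;
}

int main() {
  for (int d : build_trans(4, 1)) std::printf("%d ", d);  // prints: 0 3 2 1
  std::printf("\n");
  return 0;
}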
paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu  (new file, mode 0 → 100644, +70)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/kthvalue_grad_kernel.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"

namespace phi {

static int getBlockSize(int col) {
  if (col > 512)
    return 1024;
  else if (col > 256 && col <= 512)
    return 512;
  else if (col > 128 && col <= 256)
    return 256;
  else if (col > 64 && col <= 128)
    return 128;
  else
    return 64;
}

template <typename T, typename Context>
void KthvalueGradKernel(const Context& dev_ctx,
                        const DenseTensor& d_out,
                        const DenseTensor& x,
                        const DenseTensor& indices,
                        int k,
                        int axis,
                        bool keepdim,
                        DenseTensor* d_x) {
  const auto& in_dims = x.dims();
  auto out_dims = indices.dims();
  if (axis < 0) axis += in_dims.size();
  T* x_grad_data = dev_ctx.template Alloc<T>(d_x);
  const T* out_grad_data = d_out.data<T>();
  const int64_t* indices_data = indices.data<int64_t>();
  int pre, n, post;
  paddle::operators::GetDims(in_dims, axis, &pre, &n, &post);
  int block_size = getBlockSize(post * k);
  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
  const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
  int grid_size = std::min(max_blocks, pre);
  paddle::operators::AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4,
                                             dev_ctx.stream()>>>(
      out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
}

}  // namespace phi

PD_REGISTER_KERNEL(kthvalue_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::KthvalueGradKernel,
                   float,
                   double,
                   int,
                   int64_t) {}
paddle/phi/kernels/gpu/kthvalue_kernel.cu  (new file, mode 0 → 100644, +252)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/kthvalue_kernel.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace phi {

inline int getBlockSize(int col) {
  if (col > 512)
    return 1024;
  else if (col > 256 && col <= 512)
    return 512;
  else if (col > 128 && col <= 256)
    return 256;
  else if (col > 64 && col <= 128)
    return 128;
  else
    return 64;
}

template <typename T>
bool SortKthvalue(const phi::GPUContext& dev_ctx,
                  const DenseTensor* input_tensor, const int64_t num_cols,
                  const int64_t num_rows, const int k,
                  DenseTensor* out_tensor, DenseTensor* indices_tensor) {
  auto cu_stream = dev_ctx.stream();
  DenseTensor input_indices;
  const std::vector<int64_t> dims = {num_rows, num_cols};
  auto dim = phi::make_ddim(dims);
  input_indices.Resize(dim);
  dev_ctx.template Alloc<int64_t>(&input_indices);
  size_t temp_storage_bytes = -1;
  int block_size = getBlockSize(num_cols);
  unsigned int maxGridDimX = dev_ctx.GetCUDAMaxGridDimSize()[0];
  unsigned int grid_size =
      num_rows < maxGridDimX ? static_cast<unsigned int>(num_rows)
                             : maxGridDimX;
  paddle::operators::InitIndex<int64_t><<<grid_size, block_size, 0,
                                          cu_stream>>>(
      input_indices.data<int64_t>(), num_rows, num_cols);
  cub::CountingInputIterator<int64_t> counting_iter(0);
  cub::TransformInputIterator<int64_t, paddle::operators::SegmentOffsetIter,
                              cub::CountingInputIterator<int64_t>>
      segment_offsets_t(counting_iter,
                        paddle::operators::SegmentOffsetIter(num_cols));
  T* sorted_values_ptr;
  int64_t* sorted_indices_ptr;
  DenseTensor temp_values, temp_indices;
  const T* input = input_tensor->data<T>();
  T* values = out_tensor->data<T>();
  int64_t* indices = indices_tensor->mutable_data<int64_t>(dev_ctx.GetPlace());
  temp_values.Resize(dim);
  temp_indices.Resize(dim);
  sorted_values_ptr = dev_ctx.template Alloc<T>(&temp_values);
  sorted_indices_ptr = dev_ctx.template Alloc<int64_t>(&temp_indices);
  auto err = cub::DeviceSegmentedRadixSort::SortPairs(
      nullptr, temp_storage_bytes, input, sorted_values_ptr,
      input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
      num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
      cu_stream);
#ifdef __HIPCC__
  if (err != hipSuccess) {
    LOG(ERROR) << "KthvalueOP failed as could not launch "
                  "hipcub::DeviceSegmentedRadixSort::SortPairs, status: "
               << hipGetErrorString(err);
    return false;
  }
#else
  if (err != cudaSuccess) {
    LOG(ERROR) << "KthvalueOP failed as could not launch "
                  "cub::DeviceSegmentedRadixSort::SortPairs, status: "
               << cudaGetErrorString(err);
    return false;
  }
#endif
  DenseTensor temp_storage;
  temp_storage.Resize({static_cast<int>(temp_storage_bytes / sizeof(uint8_t))});
  uint8_t* temp_storage_data = dev_ctx.template Alloc<uint8_t>(&temp_storage);
  err = cub::DeviceSegmentedRadixSort::SortPairs(
      temp_storage_data, temp_storage_bytes, input, sorted_values_ptr,
      input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
      num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
      cu_stream);
#ifdef __HIPCC__
  if (err != hipSuccess) {
    LOG(ERROR) << "KthvalueOP failed as could not launch "
                  "hipcub::DeviceSegmentedRadixSort::SortPairs, "
               << temp_storage_bytes << ", status: "
               << hipGetErrorString(err);
    return false;
  }
#else
  if (err != cudaSuccess) {
    LOG(ERROR) << "KthvalueOP failed as could not launch "
                  "cub::DeviceSegmentedRadixSort::SortPairs, "
               << temp_storage_bytes << ", status: "
               << cudaGetErrorString(err);
    return false;
  }
#endif
  auto& dev = *dev_ctx.eigen_device();
  const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, k - 1};
  const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, 1};
  auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
  auto e_tmp_indices = EigenMatrix<int64_t>::From(
      static_cast<const DenseTensor>(temp_indices));
  std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(1)};
  dim = phi::make_ddim(odims);
  auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
  auto e_tmp_values = EigenMatrix<T>::From(
      static_cast<const DenseTensor>(temp_values));
  funcs::EigenSlice<std::decay_t<decltype(dev)>, int64_t, 2>::Eval(
      dev, e_indices, e_tmp_indices, slice_indices, slice_sizes);
  funcs::EigenSlice<std::decay_t<decltype(dev)>, T, 2>::Eval(
      dev, e_values, e_tmp_values, slice_indices, slice_sizes);
  return true;
}

template <typename T, typename Context>
void KthvalueKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    int k,
                    int axis,
                    bool keepdim,
                    DenseTensor* output,
                    DenseTensor* indices) {
  const auto& in_dims = x.dims();
  if (axis < 0) axis += in_dims.size();
  auto out_dims = output->dims();
  const T* input_data = x.data<T>();
  T* output_data = dev_ctx.template Alloc<T>(output);
  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
  if (axis == in_dims.size() - 1) {
    const int64_t& input_height =
        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
    const int64_t& input_width = in_dims[in_dims.size() - 1];
    PADDLE_ENFORCE_EQ(
        SortKthvalue<T>(dev_ctx, &x, input_width, input_height, k, output,
                        indices),
        true,
        phi::errors::External("KthvalueOP: Error when use cub sorting"));
    return;
  } else {
    std::vector<int> trans;
    for (int i = 0; i < axis; i++) {
      trans.emplace_back(i);
    }
    trans.emplace_back(in_dims.size() - 1);
    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
      trans.emplace_back(i);
    }
    trans.emplace_back(axis);
    if (!keepdim) {
      std::vector<int> tmp_out_shape;
      for (int i = 0; i < axis; i++) {
        tmp_out_shape.emplace_back(in_dims[i]);
      }
      tmp_out_shape.emplace_back(1);
      for (int i = axis + 1; i < in_dims.size(); i++) {
        tmp_out_shape.emplace_back(in_dims[i]);
      }
      DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
      output->Resize(tmp_out_dims);
      indices->Resize(tmp_out_dims);
    }
    DDim trans_dims(in_dims);
    DDim trans_out_dims(in_dims);
    for (int i = 0; i < trans.size(); i++) {
      trans_dims[i] = in_dims[trans[i]];
      trans_out_dims[i] = in_dims[trans[i]];
    }
    trans_out_dims[in_dims.size() - 1] = 1;
    DenseTensor trans_input;
    trans_input.mutable_data<T>(trans_dims, dev_ctx.GetPlace());
    int ndims = trans.size();
    funcs::TransCompute<phi::GPUContext, T>(ndims, dev_ctx, x, &trans_input,
                                            trans);
    DenseTensor trans_ind, trans_out;
    trans_ind.mutable_data<int64_t>(trans_out_dims, dev_ctx.GetPlace());
    trans_out.mutable_data<T>(trans_out_dims, dev_ctx.GetPlace());
    const int64_t input_height =
        phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
    const int64_t input_width = trans_dims[trans_dims.size() - 1];
    PADDLE_ENFORCE_EQ(
        SortKthvalue<T>(dev_ctx, &trans_input, input_width, input_height, k,
                        &trans_out, &trans_ind),
        true,
        phi::errors::External("KthvalueOP: Error when use cub sorting"));
    funcs::TransCompute<phi::GPUContext, int64_t>(ndims, dev_ctx, trans_ind,
                                                  indices, trans);
    funcs::TransCompute<phi::GPUContext, T>(ndims, dev_ctx, trans_out, output,
                                            trans);
    if (!keepdim) {
      output->Resize(out_dims);
      indices->Resize(out_dims);
    }
  }
}

}  // namespace phi

PD_REGISTER_KERNEL(
    kthvalue, GPU, ALL_LAYOUT, phi::KthvalueKernel, float, double, int, int64_t) {}
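SortKthvalue above follows the standard two-phase CUB calling convention: the first SortPairs call with a null workspace pointer only writes temp_storage_bytes; the workspace is then allocated and the call is repeated to perform the actual segmented sort. A stripped-down sketch of that pattern; the buffer names and the raw cudaMalloc are illustrative (the kernel itself allocates through a DenseTensor):

#include <cub/cub.cuh>

// Two-phase CUB pattern, as used by SortKthvalue: query workspace size,
// allocate, then run the real sort. d_keys_in/out and d_vals_in/out are
// assumed to be valid device buffers of length num_items; d_offsets holds
// num_segments + 1 segment boundaries.
template <typename T>
cudaError_t segmented_sort_pairs(const T* d_keys_in, T* d_keys_out,
                                 const int64_t* d_vals_in,
                                 int64_t* d_vals_out, int num_items,
                                 int num_segments, const int* d_offsets,
                                 cudaStream_t stream) {
  size_t temp_bytes = 0;
  // Phase 1: nullptr workspace -> only temp_bytes is computed.
  cub::DeviceSegmentedRadixSort::SortPairs(
      nullptr, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
      num_items, num_segments, d_offsets, d_offsets + 1, 0, sizeof(T) * 8,
      stream);
  void* d_temp = nullptr;
  cudaError_t err = cudaMalloc(&d_temp, temp_bytes);
  if (err != cudaSuccess) return err;
  // Phase 2: the actual segmented radix sort.
  err = cub::DeviceSegmentedRadixSort::SortPairs(
      d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
      num_items, num_segments, d_offsets, d_offsets + 1, 0, sizeof(T) * 8,
      stream);
  cudaFree(d_temp);
  return err;
}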
paddle/phi/kernels/kthvalue_grad_kernel.h  (new file, mode 0 → 100644, +30)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include "paddle/phi/core/dense_tensor.h"

namespace phi {

template <typename T, typename Context>
void KthvalueGradKernel(const Context& dev_ctx,
                        const DenseTensor& d_out,
                        const DenseTensor& x,
                        const DenseTensor& indices,
                        int k,
                        int axis,
                        bool keepdim,
                        DenseTensor* d_x);

}  // namespace phi
paddle/phi/kernels/kthvalue_kernel.h  (new file, mode 0 → 100644, +30)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include "paddle/phi/core/dense_tensor.h"

namespace phi {

template <typename T, typename Context>
void KthvalueKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    int k,
                    int axis,
                    bool keepdim,
                    DenseTensor* out,
                    DenseTensor* indices);

}  // namespace phi
paddle/phi/ops/compat/kthvalue_sig.cc  (new file, mode 0 → 100644, +29)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/core/compat/op_utils.h"

namespace phi {

KernelSignature KthvalueGradOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  return KernelSignature("kthvalue_grad",
                         {GradVarName("Out"), "X", "Indices"},
                         {"k", "axis", "keepdim"},
                         {GradVarName("X")});
}

}  // namespace phi

PD_REGISTER_ARG_MAPPING_FN(kthvalue_grad, phi::KthvalueGradOpArgumentMapping);