Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
74894cd7
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
74894cd7
编写于
3月 31, 2022
作者:
C
csy0225
提交者:
GitHub
3月 31, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix conflict (#40851)
上级
e559fe41
变更
44
隐藏空白更改
内联
并排
Showing
44 changed file
with
2641 addition
and
684 deletion
+2641
-684
paddle/fluid/operators/range_op.cc
paddle/fluid/operators/range_op.cc
+8
-49
paddle/fluid/operators/range_op.cu
paddle/fluid/operators/range_op.cu
+0
-61
paddle/fluid/operators/range_op_npu_test.cc
paddle/fluid/operators/range_op_npu_test.cc
+1
-1
paddle/fluid/operators/stack_op.cc
paddle/fluid/operators/stack_op.cc
+8
-62
paddle/fluid/operators/stack_op.cu
paddle/fluid/operators/stack_op.cu
+0
-207
paddle/fluid/operators/stack_op_npu.cc
paddle/fluid/operators/stack_op_npu.cc
+1
-1
paddle/fluid/operators/stack_op_xpu.cc
paddle/fluid/operators/stack_op_xpu.cc
+1
-1
paddle/fluid/operators/unique_op.cc
paddle/fluid/operators/unique_op.cc
+37
-76
paddle/fluid/operators/unstack_op.cc
paddle/fluid/operators/unstack_op.cc
+8
-51
paddle/fluid/operators/unstack_op.h
paddle/fluid/operators/unstack_op.h
+0
-174
paddle/fluid/operators/unstack_op_npu.cc
paddle/fluid/operators/unstack_op_npu.cc
+1
-1
paddle/phi/core/utils/data_type.h
paddle/phi/core/utils/data_type.h
+21
-0
paddle/phi/infermeta/multiary.cc
paddle/phi/infermeta/multiary.cc
+46
-0
paddle/phi/infermeta/multiary.h
paddle/phi/infermeta/multiary.h
+4
-0
paddle/phi/infermeta/ternary.cc
paddle/phi/infermeta/ternary.cc
+50
-0
paddle/phi/infermeta/ternary.h
paddle/phi/infermeta/ternary.h
+5
-0
paddle/phi/infermeta/unary.cc
paddle/phi/infermeta/unary.cc
+126
-0
paddle/phi/infermeta/unary.h
paddle/phi/infermeta/unary.h
+28
-0
paddle/phi/kernels/cpu/range_kernel.cc
paddle/phi/kernels/cpu/range_kernel.cc
+45
-0
paddle/phi/kernels/cpu/stack_grad_kernel.cc
paddle/phi/kernels/cpu/stack_grad_kernel.cc
+59
-0
paddle/phi/kernels/cpu/stack_kernel.cc
paddle/phi/kernels/cpu/stack_kernel.cc
+62
-0
paddle/phi/kernels/cpu/unique_kernel.cc
paddle/phi/kernels/cpu/unique_kernel.cc
+131
-0
paddle/phi/kernels/cpu/unstack_grad_kernel.cc
paddle/phi/kernels/cpu/unstack_grad_kernel.cc
+27
-0
paddle/phi/kernels/cpu/unstack_kernel.cc
paddle/phi/kernels/cpu/unstack_kernel.cc
+22
-0
paddle/phi/kernels/funcs/range_function.h
paddle/phi/kernels/funcs/range_function.h
+49
-0
paddle/phi/kernels/funcs/stack_functor.h
paddle/phi/kernels/funcs/stack_functor.h
+83
-0
paddle/phi/kernels/funcs/unique_functor.h
paddle/phi/kernels/funcs/unique_functor.h
+426
-0
paddle/phi/kernels/gpu/range_kernel.cu
paddle/phi/kernels/gpu/range_kernel.cu
+57
-0
paddle/phi/kernels/gpu/stack_grad_kernel.cu
paddle/phi/kernels/gpu/stack_grad_kernel.cu
+143
-0
paddle/phi/kernels/gpu/stack_kernel.cu
paddle/phi/kernels/gpu/stack_kernel.cu
+113
-0
paddle/phi/kernels/gpu/unique_kernel.cu
paddle/phi/kernels/gpu/unique_kernel.cu
+615
-0
paddle/phi/kernels/gpu/unstack_grad_kernel.cu
paddle/phi/kernels/gpu/unstack_grad_kernel.cu
+29
-0
paddle/phi/kernels/gpu/unstack_kernel.cu
paddle/phi/kernels/gpu/unstack_kernel.cu
+29
-0
paddle/phi/kernels/impl/unstack_grad_kernel_impl.h
paddle/phi/kernels/impl/unstack_grad_kernel_impl.h
+70
-0
paddle/phi/kernels/impl/unstack_kernel_impl.h
paddle/phi/kernels/impl/unstack_kernel_impl.h
+62
-0
paddle/phi/kernels/range_kernel.h
paddle/phi/kernels/range_kernel.h
+28
-0
paddle/phi/kernels/stack_grad_kernel.h
paddle/phi/kernels/stack_grad_kernel.h
+27
-0
paddle/phi/kernels/stack_kernel.h
paddle/phi/kernels/stack_kernel.h
+27
-0
paddle/phi/kernels/unique_kernel.h
paddle/phi/kernels/unique_kernel.h
+48
-0
paddle/phi/kernels/unstack_grad_kernel.h
paddle/phi/kernels/unstack_grad_kernel.h
+27
-0
paddle/phi/kernels/unstack_kernel.h
paddle/phi/kernels/unstack_kernel.h
+28
-0
paddle/phi/ops/compat/stack_sig.cc
paddle/phi/ops/compat/stack_sig.cc
+23
-0
paddle/phi/ops/compat/unique_sig.cc
paddle/phi/ops/compat/unique_sig.cc
+42
-0
paddle/phi/ops/compat/unstack_sig.cc
paddle/phi/ops/compat/unstack_sig.cc
+24
-0
未找到文件。
paddle/fluid/operators/range_op.cc
浏览文件 @
74894cd7
...
@@ -14,6 +14,10 @@ limitations under the License. */
...
@@ -14,6 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/range_op.h"
#include "paddle/fluid/operators/range_op.h"
#include <string>
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/ternary.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -22,51 +26,6 @@ class RangeOp : public framework::OperatorWithKernel {
...
@@ -22,51 +26,6 @@ class RangeOp : public framework::OperatorWithKernel {
public:
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
if
(
ctx
->
HasInput
(
"Start"
))
{
auto
s_dims
=
ctx
->
GetInputDim
(
"Start"
);
PADDLE_ENFORCE_EQ
(
s_dims
.
size
(),
1
,
platform
::
errors
::
InvalidArgument
(
"The dim of the shape of Input(Start) should be 1, but got %d"
,
s_dims
.
size
()));
PADDLE_ENFORCE_EQ
(
s_dims
[
0
],
1
,
platform
::
errors
::
InvalidArgument
(
"The first dim of the shape of Input(Start) should "
"be 1, but got %d"
,
s_dims
[
0
]));
}
if
(
ctx
->
HasInput
(
"End"
))
{
auto
e_dims
=
ctx
->
GetInputDim
(
"End"
);
PADDLE_ENFORCE_EQ
(
e_dims
.
size
(),
1
,
platform
::
errors
::
InvalidArgument
(
"The dim of the shape of Input(End) should be 1, but got %d"
,
e_dims
.
size
()));
PADDLE_ENFORCE_EQ
(
e_dims
[
0
],
1
,
platform
::
errors
::
InvalidArgument
(
"The first dim of the shape of "
"Input(End) should be 1, but got %d"
,
e_dims
[
0
]));
}
if
(
ctx
->
HasInput
(
"Step"
))
{
auto
step_dims
=
ctx
->
GetInputDim
(
"Step"
);
PADDLE_ENFORCE_EQ
(
step_dims
.
size
(),
1
,
platform
::
errors
::
InvalidArgument
(
"The dim of the shape of Input(Step) should be 1, but got %d"
,
step_dims
.
size
()));
PADDLE_ENFORCE_EQ
(
step_dims
[
0
],
1
,
platform
::
errors
::
InvalidArgument
(
"The first dim of the shape of Input(Step) should "
"be 1, but got %d"
,
step_dims
[
0
]));
}
ctx
->
SetOutputDim
(
"Out"
,
{
-
1
});
}
protected:
protected:
framework
::
OpKernelType
GetKernelTypeForVar
(
framework
::
OpKernelType
GetKernelTypeForVar
(
const
std
::
string
&
var_name
,
const
framework
::
Tensor
&
tensor
,
const
std
::
string
&
var_name
,
const
framework
::
Tensor
&
tensor
,
...
@@ -101,7 +60,7 @@ class RangeOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -101,7 +60,7 @@ class RangeOpMaker : public framework::OpProtoAndCheckerMaker {
}
// namespace paddle
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_WITHOUT_GRADIENT
(
range
,
ops
::
RangeOp
,
ops
::
RangeOpMaker
);
DECLARE_INFER_SHAPE_FUNCTOR
(
range
,
RangeInferMetaFunctor
,
REGISTER_OP_CPU_KERNEL
(
range
,
ops
::
CPURangeKernel
<
int
>
,
PD_INFER_META
(
phi
::
RangeInferMeta
));
ops
::
CPURangeKernel
<
float
>
,
ops
::
CPURangeKernel
<
double
>
,
REGISTER_OP_WITHOUT_GRADIENT
(
range
,
ops
::
RangeOp
,
ops
::
RangeOpMaker
,
ops
::
CPURangeKernel
<
int64_t
>
);
RangeInferMetaFunctor
);
paddle/fluid/operators/range_op.cu
已删除
100644 → 0
浏览文件 @
e559fe41
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/range_op.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
__global__
void
RangeKernel
(
T
start
,
T
step
,
int64_t
size
,
T
*
out
)
{
CUDA_KERNEL_LOOP
(
index
,
size
)
{
out
[
index
]
=
start
+
step
*
index
;
}
}
template
<
typename
T
>
class
CUDARangeKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
start_t
=
context
.
Input
<
framework
::
Tensor
>
(
"Start"
);
auto
*
end_t
=
context
.
Input
<
framework
::
Tensor
>
(
"End"
);
auto
*
step_t
=
context
.
Input
<
framework
::
Tensor
>
(
"Step"
);
auto
*
out
=
context
.
Output
<
framework
::
Tensor
>
(
"Out"
);
T
start
=
GetValue
<
T
>
(
start_t
);
T
end
=
GetValue
<
T
>
(
end_t
);
T
step
=
GetValue
<
T
>
(
step_t
);
int64_t
size
=
0
;
GetSize
(
start
,
end
,
step
,
&
size
);
out
->
Resize
(
phi
::
make_ddim
({
size
}));
T
*
out_data
=
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
stream
=
context
.
cuda_device_context
().
stream
();
int
block
=
std
::
min
(
size
,
static_cast
<
int64_t
>
(
256
));
int
grid
=
(
size
+
block
-
1
)
/
block
;
RangeKernel
<
T
><<<
grid
,
block
,
0
,
stream
>>>
(
start
,
step
,
size
,
out_data
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
range
,
ops
::
CUDARangeKernel
<
int
>
,
ops
::
CUDARangeKernel
<
int64_t
>
,
ops
::
CUDARangeKernel
<
float
>
,
ops
::
CUDARangeKernel
<
double
>
);
paddle/fluid/operators/range_op_npu_test.cc
浏览文件 @
74894cd7
...
@@ -30,7 +30,7 @@ limitations under the License. */
...
@@ -30,7 +30,7 @@ limitations under the License. */
namespace
f
=
paddle
::
framework
;
namespace
f
=
paddle
::
framework
;
namespace
p
=
paddle
::
platform
;
namespace
p
=
paddle
::
platform
;
USE_OP
(
range
);
USE_OP
_ITSELF
(
range
);
USE_OP_DEVICE_KERNEL
(
range
,
NPU
);
USE_OP_DEVICE_KERNEL
(
range
,
NPU
);
template
<
typename
T
>
template
<
typename
T
>
...
...
paddle/fluid/operators/stack_op.cc
浏览文件 @
74894cd7
...
@@ -12,9 +12,12 @@
...
@@ -12,9 +12,12 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/operators/stack_op.h"
#include <memory>
#include <memory>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace
plat
=
paddle
::
platform
;
namespace
plat
=
paddle
::
platform
;
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
...
@@ -26,52 +29,6 @@ class StackOp : public framework::OperatorWithKernel {
...
@@ -26,52 +29,6 @@ class StackOp : public framework::OperatorWithKernel {
public:
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE_GT
(
ctx
->
Inputs
(
"X"
).
size
(),
0
,
platform
::
errors
::
InvalidArgument
(
"Number of Inputs(X) must be larger than 0, but"
" received value is:%d."
,
ctx
->
Inputs
(
"X"
).
size
()));
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
"Y"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(Y) of stack_op should not be null."
));
auto
input_dims
=
ctx
->
GetInputsDim
(
"X"
);
for
(
size_t
i
=
1
;
i
<
input_dims
.
size
();
++
i
)
{
PADDLE_ENFORCE_EQ
(
input_dims
[
i
],
input_dims
[
0
],
platform
::
errors
::
InvalidArgument
(
"Dims of all Inputs(X) must be the same, but"
" received input %d dim is:%d not equal to input 0"
" dim:%d."
,
i
,
input_dims
[
i
],
input_dims
[
0
]));
}
// Only lod of X[0] would be shared with Y
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Y"
);
int
axis
=
ctx
->
Attrs
().
Get
<
int
>
(
"axis"
);
int
rank
=
input_dims
[
0
].
size
();
PADDLE_ENFORCE_GE
(
axis
,
-
(
rank
+
1
),
platform
::
errors
::
InvalidArgument
(
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, "
"but received axis is:%d."
,
rank
,
axis
));
PADDLE_ENFORCE_LT
(
axis
,
rank
+
1
,
platform
::
errors
::
InvalidArgument
(
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, "
"but received axis is:%d"
,
rank
,
axis
));
if
(
axis
<
0
)
axis
+=
(
rank
+
1
);
auto
vec
=
phi
::
vectorize
<
int
>
(
input_dims
[
0
]);
vec
.
insert
(
vec
.
begin
()
+
axis
,
input_dims
.
size
());
ctx
->
SetOutputDim
(
"Y"
,
phi
::
make_ddim
(
vec
));
}
framework
::
OpKernelType
GetExpectedKernelType
(
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
input_data_type
=
auto
input_data_type
=
...
@@ -168,21 +125,10 @@ class StackGradOpMaker : public framework::SingleGradOpMaker<T> {
...
@@ -168,21 +125,10 @@ class StackGradOpMaker : public framework::SingleGradOpMaker<T> {
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
DECLARE_INFER_SHAPE_FUNCTOR
(
stack
,
StackInferMetaFunctor
,
PD_INFER_META
(
phi
::
StackInferMeta
));
REGISTER_OPERATOR
(
stack
,
ops
::
StackOp
,
ops
::
StackOpMaker
,
REGISTER_OPERATOR
(
stack
,
ops
::
StackOp
,
ops
::
StackOpMaker
,
ops
::
StackGradOpMaker
<
paddle
::
framework
::
OpDesc
>
,
ops
::
StackGradOpMaker
<
paddle
::
framework
::
OpDesc
>
,
ops
::
StackGradOpMaker
<
paddle
::
imperative
::
OpBase
>
);
ops
::
StackGradOpMaker
<
paddle
::
imperative
::
OpBase
>
,
StackInferMetaFunctor
);
REGISTER_OPERATOR
(
stack_grad
,
ops
::
StackOpGrad
);
REGISTER_OPERATOR
(
stack_grad
,
ops
::
StackOpGrad
);
REGISTER_OP_CPU_KERNEL
(
stack
,
ops
::
StackKernel
<
plat
::
CPUDeviceContext
,
float
>
,
ops
::
StackKernel
<
plat
::
CPUDeviceContext
,
double
>
,
ops
::
StackKernel
<
plat
::
CPUDeviceContext
,
int
>
,
ops
::
StackKernel
<
plat
::
CPUDeviceContext
,
int64_t
>
,
ops
::
StackKernel
<
plat
::
CPUDeviceContext
,
paddle
::
platform
::
bfloat16
>
);
REGISTER_OP_CPU_KERNEL
(
stack_grad
,
ops
::
StackGradKernel
<
plat
::
CPUDeviceContext
,
float
>
,
ops
::
StackGradKernel
<
plat
::
CPUDeviceContext
,
double
>
,
ops
::
StackGradKernel
<
plat
::
CPUDeviceContext
,
int
>
,
ops
::
StackGradKernel
<
plat
::
CPUDeviceContext
,
int64_t
>
,
ops
::
StackGradKernel
<
plat
::
CPUDeviceContext
,
paddle
::
platform
::
bfloat16
>
);
paddle/fluid/operators/stack_op.cu
已删除
100644 → 0
浏览文件 @
e559fe41
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <limits>
#include <vector>
#include "paddle/fluid/operators/stack_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
namespace
plat
=
paddle
::
platform
;
namespace
ops
=
paddle
::
operators
;
namespace
paddle
{
namespace
operators
{
template
<
typename
T
,
typename
IntType
>
__global__
void
StackCUDAKernel
(
T
**
input_ptrs
,
int
split_size
,
int
rows
,
int
cols
,
T
*
__restrict__
output
)
{
IntType
grid_x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
grid_x
<
cols
;
grid_x
+=
blockDim
.
x
*
gridDim
.
x
)
{
IntType
grid_y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
IntType
split
=
grid_x
/
split_size
;
const
T
*
input_ptr
=
input_ptrs
[
split
];
IntType
col_offset
=
grid_x
%
split_size
;
#pragma unroll
for
(;
grid_y
<
rows
;
grid_y
+=
blockDim
.
y
*
gridDim
.
y
)
{
output
[
grid_y
*
cols
+
grid_x
]
=
input_ptr
[
grid_y
*
split_size
+
col_offset
];
}
}
}
template
<
typename
T
>
class
StackGPUKernel
:
public
framework
::
OpKernel
<
T
>
{
using
Tensor
=
framework
::
LoDTensor
;
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
x
=
ctx
.
MultiInput
<
Tensor
>
(
"X"
);
auto
*
y
=
ctx
.
Output
<
Tensor
>
(
"Y"
);
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
if
(
axis
<
0
)
axis
+=
(
x
[
0
]
->
dims
().
size
()
+
1
);
int
n
=
static_cast
<
int
>
(
x
.
size
());
auto
*
y_data
=
y
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
std
::
vector
<
const
T
*>
x_datas
(
n
);
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
x_datas
[
i
]
=
x
[
i
]
->
data
<
T
>
();
}
auto
&
dev_ctx
=
ctx
.
template
device_context
<
plat
::
CUDADeviceContext
>();
auto
tmp_x_data
=
memory
::
Alloc
(
dev_ctx
,
x_datas
.
size
()
*
sizeof
(
T
*
));
memory
::
Copy
(
dev_ctx
.
GetPlace
(),
tmp_x_data
->
ptr
(),
platform
::
CPUPlace
(),
reinterpret_cast
<
void
*>
(
x_datas
.
data
()),
x_datas
.
size
()
*
sizeof
(
T
*
),
dev_ctx
.
stream
());
// Split x dim from axis to matrix
int
x_row
=
1
,
x_col
=
1
;
for
(
int
i
=
0
;
i
<
axis
;
++
i
)
{
x_row
*=
x
[
0
]
->
dims
()[
i
];
}
x_col
=
x
[
0
]
->
numel
()
/
x_row
;
int
out_col
=
x_col
*
n
;
auto
config
=
GetGpuLaunchConfig2D
(
dev_ctx
,
out_col
,
x_row
);
if
(
y
->
numel
()
<
std
::
numeric_limits
<
int32_t
>::
max
())
{
StackCUDAKernel
<
T
,
int32_t
><<<
config
.
block_per_grid
,
config
.
thread_per_block
,
0
,
dev_ctx
.
stream
()
>>>
(
reinterpret_cast
<
T
**>
(
tmp_x_data
->
ptr
()),
x_col
,
x_row
,
out_col
,
y_data
);
}
else
{
StackCUDAKernel
<
T
,
int64_t
><<<
config
.
block_per_grid
,
config
.
thread_per_block
,
0
,
dev_ctx
.
stream
()
>>>
(
reinterpret_cast
<
T
**>
(
tmp_x_data
->
ptr
()),
x_col
,
x_row
,
out_col
,
y_data
);
}
}
};
template
<
typename
T
,
typename
IntType
>
__global__
void
UnStackHelperCUDAKernel
(
const
T
*
__restrict__
input
,
int
pre_dim_size
,
int
split_dim_size
,
int
suf_dim_size
,
int
num_split
,
T
**
output_ptrs
)
{
assert
(
blockDim
.
y
==
1
);
assert
(
blockDim
.
z
==
1
);
// In this case they are equal
assert
(
split_dim_size
%
num_split
==
0
);
IntType
size
=
pre_dim_size
*
split_dim_size
*
suf_dim_size
;
IntType
each_dim_size
=
split_dim_size
/
num_split
;
for
(
IntType
offset
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
offset
<
size
;
offset
+=
blockDim
.
x
*
gridDim
.
x
)
{
IntType
i
=
offset
/
(
split_dim_size
*
suf_dim_size
);
IntType
j
=
(
offset
%
(
split_dim_size
*
suf_dim_size
))
/
suf_dim_size
;
IntType
k
=
offset
%
suf_dim_size
;
T
*
output
=
output_ptrs
[
j
/
each_dim_size
];
if
(
output
==
nullptr
)
{
return
;
}
IntType
output_ind
=
i
*
each_dim_size
*
suf_dim_size
+
(
j
%
each_dim_size
)
*
suf_dim_size
+
k
;
*
(
output
+
output_ind
)
=
input
[
offset
];
}
}
template
<
typename
T
>
class
StackGradGPUKernel
:
public
framework
::
OpKernel
<
T
>
{
using
Tensor
=
framework
::
LoDTensor
;
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
dy
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
auto
dx
=
ctx
.
MultiOutput
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
if
(
axis
<
0
)
axis
+=
dy
->
dims
().
size
();
int
n
=
dy
->
dims
()[
axis
];
PADDLE_ENFORCE_EQ
(
n
,
dx
.
size
(),
platform
::
errors
::
InvalidArgument
(
"Output dx size should be equal to n, but"
" received n is:%d dx size is:%d."
,
n
,
dx
.
size
()));
// dx is output, so save each data address, then copy each dy into dx_data
std
::
vector
<
T
*>
outputs
(
n
);
auto
out_var_names
=
ctx
.
OutputNames
(
framework
::
GradVarName
(
"X"
));
for
(
size_t
j
=
0
;
j
<
dx
.
size
();
++
j
)
{
if
(
dx
[
j
]
==
nullptr
)
{
outputs
[
j
]
=
nullptr
;
}
if
(
out_var_names
[
j
]
!=
framework
::
kEmptyVarName
&&
dx
[
j
]
->
numel
()
!=
0UL
)
{
T
*
ptr
=
dx
[
j
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
outputs
[
j
]
=
ptr
;
}
else
{
outputs
[
j
]
=
nullptr
;
}
}
auto
dy_data
=
dy
->
data
<
T
>
();
// each dx should have same shape
int
dy_pre
=
1
,
dy_suf
=
1
;
auto
dy_dims
=
dy
->
dims
();
int
split_dim
=
n
;
for
(
int
i
=
0
;
i
<
axis
;
++
i
)
{
dy_pre
*=
dy_dims
[
i
];
}
dy_suf
=
dy
->
numel
()
/
(
split_dim
*
dy_pre
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
plat
::
CUDADeviceContext
>();
auto
tmp_out_data
=
memory
::
Alloc
(
dev_ctx
,
outputs
.
size
()
*
sizeof
(
T
*
));
memory
::
Copy
(
dev_ctx
.
GetPlace
(),
tmp_out_data
->
ptr
(),
platform
::
CPUPlace
(),
reinterpret_cast
<
void
*>
(
outputs
.
data
()),
outputs
.
size
()
*
sizeof
(
T
*
),
dev_ctx
.
stream
());
auto
config
=
GetGpuLaunchConfig1D
(
dev_ctx
,
dy_pre
*
split_dim
*
dy_suf
);
if
(
dy
->
numel
()
<
std
::
numeric_limits
<
int32_t
>::
max
())
{
UnStackHelperCUDAKernel
<
T
,
int32_t
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
dy_data
,
dy_pre
,
split_dim
,
dy_suf
,
split_dim
,
reinterpret_cast
<
T
**>
(
tmp_out_data
->
ptr
()));
}
else
{
UnStackHelperCUDAKernel
<
T
,
int64_t
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
dy_data
,
dy_pre
,
split_dim
,
dy_suf
,
split_dim
,
reinterpret_cast
<
T
**>
(
tmp_out_data
->
ptr
()));
}
}
};
}
// namespace operators
}
// namespace paddle
REGISTER_OP_CUDA_KERNEL
(
stack
,
ops
::
StackGPUKernel
<
float
>
,
ops
::
StackGPUKernel
<
double
>
,
ops
::
StackGPUKernel
<
int
>
,
ops
::
StackGPUKernel
<
int64_t
>
,
ops
::
StackGPUKernel
<
plat
::
float16
>
,
ops
::
StackGPUKernel
<
plat
::
bfloat16
>
);
REGISTER_OP_CUDA_KERNEL
(
stack_grad
,
ops
::
StackGradGPUKernel
<
float
>
,
ops
::
StackGradGPUKernel
<
double
>
,
ops
::
StackGradGPUKernel
<
int
>
,
ops
::
StackGradGPUKernel
<
int64_t
>
,
ops
::
StackGradGPUKernel
<
plat
::
float16
>
,
ops
::
StackGradGPUKernel
<
plat
::
bfloat16
>
);
paddle/fluid/operators/stack_op_npu.cc
浏览文件 @
74894cd7
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/
operators/stack_op
.h"
#include "paddle/fluid/
framework/op_registry
.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace
paddle
{
namespace
paddle
{
...
...
paddle/fluid/operators/stack_op_xpu.cc
浏览文件 @
74894cd7
...
@@ -13,9 +13,9 @@
...
@@ -13,9 +13,9 @@
// limitations under the License.
// limitations under the License.
#ifdef PADDLE_WITH_XPU
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/stack_op.h"
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/concat_op.h"
#include "paddle/fluid/operators/concat_op.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
...
...
paddle/fluid/operators/unique_op.cc
浏览文件 @
74894cd7
...
@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
...
@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/unique_op.h"
#include "paddle/fluid/operators/unique_op.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -25,62 +29,54 @@ class UniqueOp : public framework::OperatorWithKernel {
...
@@ -25,62 +29,54 @@ class UniqueOp : public framework::OperatorWithKernel {
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"X"
),
"Input"
,
"X"
,
"unique"
);
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"X"
),
"Input"
,
"X"
,
"unique"
);
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"Out"
),
"Output"
,
"Out"
,
"unique"
);
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"Out"
),
"Output"
,
"Out"
,
"unique"
);
auto
in_dims
=
ctx
->
GetInputDim
(
"X"
);
if
(
!
ctx
->
Attrs
().
Get
<
bool
>
(
"is_sorted"
))
{
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"Index"
),
"Output"
,
"Index"
,
"unique"
);
PADDLE_ENFORCE_EQ
(
in_dims
.
size
(),
1
,
platform
::
errors
::
InvalidArgument
(
"The Input(X) should be 1-D Tensor, "
"But now the dims of Input(X) is %d."
,
in_dims
.
size
()));
ctx
->
SetOutputDim
(
"Out"
,
{
-
1
});
ctx
->
SetOutputDim
(
"Index"
,
in_dims
);
return
;
}
bool
return_index
=
ctx
->
Attrs
().
Get
<
bool
>
(
"return_index"
);
bool
return_index
=
ctx
->
Attrs
().
Get
<
bool
>
(
"return_index"
);
bool
return_inverse
=
ctx
->
Attrs
().
Get
<
bool
>
(
"return_inverse"
);
bool
return_inverse
=
ctx
->
Attrs
().
Get
<
bool
>
(
"return_inverse"
);
bool
return_counts
=
ctx
->
Attrs
().
Get
<
bool
>
(
"return_counts"
);
bool
return_counts
=
ctx
->
Attrs
().
Get
<
bool
>
(
"return_counts"
);
auto
axis_vec
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"axis"
);
auto
axis_vec
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"axis"
);
auto
data_type
=
static_cast
<
phi
::
DataType
>
(
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
ctx
->
Attrs
().
Get
<
int
>
(
"dtype"
)));
// Construct MetaTensor for InferMeta Func
using
CompatMetaTensor
=
framework
::
CompatMetaTensor
;
CompatMetaTensor
x
(
ctx
->
GetInputVarPtrs
(
"X"
)[
0
],
ctx
->
IsRuntime
());
CompatMetaTensor
out
(
ctx
->
GetOutputVarPtrs
(
"Out"
)[
0
],
ctx
->
IsRuntime
());
std
::
unique_ptr
<
CompatMetaTensor
>
indices
(
nullptr
);
std
::
unique_ptr
<
CompatMetaTensor
>
index
(
nullptr
);
std
::
unique_ptr
<
CompatMetaTensor
>
counts
(
nullptr
);
if
(
return_index
)
{
if
(
return_index
)
{
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"Indices"
),
"Output"
,
"Indices"
,
"unique"
);
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"Indices"
),
"Output"
,
"Indices"
,
"unique"
);
indices
=
std
::
move
(
std
::
unique_ptr
<
CompatMetaTensor
>
(
new
CompatMetaTensor
(
ctx
->
GetOutputVarPtrs
(
"Indices"
)[
0
],
ctx
->
IsRuntime
())));
}
}
if
(
return_inverse
)
{
if
(
return_inverse
)
{
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"Index"
),
"Output"
,
"Index"
,
"unique"
);
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"Index"
),
"Output"
,
"Index"
,
"unique"
);
index
=
std
::
move
(
std
::
unique_ptr
<
CompatMetaTensor
>
(
new
CompatMetaTensor
(
ctx
->
GetOutputVarPtrs
(
"Index"
)[
0
],
ctx
->
IsRuntime
())));
}
}
if
(
return_counts
)
{
if
(
return_counts
)
{
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"Counts"
),
"Output"
,
"Counts"
,
"unique"
);
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"Counts"
),
"Output"
,
"Counts"
,
"unique"
);
counts
=
std
::
move
(
std
::
unique_ptr
<
CompatMetaTensor
>
(
new
CompatMetaTensor
(
ctx
->
GetOutputVarPtrs
(
"Counts"
)[
0
],
ctx
->
IsRuntime
())));
}
}
bool
is_sorted
=
ctx
->
Attrs
().
Get
<
bool
>
(
"is_sorted"
);
if
(
axis_vec
.
empty
())
{
if
(
is_sorted
)
{
ctx
->
SetOutputDim
(
"Out"
,
{
-
1
});
phi
::
UniqueInferMeta
(
x
,
return_index
,
return_inverse
,
return_counts
,
if
(
return_inverse
)
{
axis_vec
,
data_type
,
&
out
,
indices
.
get
(),
ctx
->
SetOutputDim
(
"Index"
,
{
phi
::
product
(
in_dims
)});
index
.
get
(),
counts
.
get
());
}
}
else
{
}
else
{
int
axis
=
axis_vec
[
0
];
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"Index"
),
"Output"
,
"Index"
,
"unique"
);
if
(
axis
<
0
)
{
if
(
index
==
nullptr
)
{
axis
+=
in_dims
.
size
();
index
=
}
std
::
move
(
std
::
unique_ptr
<
CompatMetaTensor
>
(
new
CompatMetaTensor
(
PADDLE_ENFORCE_LT
(
ctx
->
GetOutputVarPtrs
(
"Index"
)[
0
],
ctx
->
IsRuntime
())));
axis
,
in_dims
.
size
(),
platform
::
errors
::
InvalidArgument
(
"The axis(%d) should be less than "
"the dimension size(%d) of x."
,
axis
,
in_dims
.
size
()));
auto
out_dims
=
in_dims
;
out_dims
[
axis
]
=
-
1
;
ctx
->
SetOutputDim
(
"Out"
,
out_dims
);
if
(
return_inverse
)
{
ctx
->
SetOutputDim
(
"Index"
,
{
in_dims
[
axis
]});
}
}
}
phi
::
UniqueRawInferMeta
(
x
,
return_index
,
return_inverse
,
return_counts
,
if
(
return_index
)
{
axis_vec
,
data_type
,
is_sorted
,
&
out
,
ctx
->
SetOutputDim
(
"Indices"
,
{
-
1
});
indices
.
get
(),
index
.
get
(),
counts
.
get
());
}
if
(
return_counts
)
{
ctx
->
SetOutputDim
(
"Counts"
,
{
-
1
});
}
}
}
}
...
@@ -152,40 +148,5 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -152,40 +148,5 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
}
// namespace paddle
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_WITHOUT_GRADIENT
(
unique
,
ops
::
UniqueOp
,
ops
::
UniqueOpMaker
);
REGISTER_OP_WITHOUT_GRADIENT
(
unique
,
ops
::
UniqueOp
,
ops
::
UniqueOpMaker
);
REGISTER_OP_CPU_KERNEL
(
unique
,
ops
::
UniqueKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
UniqueKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
,
ops
::
UniqueKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int32_t
>
,
ops
::
UniqueKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int64_t
>
);
REGISTER_OP_VERSION
(
unique
)
.
AddCheckpoint
(
R"ROC(
Upgrade unique, add 2 outputs [Indices, Counts] and 5 attribute
[return_index, return_inverse, return_counts, axis, is_sorted].
)ROC"
,
paddle
::
framework
::
compatible
::
OpVersionDesc
()
.
NewOutput
(
"Indices"
,
"The indices of the input tensor that result in the "
"unique tensor."
)
.
NewOutput
(
"Counts"
,
"The counts for each unique element."
)
.
NewAttr
(
"return_index"
,
"If True, also return the indices of the input"
" tensor that result in the unique Tensor."
,
false
)
.
NewAttr
(
"return_inverse"
,
"If True, also return the indices for where elements"
" in the original input ended up in the returned unique "
"tensor."
,
false
)
.
NewAttr
(
"return_counts"
,
"If True, also return the counts for each unique element."
,
false
)
.
NewAttr
(
"axis"
,
"The axis to apply unique. If None, the input will be "
"flattened."
,
std
::
vector
<
int
>
{})
.
NewAttr
(
"is_sorted"
,
"If True, the unique elements of X are in ascending order."
"Otherwise, the unique elements are not sorted."
,
false
));
paddle/fluid/operators/unstack_op.cc
浏览文件 @
74894cd7
...
@@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/unstack_op.h"
#include <memory>
#include <memory>
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -25,43 +27,6 @@ namespace operators {
...
@@ -25,43 +27,6 @@ namespace operators {
class
UnStackOp
:
public
framework
::
OperatorWithKernel
{
class
UnStackOp
:
public
framework
::
OperatorWithKernel
{
public:
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"X"
),
"Input"
,
"X"
,
"UnStack"
);
int
axis
=
ctx
->
Attrs
().
Get
<
int
>
(
"axis"
);
int
num
=
ctx
->
Attrs
().
Get
<
int
>
(
"num"
);
auto
x_dim
=
ctx
->
GetInputDim
(
"X"
);
int
rank
=
x_dim
.
size
();
PADDLE_ENFORCE_GE
(
axis
,
-
rank
,
platform
::
errors
::
InvalidArgument
(
"The attribute axis is out of range, it must be "
"inside [-rank, rank), where rank = %d"
,
rank
));
PADDLE_ENFORCE_LT
(
axis
,
rank
,
platform
::
errors
::
InvalidArgument
(
"The attribute axis is out of range, it must be "
"inside [-rank, rank), where rank = %d"
,
rank
));
if
(
axis
<
0
)
axis
+=
rank
;
PADDLE_ENFORCE_EQ
(
ctx
->
Outputs
(
"Y"
).
size
(),
static_cast
<
size_t
>
(
num
),
platform
::
errors
::
InvalidArgument
(
"Number of Outputs(Y) is wrong. Got %d , but it must "
"equal to attribute num which is %d."
,
ctx
->
Outputs
(
"Y"
).
size
(),
static_cast
<
size_t
>
(
num
)));
if
(
x_dim
[
axis
]
>
0
)
{
PADDLE_ENFORCE_EQ
(
num
,
x_dim
[
axis
],
platform
::
errors
::
InvalidArgument
(
"The number of attribute num is not equal to the length of the "
"%d axis of Input(X). Expect %d but got %d."
,
axis
,
x_dim
[
axis
],
num
));
}
auto
vec
=
phi
::
vectorize
<
int
>
(
x_dim
);
vec
.
erase
(
vec
.
begin
()
+
axis
);
ctx
->
SetOutputsDim
(
"Y"
,
std
::
vector
<
framework
::
DDim
>
(
// NOLINT
x_dim
[
axis
],
phi
::
make_ddim
(
vec
)));
}
};
};
class
UnStackOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
class
UnStackOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
...
@@ -141,20 +106,12 @@ class UnStackGradOp : public framework::OperatorWithKernel {
...
@@ -141,20 +106,12 @@ class UnStackGradOp : public framework::OperatorWithKernel {
namespace
plat
=
paddle
::
platform
;
namespace
plat
=
paddle
::
platform
;
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
DECLARE_INFER_SHAPE_FUNCTOR
(
unstack
,
UnStackInferMetaFunctor
,
PD_INFER_META
(
phi
::
UnStackInferMeta
));
REGISTER_OPERATOR
(
unstack
,
ops
::
UnStackOp
,
ops
::
UnStackOpMaker
,
REGISTER_OPERATOR
(
unstack
,
ops
::
UnStackOp
,
ops
::
UnStackOpMaker
,
ops
::
UnStackGradOpMaker
<
paddle
::
framework
::
OpDesc
>
,
ops
::
UnStackGradOpMaker
<
paddle
::
framework
::
OpDesc
>
,
ops
::
UnStackGradOpMaker
<
paddle
::
imperative
::
OpBase
>
);
ops
::
UnStackGradOpMaker
<
paddle
::
imperative
::
OpBase
>
,
UnStackInferMetaFunctor
);
REGISTER_OPERATOR
(
unstack_grad
,
ops
::
UnStackGradOp
);
REGISTER_OPERATOR
(
unstack_grad
,
ops
::
UnStackGradOp
);
REGISTER_OP_CPU_KERNEL
(
unstack
,
ops
::
UnStackKernel
<
plat
::
CPUDeviceContext
,
float
>
,
ops
::
UnStackKernel
<
plat
::
CPUDeviceContext
,
double
>
,
ops
::
UnStackKernel
<
plat
::
CPUDeviceContext
,
int
>
,
ops
::
UnStackKernel
<
plat
::
CPUDeviceContext
,
int64_t
>
);
REGISTER_OP_CPU_KERNEL
(
unstack_grad
,
ops
::
UnStackGradKernel
<
plat
::
CPUDeviceContext
,
float
>
,
ops
::
UnStackGradKernel
<
plat
::
CPUDeviceContext
,
double
>
,
ops
::
UnStackGradKernel
<
plat
::
CPUDeviceContext
,
int
>
,
ops
::
UnStackGradKernel
<
plat
::
CPUDeviceContext
,
int64_t
>
);
paddle/fluid/operators/unstack_op.h
已删除
100644 → 0
浏览文件 @
e559fe41
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/device_vector.h>
#endif
namespace
paddle
{
namespace
operators
{
template
<
typename
VecXType
,
typename
T
>
struct
StackFunctor
{
HOSTDEVICE
StackFunctor
(
const
VecXType
&
x
,
T
*
y
,
int
n
,
int
post
)
:
x_
(
x
),
y_
(
y
),
n_
(
n
),
post_
(
post
)
{}
HOSTDEVICE
void
operator
()(
int
idx
)
{
int
i
=
idx
/
(
n_
*
post_
);
int
which_x
=
idx
/
post_
-
i
*
n_
;
int
x_index
=
i
*
post_
+
idx
%
post_
;
y_
[
idx
]
=
x_
[
which_x
][
x_index
];
}
private:
VecXType
x_
;
T
*
y_
;
int
n_
;
int
post_
;
};
template
<
typename
VecDxType
,
typename
T
>
struct
StackGradFunctor
{
HOSTDEVICE
StackGradFunctor
(
const
VecDxType
&
dx
,
const
T
*
dy
,
int
n
,
int
post
)
:
dx_
(
dx
),
dy_
(
dy
),
n_
(
n
),
post_
(
post
)
{}
HOSTDEVICE
void
operator
()(
int
idx
)
{
int
i
=
idx
/
(
n_
*
post_
);
int
which_x
=
idx
/
post_
-
i
*
n_
;
int
x_index
=
i
*
post_
+
idx
%
post_
;
dx_
[
which_x
][
x_index
]
=
dy_
[
idx
];
}
private:
VecDxType
dx_
;
const
T
*
dy_
;
int
n_
;
int
post_
;
};
template
<
typename
DeviceContext
,
typename
VecXType
,
typename
T
>
static
inline
void
StackFunctorForRange
(
const
DeviceContext
&
ctx
,
const
VecXType
&
x
,
T
*
y
,
int
total_num
,
int
n
,
int
post
)
{
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
,
total_num
);
for_range
(
StackFunctor
<
VecXType
,
T
>
(
x
,
y
,
n
,
post
));
}
template
<
typename
DeviceContext
,
typename
VecDxType
,
typename
T
>
static
inline
void
StackGradFunctorForRange
(
const
DeviceContext
&
ctx
,
const
VecDxType
&
dx
,
const
T
*
dy
,
int
total_num
,
int
n
,
int
post
)
{
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
,
total_num
);
for_range
(
StackGradFunctor
<
VecDxType
,
T
>
(
dx
,
dy
,
n
,
post
));
}
template
<
typename
DeviceContext
,
typename
T
>
class
UnStackGradKernel
:
public
framework
::
OpKernel
<
T
>
{
using
Tensor
=
framework
::
LoDTensor
;
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
x
=
ctx
.
MultiInput
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
auto
*
y
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
if
(
axis
<
0
)
axis
+=
(
x
[
0
]
->
dims
().
size
()
+
1
);
int
n
=
static_cast
<
int
>
(
x
.
size
());
auto
*
y_data
=
y
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
std
::
vector
<
const
T
*>
x_datas
(
n
);
for
(
int
i
=
0
;
i
<
n
;
i
++
)
x_datas
[
i
]
=
x
[
i
]
->
data
<
T
>
();
int
pre
=
1
;
int
post
=
1
;
auto
&
dim
=
x
[
0
]
->
dims
();
for
(
auto
i
=
0
;
i
<
axis
;
++
i
)
pre
*=
dim
[
i
];
for
(
auto
i
=
axis
;
i
<
dim
.
size
();
++
i
)
post
*=
dim
[
i
];
#if defined(__NVCC__) || defined(__HIPCC__)
int
total_num
=
pre
*
n
*
post
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
thrust
::
device_vector
<
const
T
*>
device_x_vec
(
x_datas
);
auto
x_data_arr
=
device_x_vec
.
data
().
get
();
StackFunctorForRange
(
dev_ctx
,
x_data_arr
,
y_data
,
total_num
,
n
,
post
);
// Wait() must be called because device_x_vec may be destructed before
// kernel ends
dev_ctx
.
Wait
();
#else
auto
x_data_arr
=
x_datas
.
data
();
size_t
x_offset
=
0
;
size_t
y_offset
=
0
;
for
(
int
i
=
0
;
i
<
pre
;
i
++
)
{
for
(
int
j
=
0
;
j
<
n
;
j
++
)
{
std
::
memcpy
(
y_data
+
y_offset
,
x_data_arr
[
j
]
+
x_offset
,
post
*
sizeof
(
T
));
y_offset
+=
post
;
}
x_offset
+=
post
;
}
#endif
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
UnStackKernel
:
public
framework
::
OpKernel
<
T
>
{
using
Tensor
=
framework
::
LoDTensor
;
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
dy
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
dx
=
ctx
.
MultiOutput
<
Tensor
>
(
"Y"
);
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
if
(
axis
<
0
)
axis
+=
dy
->
dims
().
size
();
int
n
=
dy
->
dims
()[
axis
];
std
::
vector
<
T
*>
dx_datas
(
n
);
// NOLINT
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
dx_datas
[
i
]
=
dx
[
i
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
}
auto
dy_data
=
dy
->
data
<
T
>
();
if
(
dy
->
numel
()
==
0
)
return
;
int
pre
=
1
;
for
(
int
i
=
0
;
i
<
axis
;
++
i
)
pre
*=
dy
->
dims
()[
i
];
int
total_num
=
dy
->
numel
();
int
post
=
total_num
/
(
n
*
pre
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
#if defined(__NVCC__) || defined(__HIPCC__)
thrust
::
device_vector
<
T
*>
device_dx_vec
(
dx_datas
);
auto
dx_data_arr
=
device_dx_vec
.
data
().
get
();
#else
auto
dx_data_arr
=
dx_datas
.
data
();
#endif
StackGradFunctorForRange
(
dev_ctx
,
dx_data_arr
,
dy_data
,
total_num
,
n
,
post
);
#if defined(__NVCC__) || defined(__HIPCC__)
// Wait() must be called because device_dx_vec may be destructed before
// kernel ends
dev_ctx
.
Wait
();
#endif
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/unstack_op_npu.cc
浏览文件 @
74894cd7
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/
operators/unstack_op
.h"
#include "paddle/fluid/
framework/op_registry
.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace
paddle
{
namespace
paddle
{
...
...
paddle/phi/core/utils/data_type.h
浏览文件 @
74894cd7
...
@@ -44,6 +44,10 @@ namespace phi {
...
@@ -44,6 +44,10 @@ namespace phi {
_PhiForEachDataTypeHelper_( \
_PhiForEachDataTypeHelper_( \
callback, ::phi::dtype::complex<double>, DataType::COMPLEX128);
callback, ::phi::dtype::complex<double>, DataType::COMPLEX128);
#define _PhiForEachDataTypeTiny_(callback) \
_PhiForEachDataTypeHelper_(callback, int, DataType::INT32); \
_PhiForEachDataTypeHelper_(callback, int64_t, DataType::INT64);
template
<
typename
Visitor
>
template
<
typename
Visitor
>
inline
void
VisitDataType
(
phi
::
DataType
type
,
Visitor
visitor
)
{
inline
void
VisitDataType
(
phi
::
DataType
type
,
Visitor
visitor
)
{
#define PhiVisitDataTypeCallback(cpp_type, data_type) \
#define PhiVisitDataTypeCallback(cpp_type, data_type) \
...
@@ -59,4 +63,21 @@ inline void VisitDataType(phi::DataType type, Visitor visitor) {
...
@@ -59,4 +63,21 @@ inline void VisitDataType(phi::DataType type, Visitor visitor) {
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
"Not supported phi::DataType(%d) as data type."
,
static_cast
<
int
>
(
type
)));
"Not supported phi::DataType(%d) as data type."
,
static_cast
<
int
>
(
type
)));
}
}
template
<
typename
Visitor
>
inline
void
VisitDataTypeTiny
(
phi
::
DataType
type
,
Visitor
visitor
)
{
#define PhiVisitDataTypeCallbackTiny(cpp_type, data_type) \
do { \
if (type == data_type) { \
visitor.template apply<cpp_type>(); \
return; \
} \
} while (0)
_PhiForEachDataTypeTiny_
(
PhiVisitDataTypeCallbackTiny
);
#undef PhiVisitDataTypeCallbackTiny
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
"Not supported phi::DataType(%d) as data type."
,
static_cast
<
int
>
(
type
)));
}
}
// namespace phi
}
// namespace phi
paddle/phi/infermeta/multiary.cc
浏览文件 @
74894cd7
...
@@ -1167,6 +1167,52 @@ void RnnInferMeta(const MetaTensor& x,
...
@@ -1167,6 +1167,52 @@ void RnnInferMeta(const MetaTensor& x,
}
}
}
}
void
StackInferMeta
(
const
std
::
vector
<
MetaTensor
*>&
x
,
int
axis
,
MetaTensor
*
out
)
{
PADDLE_ENFORCE_GT
(
x
.
size
(),
0UL
,
phi
::
errors
::
InvalidArgument
(
"Number of Inputs(x) must be larger than 0, but"
" received value is:%d."
,
x
.
size
()));
const
auto
&
input_dims
=
GetMetaTensorsDim
(
x
);
for
(
size_t
i
=
1
;
i
<
input_dims
.
size
();
++
i
)
{
PADDLE_ENFORCE_EQ
(
input_dims
[
i
],
input_dims
[
0
],
phi
::
errors
::
InvalidArgument
(
"Dims of all Inputs(X) must be the same, but"
" received input %d dim is:%d not equal to input 0"
" dim:%d."
,
i
,
input_dims
[
i
],
input_dims
[
0
]));
}
int
rank
=
input_dims
[
0
].
size
();
PADDLE_ENFORCE_GE
(
axis
,
-
(
rank
+
1
),
phi
::
errors
::
InvalidArgument
(
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, "
"but received axis is:%d."
,
rank
,
axis
));
PADDLE_ENFORCE_LT
(
axis
,
rank
+
1
,
phi
::
errors
::
InvalidArgument
(
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, "
"but received axis is:%d"
,
rank
,
axis
));
if
(
axis
<
0
)
axis
+=
(
rank
+
1
);
auto
vec
=
phi
::
vectorize
<
int
>
(
input_dims
[
0
]);
vec
.
insert
(
vec
.
begin
()
+
axis
,
input_dims
.
size
());
out
->
set_dims
(
phi
::
make_ddim
(
vec
));
out
->
set_dtype
(
x
.
at
(
0
)
->
dtype
());
out
->
share_lod
(
*
x
.
at
(
0
));
}
void
WarpctcInferMeta
(
const
MetaTensor
&
logits
,
void
WarpctcInferMeta
(
const
MetaTensor
&
logits
,
const
MetaTensor
&
label
,
const
MetaTensor
&
label
,
const
paddle
::
optional
<
const
MetaTensor
&>
logits_length
,
const
paddle
::
optional
<
const
MetaTensor
&>
logits_length
,
...
...
paddle/phi/infermeta/multiary.h
浏览文件 @
74894cd7
...
@@ -231,6 +231,10 @@ void RnnInferMeta(const MetaTensor& x,
...
@@ -231,6 +231,10 @@ void RnnInferMeta(const MetaTensor& x,
std
::
vector
<
MetaTensor
*>
state
,
std
::
vector
<
MetaTensor
*>
state
,
MetaTensor
*
reserve
);
MetaTensor
*
reserve
);
void
StackInferMeta
(
const
std
::
vector
<
MetaTensor
*>&
x
,
int
axis
,
MetaTensor
*
out
);
void
WarpctcInferMeta
(
const
MetaTensor
&
logits
,
void
WarpctcInferMeta
(
const
MetaTensor
&
logits
,
const
MetaTensor
&
label
,
const
MetaTensor
&
label
,
const
paddle
::
optional
<
const
MetaTensor
&>
logits_length
,
const
paddle
::
optional
<
const
MetaTensor
&>
logits_length
,
...
...
paddle/phi/infermeta/ternary.cc
浏览文件 @
74894cd7
...
@@ -345,6 +345,56 @@ void PutAlongAxisInferMeta(const MetaTensor& x,
...
@@ -345,6 +345,56 @@ void PutAlongAxisInferMeta(const MetaTensor& x,
out
->
set_dtype
(
x
.
dtype
());
out
->
set_dtype
(
x
.
dtype
());
}
}
void
RangeInferMeta
(
const
MetaTensor
&
start
,
const
MetaTensor
&
end
,
const
MetaTensor
&
step
,
MetaTensor
*
out
)
{
auto
start_dims
=
start
.
dims
();
auto
end_dims
=
end
.
dims
();
auto
step_dims
=
step
.
dims
();
PADDLE_ENFORCE_EQ
(
start_dims
.
size
(),
1
,
phi
::
errors
::
InvalidArgument
(
"The dim of the shape of Input(Start) should be 1, but got %d"
,
start_dims
.
size
()));
PADDLE_ENFORCE_EQ
(
start_dims
[
0
],
1
,
phi
::
errors
::
InvalidArgument
(
"The first dim of the shape of Input(Start) should "
"be 1, but got %d"
,
start_dims
[
0
]));
PADDLE_ENFORCE_EQ
(
end_dims
.
size
(),
1
,
phi
::
errors
::
InvalidArgument
(
"The dim of the shape of Input(End) should be 1, but got %d"
,
end_dims
.
size
()));
PADDLE_ENFORCE_EQ
(
end_dims
[
0
],
1
,
phi
::
errors
::
InvalidArgument
(
"The first dim of the shape of "
"Input(End) should be 1, but got %d"
,
end_dims
[
0
]));
PADDLE_ENFORCE_EQ
(
step_dims
.
size
(),
1
,
phi
::
errors
::
InvalidArgument
(
"The dim of the shape of Input(Step) should be 1, but got %d"
,
step_dims
.
size
()));
PADDLE_ENFORCE_EQ
(
step_dims
[
0
],
1
,
phi
::
errors
::
InvalidArgument
(
"The first dim of the shape of Input(Step) should "
"be 1, but got %d"
,
step_dims
[
0
]));
out
->
set_dims
({
-
1
});
out
->
set_dtype
(
start
.
dtype
());
}
void
RoiAlignInferMeta
(
const
MetaTensor
&
x
,
void
RoiAlignInferMeta
(
const
MetaTensor
&
x
,
const
MetaTensor
&
boxes
,
const
MetaTensor
&
boxes
,
paddle
::
optional
<
const
MetaTensor
&>
boxes_num
,
paddle
::
optional
<
const
MetaTensor
&>
boxes_num
,
...
...
paddle/phi/infermeta/ternary.h
浏览文件 @
74894cd7
...
@@ -81,6 +81,11 @@ void PutAlongAxisInferMeta(const MetaTensor& x,
...
@@ -81,6 +81,11 @@ void PutAlongAxisInferMeta(const MetaTensor& x,
const
std
::
string
&
reduce
,
const
std
::
string
&
reduce
,
MetaTensor
*
out
);
MetaTensor
*
out
);
void
RangeInferMeta
(
const
MetaTensor
&
start
,
const
MetaTensor
&
end
,
const
MetaTensor
&
step
,
MetaTensor
*
out
);
void
RoiAlignInferMeta
(
const
MetaTensor
&
x
,
void
RoiAlignInferMeta
(
const
MetaTensor
&
x
,
const
MetaTensor
&
boxes
,
const
MetaTensor
&
boxes
,
paddle
::
optional
<
const
MetaTensor
&>
boxes_num
,
paddle
::
optional
<
const
MetaTensor
&>
boxes_num
,
...
...
paddle/phi/infermeta/unary.cc
浏览文件 @
74894cd7
...
@@ -2552,6 +2552,85 @@ void UnfoldInferMeta(const MetaTensor& x,
...
@@ -2552,6 +2552,85 @@ void UnfoldInferMeta(const MetaTensor& x,
out
->
set_dims
(
phi
::
make_ddim
(
out_dims
));
out
->
set_dims
(
phi
::
make_ddim
(
out_dims
));
}
}
void
UniqueInferMeta
(
const
MetaTensor
&
x
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
,
const
std
::
vector
<
int
>&
axis
,
DataType
dtype
,
MetaTensor
*
out
,
MetaTensor
*
indices
,
MetaTensor
*
index
,
MetaTensor
*
counts
)
{
bool
is_sorted
=
true
;
UniqueRawInferMeta
(
x
,
return_index
,
return_inverse
,
return_counts
,
axis
,
dtype
,
is_sorted
,
out
,
indices
,
index
,
counts
);
}
void
UniqueRawInferMeta
(
const
MetaTensor
&
x
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
,
const
std
::
vector
<
int
>&
axis
,
DataType
dtype
,
bool
is_sorted
,
MetaTensor
*
out
,
MetaTensor
*
indices
,
MetaTensor
*
index
,
MetaTensor
*
counts
)
{
if
(
!
is_sorted
)
{
PADDLE_ENFORCE_EQ
(
x
.
dims
().
size
(),
1
,
phi
::
errors
::
InvalidArgument
(
"The Input(X) should be 1-D Tensor, "
"But now the dims of Input(X) is %d."
,
x
.
dims
().
size
()));
out
->
set_dims
(
phi
::
make_ddim
({
-
1
}));
index
->
set_dims
(
x
.
dims
());
return
;
}
if
(
axis
.
empty
())
{
out
->
set_dims
(
phi
::
make_ddim
({
-
1
}));
if
(
return_inverse
)
{
index
->
set_dims
(
phi
::
make_ddim
({
phi
::
product
(
x
.
dims
())}));
}
}
else
{
int
axis_value
=
axis
[
0
];
if
(
axis_value
<
0
)
{
axis_value
+=
x
.
dims
().
size
();
}
PADDLE_ENFORCE_LT
(
axis_value
,
x
.
dims
().
size
(),
phi
::
errors
::
InvalidArgument
(
"The axis(%d) should be less than "
"the dimension size(%d) of x."
,
axis_value
,
x
.
dims
().
size
()));
auto
out_dims
=
x
.
dims
();
out_dims
[
axis_value
]
=
-
1
;
out
->
set_dims
(
out_dims
);
if
(
return_inverse
)
{
index
->
set_dims
(
phi
::
make_ddim
({
x
.
dims
()[
axis_value
]}));
}
}
if
(
return_index
)
{
indices
->
set_dims
(
phi
::
make_ddim
({
-
1
}));
}
if
(
return_counts
)
{
counts
->
set_dims
(
phi
::
make_ddim
({
-
1
}));
}
}
void
UnsqueezeInferMeta
(
const
MetaTensor
&
x
,
void
UnsqueezeInferMeta
(
const
MetaTensor
&
x
,
const
IntArray
&
axes
,
const
IntArray
&
axes
,
MetaTensor
*
xshape
,
MetaTensor
*
xshape
,
...
@@ -2595,6 +2674,53 @@ void UnsqueezeInferMeta(const MetaTensor& x,
...
@@ -2595,6 +2674,53 @@ void UnsqueezeInferMeta(const MetaTensor& x,
xshape
->
set_dtype
(
x
.
dtype
());
xshape
->
set_dtype
(
x
.
dtype
());
}
}
void
UnStackInferMeta
(
const
MetaTensor
&
x
,
int
axis
,
int
num
,
std
::
vector
<
MetaTensor
*>
outs
)
{
auto
x_dim
=
x
.
dims
();
int
rank
=
x_dim
.
size
();
PADDLE_ENFORCE_GE
(
axis
,
-
rank
,
phi
::
errors
::
InvalidArgument
(
"The attribute axis is out of range, it must be "
"inside [-rank, rank), where rank = %d"
,
rank
));
PADDLE_ENFORCE_LT
(
axis
,
rank
,
phi
::
errors
::
InvalidArgument
(
"The attribute axis is out of range, it must be "
"inside [-rank, rank), where rank = %d"
,
rank
));
if
(
axis
<
0
)
axis
+=
rank
;
size_t
output_count
=
outs
.
size
();
PADDLE_ENFORCE_EQ
(
output_count
,
static_cast
<
size_t
>
(
num
),
phi
::
errors
::
InvalidArgument
(
"Number of Outputs(Y) is wrong. Got %d , but it must "
"equal to attribute num which is %d."
,
output_count
,
static_cast
<
size_t
>
(
num
)));
if
(
x_dim
[
axis
]
>
0
)
{
PADDLE_ENFORCE_EQ
(
num
,
x_dim
[
axis
],
phi
::
errors
::
InvalidArgument
(
"The number of attribute num is not equal to the length of the "
"%d axis of Input(X). Expect %d but got %d."
,
axis
,
x_dim
[
axis
],
num
));
}
auto
vec
=
phi
::
vectorize
<
int
>
(
x_dim
);
vec
.
erase
(
vec
.
begin
()
+
axis
);
for
(
size_t
i
=
0
;
i
<
output_count
;
i
++
)
{
outs
[
i
]
->
set_dims
(
phi
::
make_ddim
(
vec
));
outs
[
i
]
->
set_dtype
(
x
.
dtype
());
}
}
void
OneHotRawInferMeta
(
const
MetaTensor
&
x
,
void
OneHotRawInferMeta
(
const
MetaTensor
&
x
,
int32_t
depth
,
int32_t
depth
,
DataType
dtype
,
DataType
dtype
,
...
...
paddle/phi/infermeta/unary.h
浏览文件 @
74894cd7
...
@@ -360,12 +360,40 @@ void UnfoldInferMeta(const MetaTensor& x,
...
@@ -360,12 +360,40 @@ void UnfoldInferMeta(const MetaTensor& x,
MetaTensor
*
out
,
MetaTensor
*
out
,
MetaConfig
config
=
MetaConfig
());
MetaConfig
config
=
MetaConfig
());
void
UniqueInferMeta
(
const
MetaTensor
&
x
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
,
const
std
::
vector
<
int
>&
axis
,
DataType
dtype
,
MetaTensor
*
out
,
MetaTensor
*
indices
,
MetaTensor
*
index
,
MetaTensor
*
counts
);
void
UniqueRawInferMeta
(
const
MetaTensor
&
x
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
,
const
std
::
vector
<
int
>&
axis
,
DataType
dtype
,
bool
is_sorted
,
MetaTensor
*
out
,
MetaTensor
*
indices
,
MetaTensor
*
index
,
MetaTensor
*
counts
);
void
UnsqueezeInferMeta
(
const
MetaTensor
&
x
,
void
UnsqueezeInferMeta
(
const
MetaTensor
&
x
,
const
IntArray
&
axes
,
const
IntArray
&
axes
,
MetaTensor
*
xshape
,
MetaTensor
*
xshape
,
MetaTensor
*
out
,
MetaTensor
*
out
,
MetaConfig
config
=
MetaConfig
());
MetaConfig
config
=
MetaConfig
());
void
UnStackInferMeta
(
const
MetaTensor
&
x
,
int
axis
,
int
num
,
std
::
vector
<
MetaTensor
*>
outs
);
void
OneHotRawInferMeta
(
const
MetaTensor
&
x
,
void
OneHotRawInferMeta
(
const
MetaTensor
&
x
,
int32_t
depth
,
int32_t
depth
,
DataType
dtype
,
DataType
dtype
,
...
...
paddle/phi/kernels/cpu/range_kernel.cc
0 → 100644
浏览文件 @
74894cd7
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/range_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/range_function.h"
namespace
phi
{
template
<
typename
T
,
typename
Context
>
void
RangeKernel
(
const
Context
&
dev_ctx
,
const
DenseTensor
&
start
,
const
DenseTensor
&
end
,
const
DenseTensor
&
step
,
DenseTensor
*
out
)
{
T
start_value
=
start
.
data
<
T
>
()[
0
];
T
end_value
=
end
.
data
<
T
>
()[
0
];
T
step_value
=
step
.
data
<
T
>
()[
0
];
int64_t
size
=
0
;
phi
::
funcs
::
GetSize
(
start_value
,
end_value
,
step_value
,
&
size
);
out
->
Resize
(
phi
::
make_ddim
({
size
}));
T
*
out_data
=
dev_ctx
.
template
Alloc
<
T
>(
out
);
T
value
=
start_value
;
for
(
int64_t
i
=
0
;
i
<
size
;
++
i
)
{
out_data
[
i
]
=
value
;
value
+=
step_value
;
}
}
}
// namespace phi
PD_REGISTER_KERNEL
(
range
,
CPU
,
ALL_LAYOUT
,
phi
::
RangeKernel
,
float
,
double
,
int
,
int64_t
)
{}
paddle/phi/kernels/cpu/stack_grad_kernel.cc
0 → 100644
浏览文件 @
74894cd7
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/stack_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/stack_functor.h"
namespace
phi
{
template
<
typename
T
,
typename
Context
>
void
StackGradKernel
(
const
Context
&
dev_ctx
,
const
DenseTensor
&
out
,
int
axis
,
std
::
vector
<
DenseTensor
*>
x_grad
)
{
if
(
axis
<
0
)
axis
+=
out
.
dims
().
size
();
int
n
=
out
.
dims
()[
axis
];
std
::
vector
<
T
*>
dx_datas
(
n
);
// NOLINT
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
if
(
x_grad
[
i
]
==
nullptr
)
{
dx_datas
[
i
]
=
nullptr
;
}
else
{
dx_datas
[
i
]
=
dev_ctx
.
template
Alloc
<
T
>(
x_grad
[
i
]);
}
}
auto
dy_data
=
out
.
data
<
T
>
();
int
pre
=
1
;
for
(
int
i
=
0
;
i
<
axis
;
++
i
)
pre
*=
out
.
dims
()[
i
];
int
total_num
=
out
.
numel
();
int
post
=
total_num
/
(
n
*
pre
);
auto
dx_data_arr
=
dx_datas
.
data
();
phi
::
funcs
::
StackGradFunctorForRange
(
dev_ctx
,
dx_data_arr
,
dy_data
,
total_num
,
n
,
post
);
}
}
// namespace phi
PD_REGISTER_KERNEL
(
stack_grad
,
CPU
,
ALL_LAYOUT
,
phi
::
StackGradKernel
,
float
,
double
,
int64_t
,
int
,
phi
::
dtype
::
bfloat16
)
{}
paddle/phi/kernels/cpu/stack_kernel.cc
0 → 100644
浏览文件 @
74894cd7
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/stack_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace
phi
{
template
<
typename
T
,
typename
Context
>
void
StackKernel
(
const
Context
&
dev_ctx
,
const
std
::
vector
<
const
DenseTensor
*>&
x
,
int
axis
,
DenseTensor
*
out
)
{
if
(
axis
<
0
)
axis
+=
(
x
[
0
]
->
dims
().
size
()
+
1
);
int
n
=
static_cast
<
int
>
(
x
.
size
());
T
*
y_data
=
dev_ctx
.
template
Alloc
<
T
>(
out
);
std
::
vector
<
const
T
*>
x_datas
(
n
);
for
(
int
i
=
0
;
i
<
n
;
i
++
)
x_datas
[
i
]
=
x
[
i
]
->
data
<
T
>
();
int
pre
=
1
,
post
=
1
;
auto
&
dim
=
x
[
0
]
->
dims
();
for
(
auto
i
=
0
;
i
<
axis
;
++
i
)
pre
*=
dim
[
i
];
for
(
auto
i
=
axis
;
i
<
dim
.
size
();
++
i
)
post
*=
dim
[
i
];
auto
x_data_arr
=
x_datas
.
data
();
size_t
x_offset
=
0
;
size_t
y_offset
=
0
;
for
(
int
i
=
0
;
i
<
pre
;
i
++
)
{
for
(
int
j
=
0
;
j
<
n
;
j
++
)
{
std
::
memcpy
(
y_data
+
y_offset
,
x_data_arr
[
j
]
+
x_offset
,
post
*
sizeof
(
T
));
y_offset
+=
post
;
}
x_offset
+=
post
;
}
}
}
// namespace phi
PD_REGISTER_KERNEL
(
stack
,
CPU
,
ALL_LAYOUT
,
phi
::
StackKernel
,
float
,
double
,
int
,
int64_t
,
phi
::
dtype
::
bfloat16
)
{}
paddle/phi/kernels/cpu/unique_kernel.cc
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unique_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/funcs/unique_functor.h"
namespace
phi
{
template
<
typename
T
,
typename
Context
>
void
UniqueKernel
(
const
Context
&
context
,
const
DenseTensor
&
x
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
,
const
std
::
vector
<
int
>&
axis
,
DataType
dtype
,
DenseTensor
*
out
,
DenseTensor
*
indices
,
DenseTensor
*
index
,
DenseTensor
*
counts
)
{
bool
is_sorted
=
true
;
UniqueRawKernel
<
T
,
Context
>
(
context
,
x
,
return_index
,
return_inverse
,
return_counts
,
axis
,
dtype
,
is_sorted
,
out
,
indices
,
index
,
counts
);
}
template
<
typename
T
,
typename
Context
>
void
UniqueRawKernel
(
const
Context
&
context
,
const
DenseTensor
&
x
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
,
const
std
::
vector
<
int
>&
axis
,
DataType
dtype
,
bool
is_sorted
,
DenseTensor
*
out
,
DenseTensor
*
indices
,
DenseTensor
*
index
,
DenseTensor
*
counts
)
{
if
(
dtype
==
phi
::
DataType
::
INT32
)
{
PADDLE_ENFORCE_LE
(
x
.
numel
(),
INT_MAX
,
phi
::
errors
::
InvalidArgument
(
"The number of elements in Input(X) should be less than or "
"equal to INT_MAX, but received num is %d. Please set `dtype` to "
"int64."
,
x
.
numel
()));
}
if
(
!
is_sorted
)
{
phi
::
VisitDataType
(
dtype
,
phi
::
funcs
::
UniqueOpFunctor
<
Context
,
T
>
(
context
,
out
,
index
,
&
x
));
return
;
}
if
(
x
.
numel
()
==
0
)
{
context
.
template
Alloc
<
T
>(
out
);
return
;
}
if
(
axis
.
empty
())
{
phi
::
VisitDataTypeTiny
(
dtype
,
phi
::
funcs
::
UniqueFlattendTensorFunctor
<
Context
,
T
>
(
context
,
x
,
out
,
indices
,
index
,
counts
,
return_index
,
return_inverse
,
return_counts
));
}
else
{
int
axis_value
=
axis
[
0
];
phi
::
VisitDataTypeTiny
(
dtype
,
phi
::
funcs
::
UniqueDimFunctor
<
Context
,
T
>
(
context
,
x
,
out
,
indices
,
index
,
counts
,
axis_value
,
return_index
,
return_inverse
,
return_counts
));
}
}
}
// namespace phi
PD_REGISTER_KERNEL
(
unique
,
CPU
,
ALL_LAYOUT
,
phi
::
UniqueKernel
,
float
,
double
,
int32_t
,
int64_t
)
{}
PD_REGISTER_KERNEL
(
unique_raw
,
CPU
,
ALL_LAYOUT
,
phi
::
UniqueRawKernel
,
float
,
double
,
int32_t
,
int64_t
)
{}
paddle/
fluid/operators/unstack_op.cu
→
paddle/
phi/kernels/cpu/unstack_grad_kernel.cc
浏览文件 @
74894cd7
/* Copyright (c) 20
19
PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 20
22
PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may not use this file except in compliance with the License.
...
@@ -12,21 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,21 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/unstack_op.h"
#include "paddle/phi/kernels/unstack_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h"
namespace
plat
=
paddle
::
platform
;
PD_REGISTER_KERNEL
(
unstack_grad
,
namespace
ops
=
paddle
::
operators
;
CPU
,
ALL_LAYOUT
,
REGISTER_OP_CUDA_KERNEL
(
phi
::
UnStackGradKernel
,
unstack
,
ops
::
UnStackKernel
<
plat
::
CUDADeviceContext
,
float
>
,
float
,
ops
::
UnStackKernel
<
plat
::
CUDADeviceContext
,
double
>
,
double
,
ops
::
UnStackKernel
<
plat
::
CUDADeviceContext
,
int
>
,
int
,
ops
::
UnStackKernel
<
plat
::
CUDADeviceContext
,
int64_t
>
,
int64_t
)
{}
ops
::
UnStackKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
unstack_grad
,
ops
::
UnStackGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
UnStackGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
UnStackGradKernel
<
plat
::
CUDADeviceContext
,
int
>
,
ops
::
UnStackGradKernel
<
plat
::
CUDADeviceContext
,
int64_t
>
,
ops
::
UnStackGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/phi/kernels/cpu/unstack_kernel.cc
0 → 100644
浏览文件 @
74894cd7
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/unstack_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unstack_kernel_impl.h"
PD_REGISTER_KERNEL
(
unstack
,
CPU
,
ALL_LAYOUT
,
phi
::
UnStackKernel
,
float
,
double
,
int
,
int64_t
)
{
}
paddle/phi/kernels/funcs/range_function.h
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/enforce.h"
namespace
phi
{
namespace
funcs
{
template
<
typename
T
>
void
GetSize
(
T
start
,
T
end
,
T
step
,
int64_t
*
size
)
{
PADDLE_ENFORCE_NE
(
step
,
0
,
phi
::
errors
::
InvalidArgument
(
"The step of range op should not be 0."
));
if
(
start
<
end
)
{
PADDLE_ENFORCE_GT
(
step
,
0
,
phi
::
errors
::
InvalidArgument
(
"The step should be greater than 0 while start < end."
));
}
if
(
start
>
end
)
{
PADDLE_ENFORCE_LT
(
step
,
0
,
phi
::
errors
::
InvalidArgument
(
"The step should be less than 0 while start > end."
));
}
*
size
=
std
::
is_integral
<
T
>::
value
?
((
std
::
abs
(
end
-
start
)
+
std
::
abs
(
step
)
-
1
)
/
std
::
abs
(
step
))
:
std
::
ceil
(
std
::
abs
((
end
-
start
)
/
step
));
}
}
// namespace funcs
}
// namespace phi
paddle/
fluid/operators/stack_op
.h
→
paddle/
phi/kernels/funcs/stack_functor
.h
浏览文件 @
74894cd7
// Copyright (c) 20
18
PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 20
22
PaddlePaddle Authors. All Rights Reserved.
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// you may not use this file except in compliance with the License.
...
@@ -14,12 +14,29 @@
...
@@ -14,12 +14,29 @@
#pragma once
#pragma once
#include <memory>
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
namespace
paddle
{
namespace
phi
{
namespace
operators
{
namespace
funcs
{
template
<
typename
VecXType
,
typename
T
>
struct
StackFunctor
{
HOSTDEVICE
StackFunctor
(
const
VecXType
&
x
,
T
*
y
,
int
n
,
int
post
)
:
x_
(
x
),
y_
(
y
),
n_
(
n
),
post_
(
post
)
{}
HOSTDEVICE
void
operator
()(
int
idx
)
{
int
i
=
idx
/
(
n_
*
post_
);
int
which_x
=
idx
/
post_
-
i
*
n_
;
int
x_index
=
i
*
post_
+
idx
%
post_
;
y_
[
idx
]
=
x_
[
which_x
][
x_index
];
}
private:
VecXType
x_
;
T
*
y_
;
int
n_
;
int
post_
;
};
template
<
typename
VecDxType
,
typename
T
>
template
<
typename
VecDxType
,
typename
T
>
struct
StackGradFunctor
{
struct
StackGradFunctor
{
...
@@ -40,81 +57,27 @@ struct StackGradFunctor {
...
@@ -40,81 +57,27 @@ struct StackGradFunctor {
int
post_
;
int
post_
;
};
};
template
<
typename
DeviceContext
,
typename
VecXType
,
typename
T
>
static
inline
void
StackFunctorForRange
(
const
DeviceContext
&
ctx
,
const
VecXType
&
x
,
T
*
y
,
int
total_num
,
int
n
,
int
post
)
{
phi
::
funcs
::
ForRange
<
DeviceContext
>
for_range
(
ctx
,
total_num
);
for_range
(
StackFunctor
<
VecXType
,
T
>
(
x
,
y
,
n
,
post
));
}
template
<
typename
DeviceContext
,
typename
VecDxType
,
typename
T
>
template
<
typename
DeviceContext
,
typename
VecDxType
,
typename
T
>
static
inline
void
StackGradFunctorForRange
(
const
DeviceContext
&
ctx
,
static
inline
void
StackGradFunctorForRange
(
const
DeviceContext
&
ctx
,
const
VecDxType
&
dx
,
const
T
*
dy
,
const
VecDxType
&
dx
,
int
total_num
,
int
n
,
int
post
)
{
const
T
*
dy
,
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
,
total_num
);
int
total_num
,
int
n
,
int
post
)
{
phi
::
funcs
::
ForRange
<
DeviceContext
>
for_range
(
ctx
,
total_num
);
for_range
(
StackGradFunctor
<
VecDxType
,
T
>
(
dx
,
dy
,
n
,
post
));
for_range
(
StackGradFunctor
<
VecDxType
,
T
>
(
dx
,
dy
,
n
,
post
));
}
}
template
<
typename
DeviceContext
,
typename
T
>
}
// namespace funcs
class
StackKernel
:
public
framework
::
OpKernel
<
T
>
{
}
// namespace phi
using
Tensor
=
framework
::
LoDTensor
;
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
x
=
ctx
.
MultiInput
<
Tensor
>
(
"X"
);
auto
*
y
=
ctx
.
Output
<
Tensor
>
(
"Y"
);
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
if
(
axis
<
0
)
axis
+=
(
x
[
0
]
->
dims
().
size
()
+
1
);
int
n
=
static_cast
<
int
>
(
x
.
size
());
auto
*
y_data
=
y
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
std
::
vector
<
const
T
*>
x_datas
(
n
);
for
(
int
i
=
0
;
i
<
n
;
i
++
)
x_datas
[
i
]
=
x
[
i
]
->
data
<
T
>
();
int
pre
=
1
,
post
=
1
;
auto
&
dim
=
x
[
0
]
->
dims
();
for
(
auto
i
=
0
;
i
<
axis
;
++
i
)
pre
*=
dim
[
i
];
for
(
auto
i
=
axis
;
i
<
dim
.
size
();
++
i
)
post
*=
dim
[
i
];
auto
x_data_arr
=
x_datas
.
data
();
size_t
x_offset
=
0
;
size_t
y_offset
=
0
;
for
(
int
i
=
0
;
i
<
pre
;
i
++
)
{
for
(
int
j
=
0
;
j
<
n
;
j
++
)
{
std
::
memcpy
(
y_data
+
y_offset
,
x_data_arr
[
j
]
+
x_offset
,
post
*
sizeof
(
T
));
y_offset
+=
post
;
}
x_offset
+=
post
;
}
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
StackGradKernel
:
public
framework
::
OpKernel
<
T
>
{
using
Tensor
=
framework
::
LoDTensor
;
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
dy
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
auto
dx
=
ctx
.
MultiOutput
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
if
(
axis
<
0
)
axis
+=
dy
->
dims
().
size
();
int
n
=
dy
->
dims
()[
axis
];
std
::
vector
<
T
*>
dx_datas
(
n
);
// NOLINT
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
if
(
dx
[
i
]
==
nullptr
)
{
dx_datas
[
i
]
=
nullptr
;
}
else
{
dx_datas
[
i
]
=
dx
[
i
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
}
}
auto
dy_data
=
dy
->
data
<
T
>
();
int
pre
=
1
;
for
(
int
i
=
0
;
i
<
axis
;
++
i
)
pre
*=
dy
->
dims
()[
i
];
int
total_num
=
dy
->
numel
();
int
post
=
total_num
/
(
n
*
pre
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
dx_data_arr
=
dx_datas
.
data
();
StackGradFunctorForRange
(
dev_ctx
,
dx_data_arr
,
dy_data
,
total_num
,
n
,
post
);
}
};
}
// namespace operators
}
// namespace paddle
paddle/phi/kernels/funcs/unique_functor.h
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace
phi
{
namespace
funcs
{
template
<
typename
Context
,
typename
InT
>
struct
UniqueOpFunctor
{
const
Context
&
context_
;
DenseTensor
*
out_
;
DenseTensor
*
index_
;
const
DenseTensor
*
in_
;
DenseTensor
*
count_
;
UniqueOpFunctor
(
const
Context
&
context
,
DenseTensor
*
out
,
DenseTensor
*
index
,
const
DenseTensor
*
in
,
DenseTensor
*
count
=
nullptr
)
:
context_
(
context
),
out_
(
out
),
index_
(
index
),
in_
(
in
),
count_
(
count
)
{}
template
<
typename
IndexT
>
void
apply
()
const
{
auto
*
in_data
=
in_
->
data
<
InT
>
();
auto
*
index_data
=
context_
.
template
Alloc
<
IndexT
>(
index_
);
int64_t
j
=
0
;
// TODO(fangzeyang): Should optimize performance here.
std
::
unordered_map
<
InT
,
int64_t
>
dict
;
std
::
vector
<
InT
>
uniq
;
PADDLE_ENFORCE_LT
(
in_
->
numel
(),
pow
(
2
,
31
),
phi
::
errors
::
InvalidArgument
(
"The num of Input(X) elements should be less then INT_MAX, "
"but received num is %d."
,
in_
->
numel
()));
for
(
auto
i
=
0
;
i
<
in_
->
numel
();
i
++
)
{
auto
it
=
dict
.
find
(
in_data
[
i
]);
if
(
it
==
dict
.
end
())
{
dict
.
emplace
(
std
::
make_pair
(
in_data
[
i
],
j
));
uniq
.
emplace_back
(
in_data
[
i
]);
index_data
[
i
]
=
static_cast
<
IndexT
>
(
j
);
j
++
;
}
else
{
index_data
[
i
]
=
static_cast
<
IndexT
>
(
it
->
second
);
}
}
if
(
count_
!=
nullptr
)
{
// Resize the count tensor dims to allocate the memory
count_
->
Resize
(
phi
::
make_ddim
({
static_cast
<
int64_t
>
(
uniq
.
size
())}));
IndexT
*
count_data
=
context_
.
template
Alloc
<
IndexT
>(
count_
);
// init count_data to 0
memset
(
count_data
,
0
,
uniq
.
size
()
*
sizeof
(
IndexT
));
const
auto
&
index_type
=
index_
->
dtype
();
bool
index_type_match
=
index_type
==
DataType
::
INT32
||
index_type
==
DataType
::
INT64
;
PADDLE_ENFORCE_EQ
(
index_type_match
,
true
,
phi
::
errors
::
InvalidArgument
(
"Index holds the wrong type, it holds %s, "
"but desires to be %s or %s"
,
paddle
::
framework
::
DataTypeToString
(
paddle
::
framework
::
TransToProtoVarType
(
index_type
)),
paddle
::
framework
::
DataTypeToString
(
paddle
::
framework
::
TransToProtoVarType
(
DataType
::
INT32
)),
paddle
::
framework
::
DataTypeToString
(
paddle
::
framework
::
TransToProtoVarType
(
DataType
::
INT64
))));
if
(
index_type
==
DataType
::
INT32
)
{
for
(
auto
i
=
0
;
i
<
in_
->
numel
();
++
i
)
{
const
IndexT
&
index
=
index_data
[
i
];
count_data
[
static_cast
<
int32_t
>
(
index
)]
+=
static_cast
<
IndexT
>
(
1
);
}
}
else
{
for
(
auto
i
=
0
;
i
<
in_
->
numel
();
++
i
)
{
const
IndexT
&
index
=
index_data
[
i
];
count_data
[
static_cast
<
int64_t
>
(
index
)]
+=
static_cast
<
IndexT
>
(
1
);
}
}
}
out_
->
Resize
(
phi
::
make_ddim
({
static_cast
<
int64_t
>
(
uniq
.
size
())}));
auto
*
out_data
=
context_
.
template
Alloc
<
InT
>(
out_
);
std
::
memcpy
(
out_data
,
uniq
.
data
(),
uniq
.
size
()
*
sizeof
(
InT
));
}
};
static
std
::
vector
<
DenseTensor
>
Unbind
(
const
DenseTensor
&
in
)
{
int64_t
size
=
in
.
dims
()[
0
];
std
::
vector
<
DenseTensor
>
tensors
(
size
);
for
(
int64_t
i
=
0
;
i
<
size
;
++
i
)
{
tensors
[
i
]
=
in
.
Slice
(
i
,
i
+
1
);
}
return
tensors
;
}
template
<
typename
T
>
static
bool
Equal
(
const
DenseTensor
&
a
,
const
DenseTensor
&
b
)
{
if
(
a
.
numel
()
!=
b
.
numel
())
{
return
false
;
}
for
(
int64_t
i
=
0
;
i
<
a
.
numel
();
++
i
)
{
if
(
a
.
data
<
T
>
()[
i
]
!=
b
.
data
<
T
>
()[
i
])
{
return
false
;
}
}
return
true
;
}
template
<
typename
Context
,
typename
InT
,
typename
IndexT
>
static
void
UniqueFlattendTensor
(
const
Context
&
context
,
const
DenseTensor
&
in
,
DenseTensor
*
out
,
DenseTensor
*
indices
,
DenseTensor
*
index
,
DenseTensor
*
count
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
)
{
const
InT
*
in_data
=
in
.
data
<
InT
>
();
std
::
set
<
InT
>
unique
(
in_data
,
in_data
+
in
.
numel
());
out
->
Resize
(
phi
::
make_ddim
({
static_cast
<
int64_t
>
(
unique
.
size
())}));
auto
*
out_data
=
context
.
template
Alloc
<
InT
>(
out
);
std
::
copy
(
unique
.
begin
(),
unique
.
end
(),
out_data
);
if
(
return_index
)
{
indices
->
Resize
(
phi
::
make_ddim
({
out
->
numel
()}));
auto
indices_data
=
context
.
template
Alloc
<
IndexT
>(
indices
);
std
::
unordered_map
<
InT
,
IndexT
>
indices_map
;
indices_map
.
reserve
(
out
->
numel
());
for
(
int64_t
i
=
0
;
i
<
in
.
numel
();
++
i
)
{
if
(
indices_map
.
find
(
in_data
[
i
])
!=
indices_map
.
end
())
continue
;
indices_map
[
in_data
[
i
]]
=
i
;
}
for
(
int64_t
i
=
0
;
i
<
out
->
numel
();
++
i
)
{
indices_data
[
i
]
=
indices_map
[
out_data
[
i
]];
}
}
if
(
return_inverse
)
{
index
->
Resize
(
phi
::
make_ddim
({
in
.
numel
()}));
auto
inverse_data
=
context
.
template
Alloc
<
IndexT
>(
index
);
std
::
unordered_map
<
InT
,
IndexT
>
inverse_map
;
inverse_map
.
reserve
(
out
->
numel
());
for
(
int64_t
i
=
0
;
i
<
out
->
numel
();
++
i
)
{
inverse_map
[
out_data
[
i
]]
=
i
;
}
for
(
int64_t
i
=
0
;
i
<
in
.
numel
();
++
i
)
{
inverse_data
[
i
]
=
inverse_map
[
in_data
[
i
]];
}
}
if
(
return_counts
)
{
count
->
Resize
(
phi
::
make_ddim
({
out
->
numel
()}));
auto
count_data
=
context
.
template
Alloc
<
IndexT
>(
count
);
std
::
unordered_map
<
InT
,
IndexT
>
counts_map
;
counts_map
.
reserve
(
out
->
numel
());
for
(
int64_t
i
=
0
;
i
<
out
->
numel
();
++
i
)
{
counts_map
[
out_data
[
i
]]
=
0
;
}
for
(
int64_t
i
=
0
;
i
<
in
.
numel
();
i
++
)
{
counts_map
[
in_data
[
i
]]
+=
1
;
}
for
(
int64_t
i
=
0
;
i
<
out
->
numel
();
i
++
)
{
count_data
[
i
]
=
counts_map
[
out_data
[
i
]];
}
}
}
template
<
typename
Context
,
typename
ForwardIt
,
typename
InT
,
typename
IndexT
>
static
ForwardIt
UniqueDimImpl
(
const
Context
&
context
,
ForwardIt
first
,
ForwardIt
last
,
const
std
::
vector
<
IndexT
>&
sorted_indices_vec
,
std
::
vector
<
IndexT
>*
inverse_vec
,
std
::
vector
<
IndexT
>*
counts_vec
,
std
::
vector
<
IndexT
>*
indices_vec
)
{
if
(
first
==
last
)
{
return
last
;
}
(
*
inverse_vec
)[
sorted_indices_vec
[
0
]]
=
0
;
(
*
counts_vec
)[
0
]
=
1
;
(
*
indices_vec
)[
0
]
=
sorted_indices_vec
[
0
];
ForwardIt
begin
=
first
;
ForwardIt
result
=
first
;
while
(
++
first
!=
last
)
{
int64_t
idx_first
=
std
::
distance
(
begin
,
first
);
int64_t
idx_result
=
std
::
distance
(
begin
,
result
);
if
(
!
Equal
<
InT
>
(
*
result
,
*
first
))
{
if
(
++
result
!=
first
)
{
*
result
=
std
::
move
(
*
first
);
}
idx_result
+=
1
;
(
*
indices_vec
)[
idx_result
]
=
sorted_indices_vec
[
idx_first
];
}
(
*
inverse_vec
)[
sorted_indices_vec
[
idx_first
]]
=
idx_result
;
(
*
counts_vec
)[
idx_result
]
+=
1
;
}
return
++
result
;
}
template
<
typename
Context
,
typename
InT
,
typename
IndexT
>
static
void
UniqueDim
(
const
Context
&
context
,
const
DenseTensor
&
in
,
DenseTensor
*
out
,
DenseTensor
*
indices
,
DenseTensor
*
index
,
DenseTensor
*
count
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
,
int
axis
)
{
// transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2]
std
::
vector
<
int
>
permute
(
in
.
dims
().
size
());
std
::
iota
(
permute
.
begin
(),
permute
.
end
(),
0
);
permute
[
axis
]
=
0
;
permute
[
0
]
=
axis
;
std
::
vector
<
int64_t
>
in_trans_dims_vec
(
phi
::
vectorize
(
in
.
dims
()));
in_trans_dims_vec
[
axis
]
=
in
.
dims
()[
0
];
in_trans_dims_vec
[
0
]
=
in
.
dims
()[
axis
];
DenseTensor
in_trans
;
phi
::
DDim
in_trans_dims
=
phi
::
make_ddim
(
in_trans_dims_vec
);
in_trans
.
Resize
(
in_trans_dims
);
context
.
template
Alloc
<
InT
>(
&
in_trans
);
TransCompute
<
Context
,
InT
>
(
in
.
dims
().
size
(),
context
,
in
,
&
in_trans
,
permute
);
// reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
phi
::
DDim
in_trans_flat_dims
=
phi
::
flatten_to_2d
(
in_trans_dims
,
1
);
in_trans
.
Resize
(
in_trans_flat_dims
);
// sort indices
std
::
vector
<
IndexT
>
sorted_indices_vec
(
in_trans
.
dims
()[
0
]);
std
::
iota
(
sorted_indices_vec
.
begin
(),
sorted_indices_vec
.
end
(),
0
);
int64_t
col
=
in_trans
.
dims
()[
1
];
const
InT
*
in_trans_data
=
in_trans
.
data
<
InT
>
();
std
::
sort
(
sorted_indices_vec
.
begin
(),
sorted_indices_vec
.
end
(),
[
&
](
int64_t
a
,
int64_t
b
)
->
bool
{
for
(
int64_t
i
=
0
;
i
<
col
;
++
i
)
{
InT
lhs
=
in_trans_data
[
i
+
a
*
col
];
InT
rhs
=
in_trans_data
[
i
+
b
*
col
];
if
(
lhs
<
rhs
)
{
return
true
;
}
else
if
(
lhs
>
rhs
)
{
return
false
;
}
}
return
false
;
});
// sort tensor according to indices
DenseTensor
input_sorted
;
input_sorted
.
Resize
(
in_trans_dims
);
context
.
template
Alloc
<
InT
>(
&
input_sorted
);
InT
*
input_sorted_data
=
input_sorted
.
data
<
InT
>
();
for
(
size_t
i
=
0
;
i
<
sorted_indices_vec
.
size
();
++
i
)
{
memcpy
(
input_sorted_data
+
i
*
col
,
in_trans_data
+
static_cast
<
int64_t
>
(
sorted_indices_vec
[
i
])
*
col
,
col
*
sizeof
(
InT
));
}
std
::
vector
<
DenseTensor
>
input_unbind
=
Unbind
(
input_sorted
);
std
::
vector
<
IndexT
>
inverse_vec
(
sorted_indices_vec
.
size
(),
0
);
std
::
vector
<
IndexT
>
counts_vec
(
sorted_indices_vec
.
size
(),
0
);
std
::
vector
<
IndexT
>
indices_vec
(
sorted_indices_vec
.
size
(),
0
);
auto
last
=
UniqueDimImpl
<
Context
,
std
::
vector
<
DenseTensor
>::
iterator
,
InT
>
(
context
,
input_unbind
.
begin
(),
input_unbind
.
end
(),
sorted_indices_vec
,
&
inverse_vec
,
&
counts_vec
,
&
indices_vec
);
input_unbind
.
erase
(
last
,
input_unbind
.
end
());
counts_vec
.
erase
(
counts_vec
.
begin
()
+
input_unbind
.
size
(),
counts_vec
.
end
());
indices_vec
.
erase
(
indices_vec
.
begin
()
+
input_unbind
.
size
(),
indices_vec
.
end
());
phi
::
funcs
::
ConcatFunctor
<
Context
,
InT
>
concat_functor
;
DenseTensor
out_trans
;
std
::
vector
<
int64_t
>
out_trans_dims_vec
=
in_trans_dims_vec
;
out_trans_dims_vec
[
0
]
=
input_unbind
.
size
();
out_trans
.
Resize
(
phi
::
make_ddim
(
out_trans_dims_vec
));
context
.
template
Alloc
<
InT
>(
&
out_trans
);
std
::
swap
(
out_trans_dims_vec
[
0
],
out_trans_dims_vec
[
axis
]);
out
->
Resize
(
phi
::
make_ddim
(
out_trans_dims_vec
));
context
.
template
Alloc
<
InT
>(
out
);
concat_functor
(
context
,
input_unbind
,
0
,
&
out_trans
);
TransCompute
<
Context
,
InT
>
(
out_trans
.
dims
().
size
(),
context
,
out_trans
,
out
,
permute
);
if
(
return_inverse
)
{
paddle
::
framework
::
TensorFromVector
(
inverse_vec
,
context
,
index
);
}
if
(
return_counts
)
{
paddle
::
framework
::
TensorFromVector
(
counts_vec
,
context
,
count
);
}
if
(
return_index
)
{
paddle
::
framework
::
TensorFromVector
(
indices_vec
,
context
,
indices
);
}
}
template
<
typename
Context
,
typename
InT
>
struct
UniqueFlattendTensorFunctor
{
const
Context
&
ctx_
;
/* */
const
DenseTensor
&
in_
;
DenseTensor
*
out_
;
DenseTensor
*
indices_
;
DenseTensor
*
index_
;
DenseTensor
*
count_
;
const
bool
return_index_
;
const
bool
return_inverse_
;
const
bool
return_counts_
;
UniqueFlattendTensorFunctor
(
const
Context
&
context
,
const
DenseTensor
&
in
,
DenseTensor
*
out
,
DenseTensor
*
indices
,
DenseTensor
*
index
,
DenseTensor
*
count
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
)
:
ctx_
(
context
),
in_
(
in
),
out_
(
out
),
indices_
(
indices
),
index_
(
index
),
count_
(
count
),
return_index_
(
return_index
),
return_inverse_
(
return_inverse
),
return_counts_
(
return_counts
)
{}
template
<
typename
IndexT
>
void
apply
()
const
{
UniqueFlattendTensor
<
Context
,
InT
,
IndexT
>
(
ctx_
,
in_
,
out_
,
indices_
,
index_
,
count_
,
return_index_
,
return_inverse_
,
return_counts_
);
}
};
template
<
typename
Context
,
typename
InT
>
struct
UniqueDimFunctor
{
const
Context
&
ctx_
;
const
DenseTensor
&
in_
;
DenseTensor
*
out_
;
DenseTensor
*
indices_
;
DenseTensor
*
index_
;
DenseTensor
*
count_
;
const
int
axis_
;
const
bool
return_index_
;
const
bool
return_inverse_
;
const
bool
return_counts_
;
UniqueDimFunctor
(
const
Context
&
context
,
const
DenseTensor
&
in
,
DenseTensor
*
out
,
DenseTensor
*
indices
,
DenseTensor
*
index
,
DenseTensor
*
count
,
const
int
axis
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
)
:
ctx_
(
context
),
in_
(
in
),
out_
(
out
),
indices_
(
indices
),
index_
(
index
),
count_
(
count
),
axis_
(
axis
),
return_index_
(
return_index
),
return_inverse_
(
return_inverse
),
return_counts_
(
return_counts
)
{}
template
<
typename
IndexT
>
void
apply
()
const
{
UniqueDim
<
Context
,
InT
,
IndexT
>
(
ctx_
,
in_
,
out_
,
indices_
,
index_
,
count_
,
return_index_
,
return_inverse_
,
return_counts_
,
axis_
);
}
};
}
// namespace funcs
}
// namespace phi
paddle/phi/kernels/gpu/range_kernel.cu
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/range_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/range_function.h"
namespace
phi
{
template
<
typename
T
>
__global__
void
Range
(
T
start
,
T
step
,
int64_t
size
,
T
*
out
)
{
CUDA_KERNEL_LOOP
(
index
,
size
)
{
out
[
index
]
=
start
+
step
*
index
;
}
}
template
<
typename
T
,
typename
Context
>
void
RangeKernel
(
const
Context
&
dev_ctx
,
const
DenseTensor
&
start
,
const
DenseTensor
&
end
,
const
DenseTensor
&
step
,
DenseTensor
*
out
)
{
T
start_value
=
start
.
data
<
T
>
()[
0
];
T
end_value
=
end
.
data
<
T
>
()[
0
];
T
step_value
=
step
.
data
<
T
>
()[
0
];
int64_t
size
=
0
;
phi
::
funcs
::
GetSize
(
start_value
,
end_value
,
step_value
,
&
size
);
out
->
Resize
(
phi
::
make_ddim
({
size
}));
T
*
out_data
=
dev_ctx
.
template
Alloc
<
T
>(
out
);
auto
stream
=
dev_ctx
.
stream
();
int
block
=
std
::
min
(
size
,
static_cast
<
int64_t
>
(
256
));
int
grid
=
(
size
+
block
-
1
)
/
block
;
Range
<
T
><<<
grid
,
block
,
0
,
stream
>>>
(
start_value
,
step_value
,
size
,
out_data
);
}
}
// namespace phi
PD_REGISTER_KERNEL
(
range
,
GPU
,
ALL_LAYOUT
,
phi
::
RangeKernel
,
float
,
double
,
int64_t
,
int
)
{
kernel
->
InputAt
(
0
).
SetBackend
(
phi
::
Backend
::
CPU
);
kernel
->
InputAt
(
1
).
SetBackend
(
phi
::
Backend
::
CPU
);
kernel
->
InputAt
(
2
).
SetBackend
(
phi
::
Backend
::
CPU
);
}
paddle/phi/kernels/gpu/stack_grad_kernel.cu
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/stack_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
namespace
phi
{
template
<
typename
T
,
typename
IntType
>
__global__
void
UnStackHelperCUDAKernel
(
const
T
*
__restrict__
input
,
int
pre_dim_size
,
int
split_dim_size
,
int
suf_dim_size
,
int
num_split
,
T
**
output_ptrs
)
{
assert
(
blockDim
.
y
==
1
);
assert
(
blockDim
.
z
==
1
);
// In this case they are equal
assert
(
split_dim_size
%
num_split
==
0
);
IntType
size
=
pre_dim_size
*
split_dim_size
*
suf_dim_size
;
IntType
each_dim_size
=
split_dim_size
/
num_split
;
for
(
IntType
offset
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
offset
<
size
;
offset
+=
blockDim
.
x
*
gridDim
.
x
)
{
IntType
i
=
offset
/
(
split_dim_size
*
suf_dim_size
);
IntType
j
=
(
offset
%
(
split_dim_size
*
suf_dim_size
))
/
suf_dim_size
;
IntType
k
=
offset
%
suf_dim_size
;
T
*
output
=
output_ptrs
[
j
/
each_dim_size
];
if
(
output
==
nullptr
)
{
return
;
}
IntType
output_ind
=
i
*
each_dim_size
*
suf_dim_size
+
(
j
%
each_dim_size
)
*
suf_dim_size
+
k
;
*
(
output
+
output_ind
)
=
input
[
offset
];
}
}
template
<
typename
T
,
typename
Context
>
void
StackGradKernel
(
const
Context
&
dev_ctx
,
const
DenseTensor
&
out
,
int
axis
,
std
::
vector
<
DenseTensor
*>
x_grad
)
{
if
(
axis
<
0
)
axis
+=
out
.
dims
().
size
();
int
n
=
out
.
dims
()[
axis
];
PADDLE_ENFORCE_EQ
(
n
,
x_grad
.
size
(),
phi
::
errors
::
InvalidArgument
(
"Output x_grad size should be equal to n, but"
" received n is:%d x_grad size is:%d."
,
n
,
x_grad
.
size
()));
// x_grad is output, so save each data address, then copy each dy into dx_data
std
::
vector
<
T
*>
outputs
(
n
);
for
(
size_t
j
=
0
;
j
<
x_grad
.
size
();
++
j
)
{
if
(
x_grad
[
j
]
==
nullptr
)
{
outputs
[
j
]
=
nullptr
;
continue
;
}
if
(
x_grad
[
j
]
->
numel
()
!=
0UL
)
{
T
*
ptr
=
dev_ctx
.
template
Alloc
<
T
>(
x_grad
[
j
]);
outputs
[
j
]
=
ptr
;
}
else
{
outputs
[
j
]
=
nullptr
;
}
}
auto
dy_data
=
out
.
data
<
T
>
();
// each x_grad should have same shape
int
dy_pre
=
1
,
dy_suf
=
1
;
auto
dy_dims
=
out
.
dims
();
int
split_dim
=
n
;
for
(
int
i
=
0
;
i
<
axis
;
++
i
)
{
dy_pre
*=
dy_dims
[
i
];
}
dy_suf
=
out
.
numel
()
/
(
split_dim
*
dy_pre
);
auto
tmp_out_data
=
paddle
::
memory
::
Alloc
(
dev_ctx
,
outputs
.
size
()
*
sizeof
(
T
*
));
paddle
::
memory
::
Copy
(
dev_ctx
.
GetPlace
(),
tmp_out_data
->
ptr
(),
phi
::
CPUPlace
(),
reinterpret_cast
<
void
*>
(
outputs
.
data
()),
outputs
.
size
()
*
sizeof
(
T
*
),
dev_ctx
.
stream
());
auto
config
=
phi
::
backends
::
gpu
::
GetGpuLaunchConfig1D
(
dev_ctx
,
dy_pre
*
split_dim
*
dy_suf
);
if
(
out
.
numel
()
<
std
::
numeric_limits
<
int32_t
>::
max
())
{
UnStackHelperCUDAKernel
<
T
,
int32_t
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
dy_data
,
dy_pre
,
split_dim
,
dy_suf
,
split_dim
,
reinterpret_cast
<
T
**>
(
tmp_out_data
->
ptr
()));
}
else
{
UnStackHelperCUDAKernel
<
T
,
int64_t
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
dy_data
,
dy_pre
,
split_dim
,
dy_suf
,
split_dim
,
reinterpret_cast
<
T
**>
(
tmp_out_data
->
ptr
()));
}
}
}
// namespace phi
PD_REGISTER_KERNEL
(
stack_grad
,
GPU
,
ALL_LAYOUT
,
phi
::
StackGradKernel
,
float
,
double
,
int64_t
,
int
,
phi
::
dtype
::
float16
,
phi
::
dtype
::
bfloat16
)
{}
paddle/phi/kernels/gpu/stack_kernel.cu
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/stack_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
namespace
phi
{
template
<
typename
T
,
typename
IntType
>
__global__
void
StackCUDAKernel
(
T
**
input_ptrs
,
int
split_size
,
int
rows
,
int
cols
,
T
*
__restrict__
output
)
{
IntType
grid_x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
grid_x
<
cols
;
grid_x
+=
blockDim
.
x
*
gridDim
.
x
)
{
IntType
grid_y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
IntType
split
=
grid_x
/
split_size
;
const
T
*
input_ptr
=
input_ptrs
[
split
];
IntType
col_offset
=
grid_x
%
split_size
;
#pragma unroll
for
(;
grid_y
<
rows
;
grid_y
+=
blockDim
.
y
*
gridDim
.
y
)
{
output
[
grid_y
*
cols
+
grid_x
]
=
input_ptr
[
grid_y
*
split_size
+
col_offset
];
}
}
}
template
<
typename
T
,
typename
Context
>
void
StackKernel
(
const
Context
&
dev_ctx
,
const
std
::
vector
<
const
DenseTensor
*>&
x
,
int
axis
,
DenseTensor
*
out
)
{
if
(
axis
<
0
)
axis
+=
(
x
[
0
]
->
dims
().
size
()
+
1
);
int
n
=
static_cast
<
int
>
(
x
.
size
());
T
*
y_data
=
dev_ctx
.
template
Alloc
<
T
>(
out
);
std
::
vector
<
const
T
*>
x_datas
(
n
);
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
x_datas
[
i
]
=
x
[
i
]
->
data
<
T
>
();
}
auto
tmp_x_data
=
paddle
::
memory
::
Alloc
(
dev_ctx
,
x_datas
.
size
()
*
sizeof
(
T
*
));
paddle
::
memory
::
Copy
(
dev_ctx
.
GetPlace
(),
tmp_x_data
->
ptr
(),
phi
::
CPUPlace
(),
reinterpret_cast
<
void
*>
(
x_datas
.
data
()),
x_datas
.
size
()
*
sizeof
(
T
*
),
dev_ctx
.
stream
());
// Split x dim from axis to matrix
int
x_row
=
1
,
x_col
=
1
;
for
(
int
i
=
0
;
i
<
axis
;
++
i
)
{
x_row
*=
x
[
0
]
->
dims
()[
i
];
}
x_col
=
x
[
0
]
->
numel
()
/
x_row
;
int
out_col
=
x_col
*
n
;
auto
config
=
phi
::
backends
::
gpu
::
GetGpuLaunchConfig2D
(
dev_ctx
,
out_col
,
x_row
);
if
(
out
->
numel
()
<
std
::
numeric_limits
<
int32_t
>::
max
())
{
StackCUDAKernel
<
T
,
int32_t
><<<
config
.
block_per_grid
,
config
.
thread_per_block
,
0
,
dev_ctx
.
stream
()
>>>
(
reinterpret_cast
<
T
**>
(
tmp_x_data
->
ptr
()),
x_col
,
x_row
,
out_col
,
y_data
);
}
else
{
StackCUDAKernel
<
T
,
int64_t
><<<
config
.
block_per_grid
,
config
.
thread_per_block
,
0
,
dev_ctx
.
stream
()
>>>
(
reinterpret_cast
<
T
**>
(
tmp_x_data
->
ptr
()),
x_col
,
x_row
,
out_col
,
y_data
);
}
}
}
// namespace phi
PD_REGISTER_KERNEL
(
stack
,
GPU
,
ALL_LAYOUT
,
phi
::
StackKernel
,
float
,
double
,
int64_t
,
int
,
phi
::
dtype
::
float16
,
phi
::
dtype
::
bfloat16
)
{}
paddle/
fluid/operators/unique_op
.cu
→
paddle/
phi/kernels/gpu/unique_kernel
.cu
浏览文件 @
74894cd7
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unique_kernel.h"
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/adjacent_difference.h>
#include <thrust/adjacent_difference.h>
#include <thrust/device_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/execution_policy.h>
...
@@ -21,12 +24,12 @@ limitations under the License. */
...
@@ -21,12 +24,12 @@ limitations under the License. */
#include <iostream>
#include <iostream>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/tensor_util.h" // TensorToVector()
#include "paddle/fluid/framework/tensor_util.h" // TensorToVector()
#include "paddle/
fluid/operators/unique_op.h" // TransComute()
#include "paddle/
phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace
paddle
{
#include "paddle/phi/kernels/copy_kernel.h"
namespace
operators
{
#include "paddle/phi/kernels/funcs/unique_functor.h"
using
Tensor
=
framework
::
Tensor
;
namespace
phi
{
// Binary function 'less than'
// Binary function 'less than'
template
<
typename
InT
>
template
<
typename
InT
>
...
@@ -93,10 +96,12 @@ struct BinaryNotEqual {
...
@@ -93,10 +96,12 @@ struct BinaryNotEqual {
}
}
};
};
// index_select() function for Tensor
// index_select() function for DenseTensor
template
<
typename
InT
,
typename
IndexT
>
template
<
typename
Context
,
typename
InT
,
typename
IndexT
>
void
IndexSelect
(
const
framework
::
ExecutionContext
&
context
,
void
IndexSelect
(
const
Context
&
context
,
const
Tensor
&
input
,
const
Tensor
&
index
,
Tensor
*
output
,
const
DenseTensor
&
input
,
const
DenseTensor
&
index
,
DenseTensor
*
output
,
int
dim
)
{
int
dim
)
{
auto
input_dim
=
input
.
dims
();
auto
input_dim
=
input
.
dims
();
auto
input_dim_size
=
input_dim
.
size
();
auto
input_dim_size
=
input_dim
.
size
();
...
@@ -119,27 +124,29 @@ void IndexSelect(const framework::ExecutionContext& context,
...
@@ -119,27 +124,29 @@ void IndexSelect(const framework::ExecutionContext& context,
std
::
vector
<
InT
>
input_vec
;
std
::
vector
<
InT
>
input_vec
;
std
::
vector
<
IndexT
>
index_vec
;
std
::
vector
<
IndexT
>
index_vec
;
paddle
::
framework
::
TensorToVector
(
input
,
context
.
device_context
(),
paddle
::
framework
::
TensorToVector
(
input
,
context
,
&
input_vec
);
&
input_vec
);
paddle
::
framework
::
TensorToVector
(
index
,
context
,
&
index_vec
);
paddle
::
framework
::
TensorToVector
(
index
,
context
.
device_context
(),
&
index_vec
);
std
::
vector
<
InT
>
out_vec
(
output
->
numel
());
std
::
vector
<
InT
>
out_vec
(
output
->
numel
());
for
(
int
i
=
0
;
i
<
index_size
;
i
++
)
{
for
(
int
i
=
0
;
i
<
index_size
;
i
++
)
{
PADDLE_ENFORCE_GE
(
PADDLE_ENFORCE_GE
(
index_vec
[
i
],
0
,
index_vec
[
i
],
platform
::
errors
::
InvalidArgument
(
0
,
phi
::
errors
::
InvalidArgument
(
"Variable value (index) of OP(index_select) "
"Variable value (index) of OP(index_select) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value."
,
"value."
,
input_dim
[
dim
],
index_vec
[
i
]));
input_dim
[
dim
],
index_vec
[
i
]));
PADDLE_ENFORCE_LT
(
PADDLE_ENFORCE_LT
(
index_vec
[
i
],
input_dim
[
dim
],
index_vec
[
i
],
platform
::
errors
::
InvalidArgument
(
input_dim
[
dim
],
phi
::
errors
::
InvalidArgument
(
"Variable value (index) of OP(index_select) "
"Variable value (index) of OP(index_select) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value."
,
"value."
,
input_dim
[
dim
],
index_vec
[
i
]));
input_dim
[
dim
],
index_vec
[
i
]));
}
}
for
(
auto
i
=
0
;
i
<
outer_nums
;
i
++
)
{
for
(
auto
i
=
0
;
i
<
outer_nums
;
i
++
)
{
...
@@ -154,150 +161,193 @@ void IndexSelect(const framework::ExecutionContext& context,
...
@@ -154,150 +161,193 @@ void IndexSelect(const framework::ExecutionContext& context,
}
}
}
}
}
}
output
->
mutable_data
<
InT
>
(
context
.
GetPlace
()
);
context
.
template
Alloc
<
IndexT
>(
output
);
framework
::
TensorFromVector
(
out_vec
,
context
.
device_context
()
,
output
);
paddle
::
framework
::
TensorFromVector
(
out_vec
,
context
,
output
);
output
->
Resize
(
output_dim
);
output
->
Resize
(
output_dim
);
}
}
// The core logic of computing Unique for a flattend Tensor
// The core logic of computing Unique for a flattend DenseTensor
template
<
typename
InT
,
typename
IndexT
,
typename
equal_T
,
typename
not_equal_T
>
template
<
typename
Context
,
static
void
UniqueFlattendCUDATensor
(
const
framework
::
ExecutionContext
&
context
,
typename
InT
,
const
Tensor
&
in
,
Tensor
*
out
,
typename
IndexT
,
bool
return_index
,
bool
return_inverse
,
typename
equal_T
,
bool
return_counts
,
equal_T
equal
,
typename
not_equal_T
>
not_equal_T
not_equal
,
int64_t
num_input
)
{
static
void
UniqueFlattendCUDATensor
(
const
Context
&
context
,
const
DenseTensor
&
in
,
DenseTensor
*
out
,
DenseTensor
*
indices
,
DenseTensor
*
index
,
DenseTensor
*
counts
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
,
equal_T
equal
,
not_equal_T
not_equal
,
int64_t
num_input
)
{
// 0. Prepration
// 0. Prepration
Tensor
in_hat
;
DenseTensor
in_hat
;
framework
::
TensorCopy
(
in
,
context
.
GetPlace
(),
&
in_hat
);
phi
::
Copy
(
context
,
in
,
context
.
GetPlace
(),
false
,
&
in_hat
);
auto
in_data_hat
=
in_hat
.
mutable_data
<
InT
>
(
context
.
GetPlace
());
auto
*
in_data_hat
=
context
.
template
Alloc
<
InT
>(
&
in_hat
);
Tensor
*
sorted_indices
=
context
.
Output
<
Tensor
>
(
"Indices"
);
indices
->
Resize
(
phi
::
make_ddim
({
num_input
}));
sorted_indices
->
Resize
(
phi
::
make_ddim
({
num_input
}));
auto
*
indices_data
=
context
.
template
Alloc
<
IndexT
>(
indices
);
auto
sorted_indices_data
=
sorted_indices
->
mutable_data
<
IndexT
>
(
context
.
GetPlace
());
thrust
::
sequence
(
thrust
::
device
,
indices_data
,
indices_data
+
num_input
);
thrust
::
sequence
(
thrust
::
device
,
sorted_indices_data
,
thrust
::
sort_by_key
(
sorted_indices_data
+
num_input
);
thrust
::
device
,
in_data_hat
,
in_data_hat
+
num_input
,
indices_data
);
thrust
::
sort_by_key
(
thrust
::
device
,
in_data_hat
,
in_data_hat
+
num_input
,
sorted_indices_data
);
// 1. Calculate op result: 'out'
// 1. Calculate op result: 'out'
Tensor
range
;
Dense
Tensor
range
;
range
.
Resize
(
phi
::
make_ddim
({
num_input
+
1
}));
range
.
Resize
(
phi
::
make_ddim
({
num_input
+
1
}));
auto
range_data_ptr
=
range
.
mutable_data
<
IndexT
>
(
context
.
GetPlace
()
);
auto
*
range_data_ptr
=
context
.
template
Alloc
<
IndexT
>(
&
range
);
thrust
::
sequence
(
thrust
::
device
,
range_data_ptr
,
thrust
::
sequence
(
range_data_ptr
+
num_input
+
1
);
thrust
::
device
,
range_data_ptr
,
range_data_ptr
+
num_input
+
1
);
framework
::
TensorCopy
(
in_hat
,
context
.
GetPlace
()
,
out
);
phi
::
Copy
(
context
,
in_hat
,
context
.
GetPlace
(),
false
,
out
);
int
num_out
;
int
num_out
;
auto
out_data
=
out
->
mutable_data
<
InT
>
(
context
.
GetPlace
());
auto
out_data
=
context
.
template
Alloc
<
InT
>(
out
);
num_out
=
thrust
::
unique_by_key
(
thrust
::
device
,
out_data
,
num_out
=
out_data
+
num_input
,
range_data_ptr
,
equal
)
thrust
::
unique_by_key
(
.
first
-
thrust
::
device
,
out_data
,
out_data
+
num_input
,
range_data_ptr
,
equal
)
out_data
;
.
first
-
out_data
;
out
->
Resize
(
phi
::
make_ddim
({
num_out
}));
out
->
Resize
(
phi
::
make_ddim
({
num_out
}));
// 3. Calculate inverse index: 'inverse'
// 3. Calculate inverse index: 'inverse'
if
(
return_inverse
)
{
if
(
return_inverse
)
{
Tensor
*
inverse
=
context
.
Output
<
Tensor
>
(
"Index"
);
index
->
Resize
(
phi
::
make_ddim
({
num_input
}));
inverse
->
Resize
(
phi
::
make_ddim
({
num_input
}));
auto
*
inverse_data
=
context
.
template
Alloc
<
IndexT
>(
index
);
auto
inverse_data
=
inverse
->
mutable_data
<
IndexT
>
(
context
.
GetPlace
());
DenseTensor
inv_loc
;
Tensor
inv_loc
;
inv_loc
.
Resize
(
phi
::
make_ddim
({
num_input
}));
inv_loc
.
Resize
(
phi
::
make_ddim
({
num_input
}));
auto
inv_loc_data_ptr
=
inv_loc
.
mutable_data
<
IndexT
>
(
context
.
GetPlace
());
auto
inv_loc_data_ptr
=
context
.
template
Alloc
<
IndexT
>(
&
inv_loc
);
thrust
::
adjacent_difference
(
thrust
::
device
,
in_data_hat
,
thrust
::
adjacent_difference
(
thrust
::
device
,
in_data_hat
+
num_input
,
inv_loc_data_ptr
,
in_data_hat
,
in_data_hat
+
num_input
,
inv_loc_data_ptr
,
not_equal
);
not_equal
);
thrust
::
device_ptr
<
IndexT
>
inv_loc_data_dev
(
inv_loc_data_ptr
);
thrust
::
device_ptr
<
IndexT
>
inv_loc_data_dev
(
inv_loc_data_ptr
);
inv_loc_data_dev
[
0
]
=
0
;
// without device_ptr, segmentation fault
inv_loc_data_dev
[
0
]
=
0
;
// without device_ptr, segmentation fault
thrust
::
inclusive_scan
(
thrust
::
device
,
inv_loc_data_ptr
,
thrust
::
inclusive_scan
(
thrust
::
device
,
inv_loc_data_ptr
+
num_input
,
inv_loc_data_ptr
);
inv_loc_data_ptr
,
thrust
::
scatter
(
thrust
::
device
,
inv_loc_data_ptr
,
inv_loc_data_ptr
+
num_input
,
inv_loc_data_ptr
+
num_input
,
sorted_indices_data
,
inv_loc_data_ptr
);
thrust
::
scatter
(
thrust
::
device
,
inv_loc_data_ptr
,
inv_loc_data_ptr
+
num_input
,
indices_data
,
inverse_data
);
inverse_data
);
}
}
// 2. Calculate sorted index: '
sorted_
indices'
// 2. Calculate sorted index: 'indices'
if
(
return_index
)
{
if
(
return_index
)
{
Tensor
indices
;
DenseTensor
tmp_indices
;
indices
.
Resize
(
phi
::
make_ddim
({
num_input
}));
tmp_indices
.
Resize
(
phi
::
make_ddim
({
num_input
}));
auto
indices_data_ptr
=
indices
.
mutable_data
<
IndexT
>
(
context
.
GetPlace
());
auto
*
tmp_indices_data_ptr
=
context
.
template
Alloc
<
IndexT
>(
&
tmp_indices
);
thrust
::
copy
(
thrust
::
device
,
in_data_hat
,
in_data_hat
+
num_input
,
thrust
::
copy
(
thrust
::
device
,
indices_data_ptr
);
in_data_hat
,
thrust
::
unique_by_key
(
thrust
::
device
,
indices_data_ptr
,
in_data_hat
+
num_input
,
indices_data_ptr
+
num_input
,
sorted_indices_data
,
tmp_indices_data_ptr
);
thrust
::
unique_by_key
(
thrust
::
device
,
tmp_indices_data_ptr
,
tmp_indices_data_ptr
+
num_input
,
indices_data
,
equal
);
equal
);
sorted_
indices
->
Resize
(
phi
::
make_ddim
({
num_out
}));
indices
->
Resize
(
phi
::
make_ddim
({
num_out
}));
}
}
// 4. Calculate 'counts'
// 4. Calculate 'counts'
if
(
return_counts
)
{
if
(
return_counts
)
{
Tensor
*
counts
=
context
.
Output
<
Tensor
>
(
"Counts"
);
counts
->
Resize
(
phi
::
make_ddim
({
num_out
}));
counts
->
Resize
(
phi
::
make_ddim
({
num_out
}));
auto
count_data
=
co
unts
->
mutable_data
<
IndexT
>
(
context
.
GetPlace
()
);
auto
count_data
=
co
ntext
.
template
Alloc
<
IndexT
>(
counts
);
// init 'count_data' as 0
// init 'count_data' as 0
thrust
::
fill
(
thrust
::
device
,
count_data
,
count_data
+
num_out
,
0
);
thrust
::
fill
(
thrust
::
device
,
count_data
,
count_data
+
num_out
,
0
);
thrust
::
device_ptr
<
IndexT
>
range_data_ptr_dev
(
range_data_ptr
);
thrust
::
device_ptr
<
IndexT
>
range_data_ptr_dev
(
range_data_ptr
);
range_data_ptr_dev
[
num_out
]
=
num_input
;
range_data_ptr_dev
[
num_out
]
=
num_input
;
thrust
::
adjacent_difference
(
thrust
::
device
,
range_data_ptr
+
1
,
thrust
::
adjacent_difference
(
thrust
::
device
,
range_data_ptr
+
num_out
+
1
,
count_data
);
range_data_ptr
+
1
,
range_data_ptr
+
num_out
+
1
,
count_data
);
}
}
}
}
// The logic of compute unique with axis required, it's a little different
// The logic of compute unique with axis required, it's a little different
// from above function
// from above function
template
<
typename
InT
,
typename
IndexT
,
typename
equal_T
,
typename
not_equal_T
>
template
<
typename
Context
,
static
void
ComputeUniqueDims
(
const
framework
::
ExecutionContext
&
context
,
typename
InT
,
Tensor
*
sorted_indices
,
typename
IndexT
,
IndexT
*
sorted_indices_data
,
Tensor
*
out
,
typename
equal_T
,
bool
return_index
,
bool
return_inverse
,
typename
not_equal_T
>
bool
return_counts
,
equal_T
equal
,
static
void
ComputeUniqueDims
(
const
Context
&
context
,
not_equal_T
not_equal
,
int64_t
row
)
{
DenseTensor
*
sorted_indices
,
IndexT
*
sorted_indices_data
,
DenseTensor
*
out
,
DenseTensor
*
inverse
,
DenseTensor
*
counts
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
,
equal_T
equal
,
not_equal_T
not_equal
,
int64_t
row
)
{
// 1. inverse indices: 'inverse'
// 1. inverse indices: 'inverse'
Tensor
*
inverse
=
context
.
Output
<
Tensor
>
(
"Index"
);
inverse
->
Resize
(
phi
::
make_ddim
({
row
}));
inverse
->
Resize
(
phi
::
make_ddim
({
row
}));
auto
inverse_data
=
inverse
->
mutable_data
<
IndexT
>
(
context
.
GetPlace
()
);
auto
*
inverse_data
=
context
.
template
Alloc
<
IndexT
>(
inverse
);
Tensor
inv_loc
;
Dense
Tensor
inv_loc
;
inv_loc
.
Resize
(
phi
::
make_ddim
({
row
}));
inv_loc
.
Resize
(
phi
::
make_ddim
({
row
}));
auto
inv_loc_data_ptr
=
inv_loc
.
mutable_data
<
IndexT
>
(
context
.
GetPlace
());
auto
inv_loc_data_ptr
=
context
.
template
Alloc
<
IndexT
>(
&
inv_loc
);
thrust
::
adjacent_difference
(
thrust
::
device
,
sorted_indices_data
,
thrust
::
adjacent_difference
(
thrust
::
device
,
sorted_indices_data
+
row
,
inv_loc_data_ptr
,
sorted_indices_data
,
sorted_indices_data
+
row
,
inv_loc_data_ptr
,
not_equal
);
not_equal
);
thrust
::
device_ptr
<
IndexT
>
inv_loc_data_dev
(
inv_loc_data_ptr
);
thrust
::
device_ptr
<
IndexT
>
inv_loc_data_dev
(
inv_loc_data_ptr
);
inv_loc_data_dev
[
0
]
=
0
;
inv_loc_data_dev
[
0
]
=
0
;
thrust
::
inclusive_scan
(
thrust
::
device
,
inv_loc_data_ptr
,
thrust
::
inclusive_scan
(
thrust
::
device
,
inv_loc_data_ptr
+
row
,
inv_loc_data_ptr
);
inv_loc_data_ptr
,
thrust
::
scatter
(
thrust
::
device
,
inv_loc_data_ptr
,
inv_loc_data_ptr
+
row
,
inv_loc_data_ptr
+
row
,
sorted_indices_data
,
inverse_data
);
inv_loc_data_ptr
);
thrust
::
scatter
(
thrust
::
device
,
inv_loc_data_ptr
,
inv_loc_data_ptr
+
row
,
sorted_indices_data
,
inverse_data
);
// 2. sorted indices
// 2. sorted indices
Tensor
range
;
Dense
Tensor
range
;
range
.
Resize
(
phi
::
make_ddim
({
row
+
1
}));
range
.
Resize
(
phi
::
make_ddim
({
row
+
1
}));
auto
range_data_ptr
=
range
.
mutable_data
<
IndexT
>
(
context
.
GetPlace
()
);
auto
range_data_ptr
=
context
.
template
Alloc
<
IndexT
>(
&
range
);
thrust
::
sequence
(
thrust
::
device
,
range_data_ptr
,
range_data_ptr
+
row
+
1
);
thrust
::
sequence
(
thrust
::
device
,
range_data_ptr
,
range_data_ptr
+
row
+
1
);
int
num_out
;
int
num_out
;
num_out
=
num_out
=
thrust
::
unique_by_key
(
thrust
::
device
,
thrust
::
unique_by_key
(
thrust
::
device
,
sorted_indices_data
,
sorted_indices_data
,
sorted_indices_data
+
row
,
range_data_ptr
,
equal
)
sorted_indices_data
+
row
,
.
first
-
range_data_ptr
,
sorted_indices_data
;
equal
)
.
first
-
sorted_indices_data
;
thrust
::
device_ptr
<
IndexT
>
range_data_ptr_dev
(
range_data_ptr
);
thrust
::
device_ptr
<
IndexT
>
range_data_ptr_dev
(
range_data_ptr
);
range_data_ptr_dev
[
num_out
]
=
row
;
range_data_ptr_dev
[
num_out
]
=
row
;
sorted_indices
->
Resize
(
phi
::
make_ddim
({
num_out
}));
sorted_indices
->
Resize
(
phi
::
make_ddim
({
num_out
}));
// 3. counts: 'counts'
// 3. counts: 'counts'
Tensor
*
counts
=
context
.
Output
<
Tensor
>
(
"Counts"
);
counts
->
Resize
(
phi
::
make_ddim
({
num_out
}));
counts
->
Resize
(
phi
::
make_ddim
({
num_out
}));
auto
count_data
=
counts
->
mutable_data
<
IndexT
>
(
context
.
GetPlace
()
);
auto
*
count_data
=
context
.
template
Alloc
<
IndexT
>(
counts
);
thrust
::
fill
(
thrust
::
device
,
count_data
,
count_data
+
row
,
0
);
thrust
::
fill
(
thrust
::
device
,
count_data
,
count_data
+
row
,
0
);
thrust
::
adjacent_difference
(
thrust
::
device
,
range_data_ptr
+
1
,
thrust
::
adjacent_difference
(
range_data_ptr
+
row
+
1
,
count_data
);
thrust
::
device
,
range_data_ptr
+
1
,
range_data_ptr
+
row
+
1
,
count_data
);
}
}
// Calculate unique when 'axis' is set
// Calculate unique when 'axis' is set
template
<
typename
DeviceContext
,
typename
InT
,
typename
IndexT
>
template
<
typename
Context
,
typename
InT
,
typename
IndexT
>
static
void
UniqueDimsCUDATensor
(
const
framework
::
ExecutionContext
&
context
,
static
void
UniqueDimsCUDATensor
(
const
Context
&
context
,
const
Tensor
&
in
,
Tensor
*
out
,
const
DenseTensor
&
in
,
bool
return_index
,
bool
return_inverse
,
DenseTensor
*
out
,
bool
return_counts
,
int
axis
)
{
DenseTensor
*
indices
,
DenseTensor
*
index
,
DenseTensor
*
counts
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
,
int
axis
)
{
// 1. Transpose & reshape
// 1. Transpose & reshape
// Transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2]
// Transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2]
std
::
vector
<
int
>
permute
(
in
.
dims
().
size
());
std
::
vector
<
int
>
permute
(
in
.
dims
().
size
());
...
@@ -307,19 +357,19 @@ static void UniqueDimsCUDATensor(const framework::ExecutionContext& context,
...
@@ -307,19 +357,19 @@ static void UniqueDimsCUDATensor(const framework::ExecutionContext& context,
std
::
vector
<
int64_t
>
in_trans_dims_vec
(
phi
::
vectorize
(
in
.
dims
()));
std
::
vector
<
int64_t
>
in_trans_dims_vec
(
phi
::
vectorize
(
in
.
dims
()));
in_trans_dims_vec
[
axis
]
=
in
.
dims
()[
0
];
in_trans_dims_vec
[
axis
]
=
in
.
dims
()[
0
];
in_trans_dims_vec
[
0
]
=
in
.
dims
()[
axis
];
in_trans_dims_vec
[
0
]
=
in
.
dims
()[
axis
];
framework
::
Tensor
in_trans
;
Dense
Tensor
in_trans
;
framework
::
DDim
in_trans_dims
=
phi
::
make_ddim
(
in_trans_dims_vec
);
auto
in_trans_dims
=
phi
::
make_ddim
(
in_trans_dims_vec
);
in_trans
.
Resize
(
in_trans_dims
);
in_trans
.
Resize
(
in_trans_dims
);
in_trans
.
mutable_data
<
InT
>
(
context
.
GetPlace
()
);
context
.
template
Alloc
<
InT
>(
&
in_trans
);
auto
&
dev_ctx
=
context
.
cuda_device_context
();
phi
::
funcs
::
TransCompute
<
Context
,
InT
>
(
TransCompute
<
DeviceContext
,
InT
>
(
in
.
dims
().
size
(),
// num of dims
in
.
dims
().
size
(),
// num of dims
dev_ctx
,
// device
context
,
// device
in
,
// original
Tensor
in
,
// original Dense
Tensor
&
in_trans
,
//
Tensor after reshape
&
in_trans
,
// Dense
Tensor after reshape
permute
);
// index of axis
permute
);
// index of axis
// Reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
// Reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
framework
::
DDim
in_trans_flat_dims
=
phi
::
flatten_to_2d
(
in_trans_dims
,
1
);
auto
in_trans_flat_dims
=
phi
::
flatten_to_2d
(
in_trans_dims
,
1
);
in_trans
.
Resize
(
in_trans_flat_dims
);
in_trans
.
Resize
(
in_trans_flat_dims
);
// now 'in_trans' is 2D
// now 'in_trans' is 2D
...
@@ -327,87 +377,129 @@ static void UniqueDimsCUDATensor(const framework::ExecutionContext& context,
...
@@ -327,87 +377,129 @@ static void UniqueDimsCUDATensor(const framework::ExecutionContext& context,
int64_t
row
=
in_trans
.
dims
()[
0
];
int64_t
row
=
in_trans
.
dims
()[
0
];
const
InT
*
in_trans_data
=
in_trans
.
data
<
InT
>
();
const
InT
*
in_trans_data
=
in_trans
.
data
<
InT
>
();
Tensor
*
sorted_indices
=
context
.
Output
<
Tensor
>
(
"Indices"
);
indices
->
Resize
(
phi
::
make_ddim
({
row
}));
sorted_indices
->
Resize
(
phi
::
make_ddim
({
row
}));
auto
*
sorted_indices_data
=
context
.
template
Alloc
<
IndexT
>(
indices
);
auto
sorted_indices_data
=
sorted_indices
->
mutable_data
<
IndexT
>
(
context
.
GetPlace
());
// 2. Calculate '
sorted_
indices', 'inverse', 'counts'
// 2. Calculate 'indices', 'inverse', 'counts'
// Init index and sort
// Init index and sort
thrust
::
sequence
(
thrust
::
device
,
sorted_indices_data
,
thrust
::
sequence
(
sorted_indices_data
+
row
);
thrust
::
device
,
sorted_indices_data
,
sorted_indices_data
+
row
);
thrust
::
sort
(
thrust
::
device
,
sorted_indices_data
,
sorted_indices_data
+
row
,
thrust
::
sort
(
thrust
::
device
,
sorted_indices_data
,
sorted_indices_data
+
row
,
LessThan
<
InT
>
(
col
,
in_trans_data
));
LessThan
<
InT
>
(
col
,
in_trans_data
));
ComputeUniqueDims
<
InT
,
IndexT
>
(
ComputeUniqueDims
<
Context
,
InT
,
IndexT
>
(
context
,
sorted_indices
,
sorted_indices_data
,
out
,
return_index
,
context
,
return_inverse
,
return_counts
,
BinaryEqual
<
InT
>
(
col
,
in_trans_data
),
indices
,
BinaryNotEqual
<
InT
>
(
col
,
in_trans_data
),
row
);
sorted_indices_data
,
out
,
index
,
counts
,
return_index
,
return_inverse
,
return_counts
,
BinaryEqual
<
InT
>
(
col
,
in_trans_data
),
BinaryNotEqual
<
InT
>
(
col
,
in_trans_data
),
row
);
// 3. Select indices and reshape back to get 'out'
// 3. Select indices and reshape back to get 'out'
Tensor
out_trans
;
Dense
Tensor
out_trans
;
std
::
vector
<
int64_t
>
out_trans_dims_vec
=
in_trans_dims_vec
;
std
::
vector
<
int64_t
>
out_trans_dims_vec
=
in_trans_dims_vec
;
out_trans_dims_vec
[
0
]
=
sorted_
indices
->
numel
();
out_trans_dims_vec
[
0
]
=
indices
->
numel
();
out_trans
.
Resize
(
phi
::
make_ddim
(
out_trans_dims_vec
));
out_trans
.
Resize
(
phi
::
make_ddim
(
out_trans_dims_vec
));
out_trans
.
mutable_data
<
InT
>
(
context
.
GetPlace
()
);
context
.
template
Alloc
<
InT
>(
&
out_trans
);
IndexSelect
<
InT
,
IndexT
>
(
context
,
in_trans
,
*
sorted_
indices
,
&
out_trans
,
0
);
IndexSelect
<
Context
,
InT
,
IndexT
>
(
context
,
in_trans
,
*
indices
,
&
out_trans
,
0
);
std
::
swap
(
out_trans_dims_vec
[
0
],
out_trans_dims_vec
[
axis
]);
std
::
swap
(
out_trans_dims_vec
[
0
],
out_trans_dims_vec
[
axis
]);
out
->
Resize
(
phi
::
make_ddim
(
out_trans_dims_vec
));
out
->
Resize
(
phi
::
make_ddim
(
out_trans_dims_vec
));
out
->
mutable_data
<
InT
>
(
context
.
GetPlace
()
);
context
.
template
Alloc
<
InT
>(
out
);
std
::
vector
<
framework
::
Tensor
>
out_trans_unbind
=
Unbind
(
out_trans
);
std
::
vector
<
DenseTensor
>
out_trans_unbind
=
phi
::
funcs
::
Unbind
(
out_trans
);
math
::
ConcatFunctor
<
Device
Context
,
InT
>
concat_functor
;
phi
::
funcs
::
ConcatFunctor
<
Context
,
InT
>
concat_functor
;
concat_functor
(
dev_ctx
,
out_trans_unbind
,
0
,
&
out_trans
);
concat_functor
(
context
,
out_trans_unbind
,
0
,
&
out_trans
);
TransCompute
<
DeviceContext
,
InT
>
(
out_trans
.
dims
().
size
(),
dev_ctx
,
out_trans
,
phi
::
funcs
::
TransCompute
<
Context
,
InT
>
(
out
,
permute
);
out_trans
.
dims
().
size
(),
context
,
out_trans
,
out
,
permute
);
}
}
// functor for processing a flattend Tensor
// functor for processing a flattend
Dense
Tensor
template
<
typename
Device
Context
,
typename
InT
>
template
<
typename
Context
,
typename
InT
>
struct
UniqueFlattendCUDAFunctor
{
struct
UniqueFlattendCUDAFunctor
{
const
framework
::
ExecutionContext
&
ctx_
;
const
Context
&
ctx_
;
const
Tensor
&
in_
;
const
DenseTensor
&
in_
;
Tensor
*
out_
;
DenseTensor
*
out_
;
DenseTensor
*
indices_
;
DenseTensor
*
index_
;
DenseTensor
*
counts_
;
const
bool
return_index_
;
const
bool
return_index_
;
const
bool
return_inverse_
;
const
bool
return_inverse_
;
const
bool
return_counts_
;
const
bool
return_counts_
;
UniqueFlattendCUDAFunctor
(
const
framework
::
ExecutionContext
&
context
,
UniqueFlattendCUDAFunctor
(
const
Context
&
context
,
const
Tensor
&
in
,
Tensor
*
out
,
bool
return_index
,
const
DenseTensor
&
in
,
bool
return_inverse
,
bool
return_counts
)
DenseTensor
*
out
,
DenseTensor
*
indices
,
DenseTensor
*
index
,
DenseTensor
*
counts
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
)
:
ctx_
(
context
),
:
ctx_
(
context
),
in_
(
in
),
in_
(
in
),
out_
(
out
),
out_
(
out
),
indices_
(
indices
),
index_
(
index
),
counts_
(
counts
),
return_index_
(
return_index
),
return_index_
(
return_index
),
return_inverse_
(
return_inverse
),
return_inverse_
(
return_inverse
),
return_counts_
(
return_counts
)
{}
return_counts_
(
return_counts
)
{}
template
<
typename
IndexT
>
template
<
typename
IndexT
>
void
apply
()
const
{
void
apply
()
const
{
UniqueFlattendCUDATensor
<
InT
,
IndexT
>
(
UniqueFlattendCUDATensor
<
Context
,
InT
,
IndexT
>
(
ctx_
,
ctx_
,
in_
,
out_
,
return_index_
,
return_inverse_
,
return_counts_
,
in_
,
thrust
::
equal_to
<
InT
>
(),
thrust
::
not_equal_to
<
InT
>
(),
in_
.
numel
());
out_
,
indices_
,
index_
,
counts_
,
return_index_
,
return_inverse_
,
return_counts_
,
thrust
::
equal_to
<
InT
>
(),
thrust
::
not_equal_to
<
InT
>
(),
in_
.
numel
());
}
}
};
};
// functor for processing a multi-dimentional Tensor
// functor for processing a multi-dimentional
Dense
Tensor
template
<
typename
Device
Context
,
typename
InT
>
template
<
typename
Context
,
typename
InT
>
struct
UniqueDimsCUDAFunctor
{
struct
UniqueDimsCUDAFunctor
{
const
framework
::
ExecutionContext
&
ctx_
;
const
Context
&
ctx_
;
const
Tensor
&
in_
;
const
DenseTensor
&
in_
;
Tensor
*
out_
;
DenseTensor
*
out_
;
DenseTensor
*
indices_
;
DenseTensor
*
index_
;
DenseTensor
*
counts_
;
const
int
axis_
;
const
int
axis_
;
const
bool
return_index_
;
const
bool
return_index_
;
const
bool
return_inverse_
;
const
bool
return_inverse_
;
const
bool
return_counts_
;
const
bool
return_counts_
;
UniqueDimsCUDAFunctor
(
const
framework
::
ExecutionContext
&
context
,
UniqueDimsCUDAFunctor
(
const
Context
&
context
,
const
Tensor
&
in
,
Tensor
*
out
,
const
int
axis
,
const
DenseTensor
&
in
,
bool
return_index
,
bool
return_inverse
,
DenseTensor
*
out
,
DenseTensor
*
indices
,
DenseTensor
*
index
,
DenseTensor
*
counts
,
const
int
axis
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
)
bool
return_counts
)
:
ctx_
(
context
),
:
ctx_
(
context
),
in_
(
in
),
in_
(
in
),
out_
(
out
),
out_
(
out
),
indices_
(
indices
),
index_
(
index
),
counts_
(
counts
),
axis_
(
axis
),
axis_
(
axis
),
return_index_
(
return_index
),
return_index_
(
return_index
),
return_inverse_
(
return_inverse
),
return_inverse_
(
return_inverse
),
...
@@ -415,60 +507,109 @@ struct UniqueDimsCUDAFunctor {
...
@@ -415,60 +507,109 @@ struct UniqueDimsCUDAFunctor {
template
<
typename
IndexT
>
template
<
typename
IndexT
>
void
apply
()
const
{
void
apply
()
const
{
UniqueDimsCUDATensor
<
DeviceContext
,
InT
,
IndexT
>
(
UniqueDimsCUDATensor
<
Context
,
InT
,
IndexT
>
(
ctx_
,
ctx_
,
in_
,
out_
,
return_index_
,
return_inverse_
,
return_counts_
,
axis_
);
in_
,
out_
,
indices_
,
index_
,
counts_
,
return_index_
,
return_inverse_
,
return_counts_
,
axis_
);
}
}
};
};
// Unique_op CUDA implementation.
template
<
typename
T
,
typename
Context
>
template
<
typename
InT
>
void
UniqueRawKernel
(
const
Context
&
context
,
class
UniqueKernel
<
platform
::
CUDADeviceContext
,
InT
>
const
DenseTensor
&
x
,
:
public
framework
::
OpKernel
<
InT
>
{
bool
return_index
,
public:
bool
return_inverse
,
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
bool
return_counts
,
auto
*
x
=
context
.
Input
<
framework
::
Tensor
>
(
"X"
);
const
std
::
vector
<
int
>&
axis
,
auto
*
out
=
context
.
Output
<
framework
::
Tensor
>
(
"Out"
);
DataType
dtype
,
auto
data_type
=
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
bool
is_sorted
,
context
.
Attr
<
int
>
(
"dtype"
));
DenseTensor
*
out
,
if
(
data_type
==
framework
::
proto
::
VarType
::
INT32
)
{
DenseTensor
*
indices
,
PADDLE_ENFORCE_LE
(
DenseTensor
*
index
,
x
->
numel
()
+
1
,
INT_MAX
,
DenseTensor
*
counts
)
{
platform
::
errors
::
InvalidArgument
(
if
(
dtype
==
phi
::
DataType
::
INT32
)
{
"The number of elements in Input(X) should be less than or "
PADDLE_ENFORCE_LE
(
"equal to INT_MAX, but received num is %d. Please set `dtype` to "
x
.
numel
()
+
1
,
"int64."
,
INT_MAX
,
x
->
numel
()));
phi
::
errors
::
InvalidArgument
(
}
"The number of elements in Input(X) should be less than or "
"equal to INT_MAX, but received num is %d. Please set `dtype` to "
std
::
vector
<
int
>
axis_vec
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"axis"
);
"int64."
,
bool
return_index
=
context
.
Attr
<
bool
>
(
"return_index"
);
x
.
numel
()));
bool
return_inverse
=
context
.
Attr
<
bool
>
(
"return_inverse"
);
bool
return_counts
=
context
.
Attr
<
bool
>
(
"return_counts"
);
// if 'axis' is not required, flatten the Tensor.
if
(
axis_vec
.
empty
())
{
framework
::
VisitDataTypeTiny
(
data_type
,
UniqueFlattendCUDAFunctor
<
platform
::
CUDADeviceContext
,
InT
>
(
context
,
*
x
,
out
,
return_index
,
return_inverse
,
return_counts
));
}
else
{
// 'axis' is required.
int
axis
=
axis_vec
[
0
];
framework
::
VisitDataTypeTiny
(
data_type
,
UniqueDimsCUDAFunctor
<
platform
::
CUDADeviceContext
,
InT
>
(
context
,
*
x
,
out
,
axis
,
return_index
,
return_inverse
,
return_counts
));
}
}
}
};
// if 'axis' is not required, flatten the DenseTensor.
if
(
axis
.
empty
())
{
phi
::
VisitDataTypeTiny
(
dtype
,
UniqueFlattendCUDAFunctor
<
Context
,
T
>
(
context
,
x
,
out
,
indices
,
index
,
counts
,
return_index
,
return_inverse
,
return_counts
));
}
else
{
// 'axis' is required.
int
axis_value
=
axis
[
0
];
phi
::
VisitDataTypeTiny
(
dtype
,
UniqueDimsCUDAFunctor
<
Context
,
T
>
(
context
,
x
,
out
,
indices
,
index
,
counts
,
axis_value
,
return_index
,
return_inverse
,
return_counts
));
}
}
template
<
typename
T
,
typename
Context
>
void
UniqueKernel
(
const
Context
&
context
,
const
DenseTensor
&
x
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
,
const
std
::
vector
<
int
>&
axis
,
DataType
dtype
,
DenseTensor
*
out
,
DenseTensor
*
indices
,
DenseTensor
*
index
,
DenseTensor
*
counts
)
{
bool
is_sorted
=
true
;
UniqueRawKernel
<
T
,
Context
>
(
context
,
x
,
return_index
,
return_inverse
,
return_counts
,
axis
,
dtype
,
is_sorted
,
out
,
indices
,
index
,
counts
);
}
}
// namespace operators
}
// namespace phi
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
PD_REGISTER_KERNEL
(
unique
,
GPU
,
ALL_LAYOUT
,
phi
::
UniqueKernel
,
float
,
double
,
int64_t
,
int
)
{}
REGISTER_OP_CUDA_KERNEL
(
PD_REGISTER_KERNEL
(
unique_raw
,
unique
,
ops
::
UniqueKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
GPU
,
ops
::
UniqueKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ALL_LAYOUT
,
ops
::
UniqueKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int32_t
>
,
phi
::
UniqueRawKernel
,
ops
::
UniqueKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
float
,
double
,
int64_t
,
int
)
{}
paddle/phi/kernels/gpu/unstack_grad_kernel.cu
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unstack_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h"
PD_REGISTER_KERNEL
(
unstack_grad
,
GPU
,
ALL_LAYOUT
,
phi
::
UnStackGradKernel
,
float
,
double
,
int64_t
,
int
,
phi
::
dtype
::
float16
)
{}
paddle/phi/kernels/gpu/unstack_kernel.cu
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unstack_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unstack_kernel_impl.h"
PD_REGISTER_KERNEL
(
unstack
,
GPU
,
ALL_LAYOUT
,
phi
::
UnStackKernel
,
float
,
double
,
int64_t
,
int
,
phi
::
dtype
::
float16
)
{}
paddle/phi/kernels/impl/unstack_grad_kernel_impl.h
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/stack_functor.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/device_vector.h>
#endif
namespace
phi
{
template
<
typename
T
,
typename
Context
>
void
UnStackGradKernel
(
const
Context
&
dev_ctx
,
const
std
::
vector
<
const
DenseTensor
*>
&
x
,
int
axis
,
DenseTensor
*
x_grad
)
{
if
(
axis
<
0
)
axis
+=
(
x
[
0
]
->
dims
().
size
()
+
1
);
int
n
=
static_cast
<
int
>
(
x
.
size
());
auto
*
x_grad_data
=
dev_ctx
.
template
Alloc
<
T
>(
x_grad
);
std
::
vector
<
const
T
*>
x_datas
(
n
);
for
(
int
i
=
0
;
i
<
n
;
i
++
)
x_datas
[
i
]
=
x
[
i
]
->
data
<
T
>
();
int
pre
=
1
;
int
post
=
1
;
auto
&
dim
=
x
[
0
]
->
dims
();
for
(
auto
i
=
0
;
i
<
axis
;
++
i
)
pre
*=
dim
[
i
];
for
(
auto
i
=
axis
;
i
<
dim
.
size
();
++
i
)
post
*=
dim
[
i
];
#if defined(__NVCC__) || defined(__HIPCC__)
int
total_num
=
pre
*
n
*
post
;
thrust
::
device_vector
<
const
T
*>
device_x_vec
(
x_datas
);
auto
x_data_arr
=
device_x_vec
.
data
().
get
();
phi
::
funcs
::
StackFunctorForRange
(
dev_ctx
,
x_data_arr
,
x_grad_data
,
total_num
,
n
,
post
);
// Wait() must be called because device_x_vec may be destructed before
// kernel ends
dev_ctx
.
Wait
();
#else
auto
x_data_arr
=
x_datas
.
data
();
size_t
x_offset
=
0
;
size_t
y_offset
=
0
;
for
(
int
i
=
0
;
i
<
pre
;
i
++
)
{
for
(
int
j
=
0
;
j
<
n
;
j
++
)
{
std
::
memcpy
(
x_grad_data
+
y_offset
,
x_data_arr
[
j
]
+
x_offset
,
post
*
sizeof
(
T
));
y_offset
+=
post
;
}
x_offset
+=
post
;
}
#endif
}
}
// namespace phi
paddle/phi/kernels/impl/unstack_kernel_impl.h
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/stack_functor.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/device_vector.h>
#endif
namespace
phi
{
template
<
typename
T
,
typename
Context
>
void
UnStackKernel
(
const
Context
&
dev_ctx
,
const
DenseTensor
&
x
,
int
axis
,
int
num
,
std
::
vector
<
DenseTensor
*>
outs
)
{
auto
*
dy
=
&
x
;
auto
dx
=
outs
;
if
(
axis
<
0
)
axis
+=
dy
->
dims
().
size
();
int
n
=
dy
->
dims
()[
axis
];
std
::
vector
<
T
*>
dx_datas
(
n
);
// NOLINT
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
dx_datas
[
i
]
=
dev_ctx
.
template
Alloc
<
T
>(
dx
[
i
]);
}
auto
dy_data
=
dy
->
data
<
T
>
();
if
(
dy
->
numel
()
==
0
)
return
;
int
pre
=
1
;
for
(
int
i
=
0
;
i
<
axis
;
++
i
)
pre
*=
dy
->
dims
()[
i
];
int
total_num
=
dy
->
numel
();
int
post
=
total_num
/
(
n
*
pre
);
#if defined(__NVCC__) || defined(__HIPCC__)
thrust
::
device_vector
<
T
*>
device_dx_vec
(
dx_datas
);
auto
dx_data_arr
=
device_dx_vec
.
data
().
get
();
#else
auto
dx_data_arr
=
dx_datas
.
data
();
#endif
phi
::
funcs
::
StackGradFunctorForRange
(
dev_ctx
,
dx_data_arr
,
dy_data
,
total_num
,
n
,
post
);
#if defined(__NVCC__) || defined(__HIPCC__)
// Wait() must be called because device_dx_vec may be destructed before
// kernel ends
dev_ctx
.
Wait
();
#endif
}
}
// namespace phi
paddle/phi/kernels/range_kernel.h
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace
phi
{
template
<
typename
T
,
typename
Context
>
void
RangeKernel
(
const
Context
&
dev_ctx
,
const
DenseTensor
&
start
,
const
DenseTensor
&
end
,
const
DenseTensor
&
step
,
DenseTensor
*
out
);
}
// namespace phi
paddle/phi/kernels/stack_grad_kernel.h
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace
phi
{
template
<
typename
T
,
typename
Context
>
void
StackGradKernel
(
const
Context
&
dev_ctx
,
const
DenseTensor
&
out
,
int
axis
,
std
::
vector
<
DenseTensor
*>
x_grad
);
}
// namespace phi
paddle/phi/kernels/stack_kernel.h
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace
phi
{
template
<
typename
T
,
typename
Context
>
void
StackKernel
(
const
Context
&
dev_ctx
,
const
std
::
vector
<
const
DenseTensor
*>&
x
,
int
axis
,
DenseTensor
*
out
);
}
// namespace phi
paddle/phi/kernels/unique_kernel.h
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace
phi
{
template
<
typename
T
,
typename
Context
>
void
UniqueKernel
(
const
Context
&
dev_ctx
,
const
DenseTensor
&
x
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
,
const
std
::
vector
<
int
>&
axis
,
DataType
dtype
,
DenseTensor
*
out
,
DenseTensor
*
indices
,
DenseTensor
*
index
,
DenseTensor
*
counts
);
template
<
typename
T
,
typename
Context
>
void
UniqueRawKernel
(
const
Context
&
dev_ctx
,
const
DenseTensor
&
x
,
bool
return_index
,
bool
return_inverse
,
bool
return_counts
,
const
std
::
vector
<
int
>&
axis
,
DataType
dtype
,
bool
is_sorted
,
DenseTensor
*
out
,
DenseTensor
*
indices
,
DenseTensor
*
index
,
DenseTensor
*
counts
);
}
// namespace phi
paddle/phi/kernels/unstack_grad_kernel.h
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace
phi
{
template
<
typename
T
,
typename
Context
>
void
UnStackGradKernel
(
const
Context
&
dev_ctx
,
const
std
::
vector
<
const
DenseTensor
*>&
x
,
int
axis
,
DenseTensor
*
x_grad
);
}
// namespace phi
paddle/phi/kernels/unstack_kernel.h
0 → 100644
浏览文件 @
74894cd7
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace
phi
{
template
<
typename
T
,
typename
Context
>
void
UnStackKernel
(
const
Context
&
dev_ctx
,
const
DenseTensor
&
x
,
int
axis
,
int
num
,
std
::
vector
<
DenseTensor
*>
outs
);
}
// namespace phi
paddle/phi/ops/compat/stack_sig.cc
0 → 100644
浏览文件 @
74894cd7
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace
phi
{
KernelSignature
StackGradOpArgumentMapping
(
const
ArgumentMappingContext
&
ctx
)
{
return
KernelSignature
(
"stack_grad"
,
{
GradVarName
(
"Y"
)},
{
"axis"
},
{
GradVarName
(
"X"
)});
}
}
// namespace phi
PD_REGISTER_ARG_MAPPING_FN
(
stack_grad
,
phi
::
StackGradOpArgumentMapping
);
paddle/phi/ops/compat/unique_sig.cc
0 → 100644
浏览文件 @
74894cd7
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace
phi
{
KernelSignature
UniqueOpArgumentMapping
(
const
ArgumentMappingContext
&
ctx
)
{
bool
is_sorted
=
paddle
::
any_cast
<
bool
>
(
ctx
.
Attr
(
"is_sorted"
));
if
(
is_sorted
)
{
return
KernelSignature
(
"unique"
,
{
"X"
},
{
"return_index"
,
"return_inverse"
,
"return_counts"
,
"axis"
,
"dtype"
},
{
"Out"
,
"Indices"
,
"Index"
,
"Counts"
});
}
else
{
return
KernelSignature
(
"unique_raw"
,
{
"X"
},
{
"return_index"
,
"return_inverse"
,
"return_counts"
,
"axis"
,
"dtype"
,
"is_sorted"
},
{
"Out"
,
"Indices"
,
"Index"
,
"Counts"
});
}
}
}
// namespace phi
PD_REGISTER_ARG_MAPPING_FN
(
unique
,
phi
::
UniqueOpArgumentMapping
);
paddle/phi/ops/compat/unstack_sig.cc
0 → 100644
浏览文件 @
74894cd7
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace
phi
{
KernelSignature
UnStackGradOpArgumentMapping
(
const
ArgumentMappingContext
&
ctx
)
{
return
KernelSignature
(
"unstack_grad"
,
{
GradVarName
(
"Y"
)},
{
"axis"
},
{
GradVarName
(
"X"
)});
}
}
// namespace phi
PD_REGISTER_ARG_MAPPING_FN
(
unstack_grad
,
phi
::
UnStackGradOpArgumentMapping
);
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录