Unverified commit 471971a3, authored by J Jiawei Wang, committed by GitHub

Add TopK Op Grad CPU&GPU Kernel test=develop (#22628) (#22656)

* Add TopK Op Grad CPU&GPU Kernel test=develop

* Add TopK Op Grad, modify grad op maker test=develop

* Add TopK Op Grad, modify grad op maker test=develop

* Add TopK Op Grad, modify PADDLE_ENFORCE test=develop

* Add TopK Op Grad, modify PADDLE_THROW test=develop

* Add TopK Op Grad, modify unittest test=develop

* fix ngraph top k op unittest test=develop
Parent 9e80551d
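The gradient rule the new kernels implement is a plain scatter: each element of Out@GRAD is routed back to the input position recorded in Indices, and every other entry of X@GRAD stays zero. A minimal numpy sketch of that rule for a single row (the concrete values here are made up for illustration and are not part of the commit):

```python
import numpy as np

x = np.array([0.1, 0.9, 0.4, 0.7])   # one input row
k = 2
indices = (-x).argsort()[:k]          # forward pass: top-k indices -> [1, 3]
out = x[indices]                      # top-k values -> [0.9, 0.7]

out_grad = np.array([10.0, 20.0])     # some upstream gradient for Out
x_grad = np.zeros_like(x)
x_grad[indices] = out_grad            # scatter: only the selected positions receive gradient
print(x_grad)                         # [ 0. 10.  0. 20.]
```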
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/top_k_op.h"
#include <memory>
namespace paddle {
namespace operators {
@@ -42,11 +43,6 @@ class TopkOp : public framework::OperatorWithKernel {
framework::DDim dims = input_dims;
dims[dims.size() - 1] = k;
// If has K as tensor, set k=-1 as not know real size at this time.
if (ctx->HasInput("K")) {
dims[dims.size() - 1] = -1;
}
ctx->SetOutputDim("Out", dims);
ctx->SetOutputDim("Indices", dims);
ctx->ShareLoD("X", "Out");
@@ -89,16 +85,67 @@ For matrices, this operator computes the top k entries in each row. )DOC");
}
};
class TopkOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(
ctx->HasInput("X"), true,
platform::errors::InvalidArgument("Input(X) should be not null"));
PADDLE_ENFORCE_EQ(
ctx->HasInput("Indices"), true,
platform::errors::InvalidArgument("Input(Indices) should be not null"));
PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
platform::errors::InvalidArgument(
"Grad Input(Out) should be not null"));
PADDLE_ENFORCE_EQ(
ctx->HasOutput(framework::GradVarName("X")), true,
platform::errors::InvalidArgument("Grad Output(X) should be not null"));
auto x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
template <typename T>
class TopkGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
std::unique_ptr<T> Apply() const override {
std::unique_ptr<T> op(new T());
op->SetType("top_k_grad");
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetInput("X", this->Input("X"));
op->SetInput("Indices", this->Output("Indices"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
top_k, ops::TopkOp, ops::TopkOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker,
ops::TopkGradOpMaker<paddle::framework::OpDesc>,
ops::TopkGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(top_k_grad, ops::TopkOpGrad);
REGISTER_OP_CPU_KERNEL(top_k,
ops::TopkKernel<paddle::platform::CPUPlace, float>,
ops::TopkKernel<paddle::platform::CPUPlace, double>,
ops::TopkKernel<paddle::platform::CPUPlace, int>,
ops::TopkKernel<paddle::platform::CPUPlace, int64_t>);
ops::TopkKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(top_k_grad,
ops::TopkGradKernel<paddle::platform::CPUPlace, float>,
ops::TopkGradKernel<paddle::platform::CPUPlace, double>);
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdio>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
// set cub base traits in order to handle float16
namespace cub {
template <>
@@ -300,6 +300,20 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
}
}
template <typename T, int MaxLength, int BlockSize>
__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad,
size_t rows, size_t cols, size_t k) {
for (size_t i = 0; i < rows; ++i) {
for (size_t j = 0; j < cols; ++j) {
x_grad[i * cols + j] = 0;
}
for (size_t j = 0; j < k; ++j) {
size_t idx = indices[i * k + j];
x_grad[i * cols + idx] = out_grad[i * k + j];
}
}
}
inline static int GetDesiredBlockDim(int dim) {
if (dim > 128) {
return 256;
@@ -478,7 +492,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
template <typename T>
template <typename DeviceContext, typename T>
class TopkOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -540,6 +554,42 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
class TopkOpGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(context.GetPlace()), true,
platform::errors::InvalidArgument("It must use CUDAPlace."));
auto* x = context.Input<Tensor>("X");
auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
auto* indices = context.Input<Tensor>("Indices");
auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
const T* out_grad_data = out_grad->data<T>();
const int64_t* indices_data = indices->data<int64_t>();
size_t k = indices->dims()[indices->dims().size() - 1];
framework::DDim xdims = x->dims();
const size_t row =
framework::product(framework::slice_ddim(xdims, 0, xdims.size() - 1));
const size_t col = xdims[xdims.size() - 1];
const auto& dev_ctx = context.cuda_device_context();
const int kMaxHeight = 2048;
int gridx = row < kMaxHeight ? row : kMaxHeight;
switch (GetDesiredBlockDim(col)) {
FIXED_BLOCK_DIM(
AssignGrad<T, 5,
kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
x_grad_data, indices_data, out_grad_data, row, col, k));
default:
PADDLE_THROW(
platform::errors::Unavailable("Error occurs when Assign Grad."));
}
}
};
#undef FIXED_BLOCK_DIM_BASE
#undef FIXED_BLOCK_DIM
@@ -547,8 +597,27 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
top_k, paddle::operators::TopkOpCUDAKernel<float>,
paddle::operators::TopkOpCUDAKernel<double>,
paddle::operators::TopkOpCUDAKernel<int>,
paddle::operators::TopkOpCUDAKernel<int64_t>,
paddle::operators::TopkOpCUDAKernel<paddle::platform::float16>);
top_k,
paddle::operators::TopkOpCUDAKernel<paddle::platform::CUDADeviceContext,
float>,
paddle::operators::TopkOpCUDAKernel<paddle::platform::CUDADeviceContext,
double>,
paddle::operators::TopkOpCUDAKernel<paddle::platform::CUDADeviceContext,
int>,
paddle::operators::TopkOpCUDAKernel<paddle::platform::CUDADeviceContext,
int64_t>,
paddle::operators::TopkOpCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(
top_k_grad,
paddle::operators::TopkOpGradCUDAKernel<paddle::platform::CUDADeviceContext,
float>,
paddle::operators::TopkOpGradCUDAKernel<paddle::platform::CUDADeviceContext,
double>,
paddle::operators::TopkOpGradCUDAKernel<paddle::platform::CUDADeviceContext,
int>,
paddle::operators::TopkOpGradCUDAKernel<paddle::platform::CUDADeviceContext,
int64_t>,
paddle::operators::TopkOpGradCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);
@@ -94,5 +94,35 @@ class TopkKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
class TopkGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
auto* indices = context.Input<Tensor>("Indices");
auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
const T* out_grad_data = out_grad->data<T>();
const int64_t* indices_data = indices->data<int64_t>();
size_t k = indices->dims()[indices->dims().size() - 1];
framework::DDim xdims = x->dims();
const size_t row =
framework::product(framework::slice_ddim(xdims, 0, xdims.size() - 1));
const size_t col = xdims[xdims.size() - 1];
memset(x_grad_data, 0, row * col * sizeof(T));
for (size_t i = 0; i < row; ++i) {
for (size_t j = 0; j < k; ++j) {
size_t idx = indices_data[i * k + j];
x_grad_data[i * col + idx] = out_grad_data[i * k + j];
}
}
}
};
} // namespace operators
} // namespace paddle
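The CPU TopkGradKernel above flattens X to (row, col), zero-fills X@GRAD, and then scatters Out@GRAD using Indices; the CUDA AssignGrad kernel performs the same two loops per row. A hedged numpy equivalent of that logic, handy when comparing the CPU and GPU paths (the function name `topk_grad_ref` is illustrative, not part of the commit):

```python
import numpy as np

def topk_grad_ref(indices, out_grad, cols):
    """Reference for the scatter loops in TopkGradKernel / AssignGrad.

    indices:  (row, k) int64 positions chosen by the forward top_k
    out_grad: (row, k) gradient w.r.t. Out
    cols:     size of the last dimension of X
    """
    rows, k = indices.shape
    x_grad = np.zeros((rows, cols), dtype=out_grad.dtype)  # memset(x_grad_data, 0, ...)
    for i in range(rows):
        for j in range(k):
            # x_grad[i * col + idx] = out_grad[i * k + j] in the kernels
            x_grad[i, indices[i, j]] = out_grad[i, j]
    return x_grad
```

Like the kernels, this relies on the indices within a row being distinct, which the forward top_k guarantees.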
@@ -15,7 +15,7 @@ from __future__ import print_function
import unittest, sys
sys.path.append("../")
from test_top_k_op import TestTopkOp, TestTopkOp3d, TestTopkOp2, TestTopkOp3, TestTopkOp4
from test_top_k_op import TestTopkOp
if __name__ == "__main__":
unittest.main()
@@ -17,6 +17,7 @@ from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle.fluid.core as core
class TestTopkOp(OpTest):
@@ -24,7 +25,7 @@ class TestTopkOp(OpTest):
self.variable_k = False
self.set_args()
self.op_type = "top_k"
self.dtype = np.float32
self.dtype = np.float64
self.init_dtype()
k = self.top_k
@@ -49,106 +50,14 @@ class TestTopkOp(OpTest):
pass
def set_args(self):
self.row = 32
self.row = 100
self.top_k = 1
def test_check_output(self):
self.check_output()
class TestTopkOpFp16(TestTopkOp):
def init_dtype(self):
self.dtype = np.float16
class TestTopkOp3d(OpTest):
def setUp(self):
self.op_type = "top_k"
k = 1
input = np.random.random((32, 2, 84)).astype("float32")
input_flat_2d = input.reshape(64, 84)
output = np.ndarray((64, k))
indices = np.ndarray((64, k)).astype("int64")
self.inputs = {'X': input}
self.attrs = {'k': k}
for rowid in range(64):
row = input_flat_2d[rowid]
output[rowid] = np.sort(row)[::-1][:k]
indices[rowid] = row.argsort()[::-1][:k]
self.outputs = {
'Out': output.reshape((32, 2, k)),
'Indices': indices.reshape((32, 2, k))
}
def test_check_output(self):
self.check_output()
class TestTopkOp1(OpTest):
def setUp(self):
self.op_type = "top_k"
k = 2
m = 2056
input = np.random.random(m).astype("float32")
output = np.ndarray(k)
indices = np.ndarray(k).astype("int64")
self.inputs = {'X': input}
self.attrs = {'k': k}
row = input
output = -np.sort(-row)[:k]
indices = (-row).argsort()[:k]
self.outputs = {'Out': output, 'Indices': indices}
def test_check_output(self):
self.check_output()
class TestTopkOp2(OpTest):
def setUp(self):
self.op_type = "top_k"
k = 1
m = 2056
input = np.random.random((m, 84)).astype("float32")
output = np.ndarray((m, k))
indices = np.ndarray((m, k)).astype("int64")
self.inputs = {'X': input}
self.attrs = {'k': k}
for rowid in range(m):
row = input[rowid]
output[rowid] = -np.sort(-row)[:k]
indices[rowid] = (-row).argsort()[:k]
self.outputs = {'Out': output, 'Indices': indices}
def test_check_output(self):
self.check_output()
class TestTopkOp3(TestTopkOp):
def set_args(self):
self.row = 2056
self.top_k = 3
class TestTopkOp4(TestTopkOp):
def set_args(self):
self.row = 40000
self.top_k = 1
class TestTopkOp5(TestTopkOp):
def set_args(self):
self.row = 40000
self.top_k = 3
self.variable_k = True
def test_check_grad(self):
self.check_grad(set(['X']), 'Out')
if __name__ == "__main__":
......
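The updated test switches the reference dtype to float64 and calls `check_grad(set(['X']), 'Out')`, which compares the analytic gradient produced by the new kernels against a numerically estimated one; float32 is usually too coarse for that comparison. A standalone sketch of broadly the same idea using only numpy (all names below are illustrative and not part of the test suite):

```python
import numpy as np

def topk_sum(x, k):
    # Scalar loss: sum of the top-k entries of each row.
    return np.sort(x, axis=1)[:, ::-1][:, :k].sum()

np.random.seed(0)
x = np.random.rand(4, 6).astype("float64")
k = 3
indices = (-x).argsort(axis=1)[:, :k]

# Analytic gradient of topk_sum: 1 at every selected position, 0 elsewhere.
analytic = np.zeros_like(x)
for i in range(x.shape[0]):
    analytic[i, indices[i]] = 1.0

# Central finite differences, broadly the style of check that OpTest.check_grad performs.
eps = 1e-6
numeric = np.zeros_like(x)
for idx in np.ndindex(x.shape):
    xp, xm = x.copy(), x.copy()
    xp[idx] += eps
    xm[idx] -= eps
    numeric[idx] = (topk_sum(xp, k) - topk_sum(xm, k)) / (2 * eps)

assert np.allclose(analytic, numeric, atol=1e-4)
```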