Unverified commit 30bd7e1c, authored by ShenLiang, committed by GitHub

Add rank_attention_op attributes for GPU memory in contrib (#23915)

* optimize rank_attention, test=develop

* use the paddle memory pool, test=develop

* set max size, test=develop

* limit the max size, test=develop

* fix the head of cu, test=develop

* add AsDispensable, test=develop
Parent: 8d0b0cb4
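In short, this commit moves the rank_attention GPU kernels' scratch buffers off raw RecordedCudaMalloc/RecordedCudaFree calls and onto Paddle's memory pool: InputHelp and InsRank become dispensable op outputs that the backward pass reuses, the remaining workspace comes from AllocateTmpTensor, and a new MaxSize attribute lets the buffers be sized once for the largest expected batch. The sizing rule is simple enough to sketch in Python; the helper below is purely illustrative (it is not Paddle API) and only mirrors the shapes that appear in the diffs that follow.

def scratch_shapes(ins_num, x_fea_dim, para_col, max_rank=3, max_size=0):
    """Illustrative sketch: shapes of the buffers the CUDA kernel allocates.

    Buffers are sized to max(ins_num, max_size) rows, so a non-zero MaxSize
    reserves memory for the largest expected batch up front.
    """
    block_matrix_row = max_rank * x_fea_dim
    max_ins = max(ins_num, max_size)
    return {
        "InputHelp": (max_ins, block_matrix_row),                    # zero-filled
        "InsRank": (max_ins, 1),                                     # filled with -1
        "param_help (temporary)": (max_ins * block_matrix_row, para_col),
        "Out": (ins_num, para_col),
    }

print(scratch_shapes(ins_num=32, x_fea_dim=16, para_col=8, max_size=128))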
paddle/fluid/operators/rank_attention_op.cc:

@@ -34,6 +34,14 @@ class RankAttentionOp : public framework::OperatorWithKernel {
         ctx->HasInput("RankParam"), true,
         platform::errors::InvalidArgument(
             "Input(RankParam) of RankAttentionOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("InsRank"), true,
+        platform::errors::InvalidArgument(
+            "Output(InsRank) of RankAttentionOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("InputHelp"), true,
+        platform::errors::InvalidArgument(
+            "Output(InputHelp) of RankAttentionOp should not be null."));
     PADDLE_ENFORCE_EQ(
         ctx->HasOutput("Out"), true,
         platform::errors::InvalidArgument(
@@ -45,12 +53,16 @@ class RankAttentionOp : public framework::OperatorWithKernel {
     auto param_dims = ctx->GetInputDim("RankParam");
     auto para_col = param_dims[1];
     auto rank_offset_dims = ctx->GetInputDim("RankOffset");
+    auto x_fea_dim = x_dims[1];
+    auto block_matrix_row = max_rank * x_fea_dim;
     PADDLE_ENFORCE_EQ((rank_offset_dims[1] - 1) / 2, max_rank,
                       platform::errors::InvalidArgument(
                           "Input(RankOffset) has wrong columns."));
     ctx->SetOutputDim("Out", {ins_num, para_col});
+    ctx->SetOutputDim("InputHelp", {ins_num, block_matrix_row});
+    ctx->SetOutputDim("InsRank", {ins_num, 1});
     ctx->ShareLoD("X", /*->*/ "Out");
   }
@@ -77,6 +89,12 @@ class RankAttentionGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(ctx->HasInput("RankOffset"), true,
                       platform::errors::InvalidArgument(
                           "Input(RankOffset) should not be null"));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("InputHelp"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(InputHelp) should not be null"));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("InsRank"), true,
+        platform::errors::InvalidArgument("Input(InsRank) should not be null"));
     ctx->SetOutputDim(framework::GradVarName("RankParam"),
                       ctx->GetInputDim("RankParam"));
@@ -99,9 +117,15 @@ class RankAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor) Input tensor of rank_attention_Op operator.");
     AddInput("RankParam",
              "(Tensor) Input tensor of rank_attention_Op operator.");
+    AddOutput("InputHelp", "Output tensor of rank_attention_Op operator.")
+        .AsDispensable();
     AddOutput("Out", "Output tensor of rank_attention_Op operator.");
+    AddOutput("InsRank", "Output tensor of rank_attention_Op operator.")
+        .AsDispensable();
     AddAttr<int>("MaxRank", "(int, default 3) max rank of rank_attention_Op")
         .SetDefault(3);
+    AddAttr<int>("MaxSize", "(int, default 0) max rank of rank_attention_Op")
+        .SetDefault(0);
     AddComment(R"DOC(
 RankAttention Operator.
 This Op can calculate rank attention between input and rank_param,
@@ -123,7 +147,9 @@ class RankAttentionGradOpMaker : public framework::SingleGradOpMaker<T> {
     op->SetInput("X", this->Input("X"));
     op->SetInput("RankOffset", this->Input("RankOffset"));
     op->SetInput("RankParam", this->Input("RankParam"));
+    op->SetInput("InputHelp", this->Output("InputHelp"));
     op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetInput("InsRank", this->Output("InsRank"));
     op->SetOutput(framework::GradVarName("RankParam"),
                   this->InputGrad("RankParam"));
@@ -131,7 +157,8 @@ class RankAttentionGradOpMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 DECLARE_NO_NEED_BUFFER_VARS_INFERER(
-    RankAttentionGradOpNoNeedBufferVarsInference, "RankParam");
+    RankAttentionGradOpNoNeedBufferVarsInference, "X", "RankOffset",
+    "RankParam");
 }  // namespace operators
 }  // namespace paddle
...
paddle/fluid/operators/rank_attention_op.cu:

@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <cublas.h>
+#include <algorithm>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/rank_attention.cu.h"
@@ -32,7 +33,10 @@ class RankAttentionCUDAKernel : public framework::OpKernel<T> {
     auto *X = ctx.Input<Tensor>("X");
     auto *rank_offset = ctx.Input<Tensor>("RankOffset");
     auto *param = ctx.Input<Tensor>("RankParam");
+    auto *input_help = ctx.Output<Tensor>("InputHelp");
+    auto *ins_rank = ctx.Output<Tensor>("InsRank");
     int max_rank = ctx.Attr<int>("MaxRank");
+    int64_t max_size = ctx.Attr<int>("MaxSize");
     auto *Out = ctx.Output<Tensor>("Out");
     // check dims
@@ -56,37 +60,42 @@ class RankAttentionCUDAKernel : public framework::OpKernel<T> {
     int block_matrix_row = max_rank * x_fea_dim;
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto stream = ctx.cuda_device_context().stream();
-    int device_id = platform::GetCurrentDeviceId();
-    T *param_help_data;
-    auto param_help_size = ins_num * block_matrix_row * para_col * sizeof(T);
-    platform::RecordedCudaMalloc(reinterpret_cast<void **>(&param_help_data),
-                                 param_help_size, device_id);
-    platform::GpuMemsetAsync(param_help_data, 0, param_help_size, stream);
-    T *input_help_data;
-    auto input_help_size = ins_num * block_matrix_row * sizeof(T);
-    platform::RecordedCudaMalloc(reinterpret_cast<void **>(&input_help_data),
-                                 input_help_size, device_id);
-    platform::GpuMemsetAsync(input_help_data, 0, input_help_size, stream);
-    T *ins_rank_data;
-    auto ins_rank_size = ins_num * sizeof(T);
-    platform::RecordedCudaMalloc(reinterpret_cast<void **>(&ins_rank_data),
-                                 ins_rank_size, device_id);
-    platform::GpuMemsetAsync(ins_rank_data, -1, ins_rank_size, stream);
+    int max_ins = std::max(ins_num, max_size);
+    Tensor param_help;
+    param_help = ctx.AllocateTmpTensor<T, DeviceContext>(
+        {max_ins * block_matrix_row, para_col}, dev_ctx);
+    param_help.mutable_data<T>(ctx.GetPlace());
+    input_help->Resize({max_ins, block_matrix_row});
+    ins_rank->Resize({max_ins, 1});
+    input_help->mutable_data<T>(ctx.GetPlace());
+    ins_rank->mutable_data<T>(ctx.GetPlace());
     Out->mutable_data<T>(ctx.GetPlace());
     // initialize
+    auto param_help_eigen = framework::EigenVector<T>::Flatten(param_help);
+    auto input_help_eigen = framework::EigenVector<T>::Flatten(*input_help);
+    auto ins_rank_eigen = framework::EigenVector<T>::Flatten(*ins_rank);
     auto out_eigen = framework::EigenVector<T>::Flatten(*Out);
     auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
                        .eigen_device();
+    param_help_eigen.device(place) =
+        param_help_eigen.constant(static_cast<T>(0));
+    input_help_eigen.device(place) =
+        input_help_eigen.constant(static_cast<T>(0));
+    ins_rank_eigen.device(place) = ins_rank_eigen.constant(static_cast<T>(-1));
     out_eigen.device(place) = out_eigen.constant(static_cast<T>(0));
     // get data ptr
+    T *input_help_data = input_help->data<T>();
+    T *param_help_data = param_help.data<T>();
+    T *ins_rank_data = ins_rank->data<T>();
     T *out_data = Out->data<T>();
     expand_rank_attention_input(
         ctx.cuda_device_context().stream(), X->data<T>(), ins_num, x_fea_dim,
         input_help_data, ins_num, block_matrix_row, rank_offset->data<int>(),
@@ -110,10 +119,6 @@ class RankAttentionCUDAKernel : public framework::OpKernel<T> {
     blas.BatchedGEMM(transA, transB, 1, para_col, block_matrix_row, alpha,
                      input_help_data, param_help_data, beta, out_data, ins_num,
                      strideA, strideB);
-    platform::RecordedCudaFree(param_help_data, param_help_size, device_id);
-    platform::RecordedCudaFree(input_help_data, input_help_size, device_id);
-    platform::RecordedCudaFree(ins_rank_data, ins_rank_size, device_id);
   }
 };
@@ -121,10 +126,13 @@ template <typename DeviceContext, typename T>
 class RankAttentionGradOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *X = ctx.Input<Tensor>("X");
-    auto *rank_offset = ctx.Input<Tensor>("RankOffset");
-    auto *param = ctx.Input<Tensor>("RankParam");
+    auto *X = ctx.Input<Tensor>("X");                      // not use data
+    auto *rank_offset = ctx.Input<Tensor>("RankOffset");   // not use data
+    auto *param = ctx.Input<Tensor>("RankParam");          // not use data
+    auto *input_help = ctx.Input<Tensor>("InputHelp");
+    auto *ins_rank = ctx.Input<Tensor>("InsRank");
     auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    int64_t max_size = ctx.Attr<int>("MaxSize");
     auto *drank_para = ctx.Output<Tensor>(framework::GradVarName("RankParam"));
@@ -142,38 +150,26 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel<T> {
     auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
                        .eigen_device();
+    int max_ins = std::max(ins_num, max_size);
     // initialize out grad
     drank_para->mutable_data<T>(ctx.GetPlace());
     auto drank_para_eigen = framework::EigenVector<T>::Flatten(*drank_para);
     drank_para_eigen.device(place) =
         drank_para_eigen.constant(static_cast<T>(0));
-    auto stream = ctx.cuda_device_context().stream();
-    int device_id = platform::GetCurrentDeviceId();
-    T *param_grad_data;
-    auto param_grad_size = ins_num * block_matrix_row * para_col * sizeof(T);
-    platform::RecordedCudaMalloc(reinterpret_cast<void **>(&param_grad_data),
-                                 param_grad_size, device_id);
-    platform::GpuMemsetAsync(param_grad_data, 0, param_grad_size, stream);
-    T *input_help_data;
-    auto input_help_size = ins_num * block_matrix_row * sizeof(T);
-    platform::RecordedCudaMalloc(reinterpret_cast<void **>(&input_help_data),
-                                 input_help_size, device_id);
-    platform::GpuMemsetAsync(input_help_data, 0, input_help_size, stream);
-    T *ins_rank_data;
-    auto ins_rank_size = ins_num * sizeof(T);
-    platform::RecordedCudaMalloc(reinterpret_cast<void **>(&ins_rank_data),
-                                 ins_rank_size, device_id);
-    platform::GpuMemsetAsync(ins_rank_data, -1, ins_rank_size, stream);
-    // expand input
-    expand_rank_attention_input(
-        ctx.cuda_device_context().stream(), X->data<T>(), ins_num, x_fea_dim,
-        input_help_data, ins_num, block_matrix_row, rank_offset->data<int>(),
-        rank_offset_dims[0], rank_offset_dims[1], ins_rank_data, max_rank);
+    // copy data
+    Tensor param_grad;
+    param_grad = ctx.AllocateTmpTensor<T, DeviceContext>(
+        {max_ins * block_matrix_row, para_col}, dev_ctx);
+    param_grad.mutable_data<T>(ctx.GetPlace());
+    // initialize
+    auto param_grad_eigen = framework::EigenVector<T>::Flatten(param_grad);
+    param_grad_eigen.device(place) =
+        param_grad_eigen.constant(static_cast<T>(0));
+    // get data ptr
+    const T *input_help_data = input_help->data<T>();
+    const T *ins_rank_data = ins_rank->data<T>();
+    T *param_grad_data = param_grad.data<T>();
     auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
     T alpha = 1;
@@ -184,20 +180,14 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel<T> {
     CBLAS_TRANSPOSE transB = CblasNoTrans;
     int64_t strideA = block_matrix_row;
     int64_t strideB = para_col;
     blas.BatchedGEMM(transA, transB, block_matrix_row, para_col, 1, alpha,
                      input_help_data, dout->data<T>(), beta, param_grad_data,
                      ins_num, strideA, strideB);
     // merge param_grad to get drank_para
     merge_rank_attention_param_grad(
         ctx.cuda_device_context().stream(), param_grad_data,
         ins_num * block_matrix_row, para_col, drank_para->data<T>(), para_row,
         para_col, ins_rank_data, ins_num, max_rank, x_fea_dim);
-    platform::RecordedCudaFree(param_grad_data, param_grad_size, device_id);
-    platform::RecordedCudaFree(input_help_data, input_help_size, device_id);
-    platform::RecordedCudaFree(ins_rank_data, ins_rank_size, device_id);
   }
 };
...
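For readers tracing the kernels above: once the input has been expanded into InputHelp, the forward BatchedGEMM multiplies each instance's expanded row by its gathered parameter block, and the backward BatchedGEMM forms a per-instance outer product of that row with the output gradient before merge_rank_attention_param_grad folds it into RankParam's gradient. A NumPy sketch of just this batched-GEMM arithmetic (shapes are illustrative; the expand and merge CUDA helpers are not modeled here):

import numpy as np

ins_num, block_matrix_row, para_col = 4, 6, 5
rng = np.random.default_rng(0)
input_help = rng.random((ins_num, block_matrix_row))            # expanded X rows
param_help = rng.random((ins_num, block_matrix_row, para_col))  # gathered RankParam blocks
dout = rng.random((ins_num, para_col))                          # gradient of Out

# Forward BatchedGEMM (M=1, N=para_col, K=block_matrix_row):
#   Out[i] = input_help[i] @ param_help[i]
out = np.einsum('ib,ibc->ic', input_help, param_help)           # (ins_num, para_col)

# Backward BatchedGEMM (M=block_matrix_row, N=para_col, K=1):
#   param_grad[i] = outer(input_help[i], dout[i])
param_grad = np.einsum('ib,ic->ibc', input_help, dout)          # (ins_num, block_matrix_row, para_col)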
python/paddle/fluid/contrib/layers/nn.py:

@@ -1236,7 +1236,8 @@ def rank_attention(input,
                    rank_offset,
                    rank_param_shape,
                    rank_param_attr,
-                   max_rank=3):
+                   max_rank=3,
+                   max_size=0):
     """
     **Rank Attention layer**
     This Op can calculate rank attention between input and rank_param, and
@@ -1266,7 +1267,8 @@ def rank_attention(input,
                                         name="ubm_rank_param.w_0",
                                         initializer=
                                         fluid.initializer.Xavier(uniform=False)),
-                                        max_rank=3)
+                                        max_rank=3,
+                                        max_size=0)
     """
     helper = LayerHelper('rank_attention', **locals())
     dtype = helper.input_dtype(input_param_name='input')
@@ -1278,6 +1280,8 @@ def rank_attention(input,
     rank_param.stop_gradient = False
     output = helper.create_variable_for_type_inference(dtype)
+    input_help = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
     ins_rank = helper.create_variable_for_type_inference(
         dtype=dtype, stop_gradient=True)
@@ -1288,7 +1292,9 @@ def rank_attention(input,
             "RankOffset": rank_offset,
             "RankParam": rank_param
         },
-        outputs={"Out": output},
-        attrs={"MaxRank": max_rank})
+        outputs={"Out": output,
+                 "InputHelp": input_help,
+                 "InsRank": ins_rank},
+        attrs={"MaxRank": max_rank,
+               "MaxSize": max_size})
     return output
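On the Python side, the only user-visible change is the extra max_size argument, which defaults to 0 so existing callers keep working. A minimal usage sketch in the spirit of the docstring example above; the tensor names, shapes, and parameter sizes here are illustrative rather than prescriptive:

import paddle.fluid as fluid

x_fea_dim, max_rank = 2, 3
input = fluid.data(name="input", shape=[None, x_fea_dim], dtype="float32")
# RankOffset must have 2 * max_rank + 1 columns, as the op's InferShape checks.
rank_offset = fluid.data(
    name="rank_offset", shape=[None, 2 * max_rank + 1], dtype="int32")

out = fluid.contrib.layers.rank_attention(
    input=input,
    rank_offset=rank_offset,
    rank_param_shape=[max_rank * max_rank * x_fea_dim, 3],
    rank_param_attr=fluid.ParamAttr(
        learning_rate=1.0,
        name="ubm_rank_param.w_0",
        initializer=fluid.initializer.Xavier(uniform=False)),
    max_rank=max_rank,
    max_size=0)  # 0 keeps the old behavior: buffers sized to the actual batch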
python/paddle/fluid/tests/unittests/test_rank_attention_op.py:

@@ -22,10 +22,11 @@ from op_test import OpTest, skip_check_grad_ci
 import paddle.fluid.core as core
-def gen_input_help(input, rank_offset, max_rank):
+def gen_input_help(input, rank_offset, max_rank, max_size):
     input_row, input_col = input.shape
-    input_help = np.zeros((input_row * max_rank * input_col, ))
-    ins_rank = np.zeros((input_row, 1))
+    max_ins = np.max((max_size, input_row))
+    input_help = np.zeros((max_ins * max_rank * input_col))
+    ins_rank = np.zeros((max_ins, 1))
     ins_rank.fill(-1)
     output_col = max_rank * input_col
@@ -46,7 +47,7 @@ def gen_input_help(input, rank_offset, max_rank):
                 rank_input_col_idx = output_col_idx % input_col
                 index = rank_offset[output_row_idx, 2 * k + 2]
                 input_help[idx] = input[index, rank_input_col_idx]
-    input_help = input_help.reshape([input_row, max_rank * input_col])
+    input_help = input_help.reshape([max_ins, max_rank * input_col])
     return input_help, ins_rank
@@ -83,7 +84,7 @@ def gen_param_help(input, rank_offset, param, max_rank):
     return output_param
-def np_rank_attention(input, rank_offset, rank_para, max_rank):
+def np_rank_attention(input, rank_offset, rank_para, max_rank, max_size):
     input_row, input_col = input.shape
     rank_offset_row, rank_offset_col = rank_offset.shape
     rank_para_row, rank_para_col = rank_para.shape
@@ -92,7 +93,8 @@ def np_rank_attention(input, rank_offset, rank_para, max_rank):
     assert (max_rank == ((rank_offset_col - 1) / 2))
     assert (rank_para_row == max_rank * max_rank * input_col)
-    input_help, ins_rank = gen_input_help(input, rank_offset, max_rank)
+    input_help, ins_rank = gen_input_help(input, rank_offset, max_rank,
+                                          max_size)
     param_help = gen_param_help(input, rank_offset, rank_para, max_rank)
     block_matrix_row = input_col * max_rank
@@ -159,14 +161,19 @@ class TestRankAttentionOpComplex(OpTest):
         ]
         rank_para = np.random.random(rank_para_shape).astype(self.dtype)
         np_out, np_input_help, np_param_help, np_ins_rank = np_rank_attention(
-            input, np.array(rank_offset), rank_para, self.max_rank)
+            input,
+            np.array(rank_offset), rank_para, self.max_rank, self.pv_num * 7)
         self.inputs = {
             "X": input,
             "RankOffset": np.array(rank_offset).astype("int32"),
             "RankParam": rank_para
         }
-        self.attrs = {'MaxRank': self.max_rank}
-        self.outputs = {"Out": np_out}
+        self.attrs = {'MaxRank': self.max_rank, 'MaxSize': self.pv_num * 7}
+        self.outputs = {
+            "Out": np_out,
+            "InputHelp": np_input_help,
+            "InsRank": np_ins_rank
+        }
     def test_check_output_gpu(self):
         if core.is_compiled_with_cuda():
@@ -195,14 +202,19 @@ class TestRankAttentionOpCpu(OpTest):
         ]
         rank_para = np.random.random(rank_para_shape).astype(self.dtype)
         np_out, np_input_help, np_param_help, np_ins_rank = np_rank_attention(
-            input, np.array(rank_offset), rank_para, self.max_rank)
+            input,
+            np.array(rank_offset), rank_para, self.max_rank, self.pv_num * 7)
         self.inputs = {
             "X": input,
             "RankOffset": np.array(rank_offset).astype("int32"),
             "RankParam": rank_para
         }
-        self.attrs = {'MaxRank': self.max_rank}
-        self.outputs = {"Out": np_out}
+        self.attrs = {'MaxRank': self.max_rank, 'MaxSize': self.pv_num * 7}
+        self.outputs = {
+            "Out": np_out,
+            "InputHelp": np_input_help,
+            "InsRank": np_ins_rank
+        }
     def test_check_output_cpu(self):
         try:
...