Unverified · Commit 30bd7e1c authored by ShenLiang, committed by GitHub

Add rank_attention_op attributes for GPU memory in contrib (#23915)

* optimize rank_attention, test=develop

* use the paddle memory pool, test=develop

* set max size, test=develop

* limit the max size, test=develop

* fix the head of cu, test=develop

* add AsDispensable, test=develop
Parent 8d0b0cb4
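
For reference, the new MaxSize attribute surfaces in Python as the max_size argument of rank_attention. A minimal usage sketch (assuming rank_attention lives under fluid.contrib.layers, with made-up tensor names and shapes; only the max_size argument is new in this commit):

import paddle.fluid as fluid

# Illustrative sizes: feature dim 18 and max_rank 3, so RankOffset needs
# 2 * 3 + 1 = 7 columns and RankParam needs 3 * 3 * 18 rows.
x = fluid.data(name="x", shape=[None, 18], dtype="float32")
rank_offset = fluid.data(name="rank_offset", shape=[None, 7], dtype="int32")
out = fluid.contrib.layers.rank_attention(
    input=x,
    rank_offset=rank_offset,
    rank_param_shape=[3 * 3 * 18, 16],
    rank_param_attr=fluid.ParamAttr(
        name="ubm_rank_param.w_0",
        initializer=fluid.initializer.Xavier(uniform=False)),
    max_rank=3,
    max_size=0)  # 0 keeps per-batch sizing; > 0 pre-sizes the helper buffers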
......@@ -34,6 +34,14 @@ class RankAttentionOp : public framework::OperatorWithKernel {
ctx->HasInput("RankParam"), true,
platform::errors::InvalidArgument(
"Input(RankParam) of RankAttentionOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("InsRank"), true,
platform::errors::InvalidArgument(
"Output(InsRank) of RankAttentionOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("InputHelp"), true,
platform::errors::InvalidArgument(
"Output(InputHelp) of RankAttentionOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
......@@ -45,12 +53,16 @@ class RankAttentionOp : public framework::OperatorWithKernel {
auto param_dims = ctx->GetInputDim("RankParam");
auto para_col = param_dims[1];
auto rank_offset_dims = ctx->GetInputDim("RankOffset");
auto x_fea_dim = x_dims[1];
auto block_matrix_row = max_rank * x_fea_dim;
PADDLE_ENFORCE_EQ((rank_offset_dims[1] - 1) / 2, max_rank,
platform::errors::InvalidArgument(
"Input(RankOffset) has wrong columns."));
ctx->SetOutputDim("Out", {ins_num, para_col});
ctx->SetOutputDim("InputHelp", {ins_num, block_matrix_row});
ctx->SetOutputDim("InsRank", {ins_num, 1});
ctx->ShareLoD("X", /*->*/ "Out");
}
......@@ -77,6 +89,12 @@ class RankAttentionGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(ctx->HasInput("RankOffset"), true,
platform::errors::InvalidArgument(
"Input(RankOffset) should not be null"));
PADDLE_ENFORCE_EQ(ctx->HasInput("InputHelp"), true,
platform::errors::InvalidArgument(
"Input(InputHelp) should not be null"));
PADDLE_ENFORCE_EQ(
ctx->HasInput("InsRank"), true,
platform::errors::InvalidArgument("Input(InsRank) should not be null"));
ctx->SetOutputDim(framework::GradVarName("RankParam"),
ctx->GetInputDim("RankParam"));
......@@ -99,9 +117,15 @@ class RankAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor) Input tensor of rank_attention_Op operator.");
AddInput("RankParam",
"(Tensor) Input tensor of rank_attention_Op operator.");
AddOutput("InputHelp", "Output tensor of rank_attention_Op operator.")
.AsDispensable();
AddOutput("Out", "Output tensor of rank_attention_Op operator.");
AddOutput("InsRank", "Output tensor of rank_attention_Op operator.")
.AsDispensable();
AddAttr<int>("MaxRank", "(int, default 3) max rank of rank_attention_Op")
.SetDefault(3);
AddAttr<int>("MaxSize", "(int, default 0) max rank of rank_attention_Op")
.SetDefault(0);
AddComment(R"DOC(
RankAttention Operator.
This Op can calculate rank attention between input and rank_param,
......@@ -123,7 +147,9 @@ class RankAttentionGradOpMaker : public framework::SingleGradOpMaker<T> {
op->SetInput("X", this->Input("X"));
op->SetInput("RankOffset", this->Input("RankOffset"));
op->SetInput("RankParam", this->Input("RankParam"));
op->SetInput("InputHelp", this->Output("InputHelp"));
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetInput("InsRank", this->Output("InsRank"));
op->SetOutput(framework::GradVarName("RankParam"),
this->InputGrad("RankParam"));
......@@ -131,7 +157,8 @@ class RankAttentionGradOpMaker : public framework::SingleGradOpMaker<T> {
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(
RankAttentionGradOpNoNeedBufferVarsInference, "RankParam");
RankAttentionGradOpNoNeedBufferVarsInference, "X", "RankOffset",
"RankParam");
} // namespace operators
} // namespace paddle
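
As a quick sanity check on the InferShape changes above, the two new outputs reuse the existing dimension arithmetic. A small stand-alone sketch of that math (plain Python, illustrative sizes):

# Illustrative sizes mirroring the InferShape code above.
ins_num, x_fea_dim = 16, 18         # x_dims[0], x_dims[1]
max_rank, para_col = 3, 16          # "MaxRank" attr, param_dims[1]
rank_offset_cols = 2 * max_rank + 1

block_matrix_row = max_rank * x_fea_dim           # 54
assert (rank_offset_cols - 1) // 2 == max_rank    # the RankOffset column check
out_shape = (ins_num, para_col)                   # Out
input_help_shape = (ins_num, block_matrix_row)    # new InputHelp output
ins_rank_shape = (ins_num, 1)                     # new InsRank output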
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <cublas.h>
#include <algorithm>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/rank_attention.cu.h"
......@@ -32,7 +33,10 @@ class RankAttentionCUDAKernel : public framework::OpKernel<T> {
auto *X = ctx.Input<Tensor>("X");
auto *rank_offset = ctx.Input<Tensor>("RankOffset");
auto *param = ctx.Input<Tensor>("RankParam");
auto *input_help = ctx.Output<Tensor>("InputHelp");
auto *ins_rank = ctx.Output<Tensor>("InsRank");
int max_rank = ctx.Attr<int>("MaxRank");
int64_t max_size = ctx.Attr<int>("MaxSize");
auto *Out = ctx.Output<Tensor>("Out");
// check dims
......@@ -56,37 +60,42 @@ class RankAttentionCUDAKernel : public framework::OpKernel<T> {
int block_matrix_row = max_rank * x_fea_dim;
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto stream = ctx.cuda_device_context().stream();
int device_id = platform::GetCurrentDeviceId();
T *param_help_data;
auto param_help_size = ins_num * block_matrix_row * para_col * sizeof(T);
platform::RecordedCudaMalloc(reinterpret_cast<void **>(&param_help_data),
param_help_size, device_id);
platform::GpuMemsetAsync(param_help_data, 0, param_help_size, stream);
T *input_help_data;
auto input_help_size = ins_num * block_matrix_row * sizeof(T);
platform::RecordedCudaMalloc(reinterpret_cast<void **>(&input_help_data),
input_help_size, device_id);
platform::GpuMemsetAsync(input_help_data, 0, input_help_size, stream);
T *ins_rank_data;
auto ins_rank_size = ins_num * sizeof(T);
platform::RecordedCudaMalloc(reinterpret_cast<void **>(&ins_rank_data),
ins_rank_size, device_id);
platform::GpuMemsetAsync(ins_rank_data, -1, ins_rank_size, stream);
int max_ins = std::max(ins_num, max_size);
Tensor param_help;
param_help = ctx.AllocateTmpTensor<T, DeviceContext>(
{max_ins * block_matrix_row, para_col}, dev_ctx);
param_help.mutable_data<T>(ctx.GetPlace());
input_help->Resize({max_ins, block_matrix_row});
ins_rank->Resize({max_ins, 1});
input_help->mutable_data<T>(ctx.GetPlace());
ins_rank->mutable_data<T>(ctx.GetPlace());
Out->mutable_data<T>(ctx.GetPlace());
// initialize
auto param_help_eigen = framework::EigenVector<T>::Flatten(param_help);
auto input_help_eigen = framework::EigenVector<T>::Flatten(*input_help);
auto ins_rank_eigen = framework::EigenVector<T>::Flatten(*ins_rank);
auto out_eigen = framework::EigenVector<T>::Flatten(*Out);
auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
.eigen_device();
param_help_eigen.device(place) =
param_help_eigen.constant(static_cast<T>(0));
input_help_eigen.device(place) =
input_help_eigen.constant(static_cast<T>(0));
ins_rank_eigen.device(place) = ins_rank_eigen.constant(static_cast<T>(-1));
out_eigen.device(place) = out_eigen.constant(static_cast<T>(0));
// get data ptr
T *input_help_data = input_help->data<T>();
T *param_help_data = param_help.data<T>();
T *ins_rank_data = ins_rank->data<T>();
T *out_data = Out->data<T>();
expand_rank_attention_input(
ctx.cuda_device_context().stream(), X->data<T>(), ins_num, x_fea_dim,
input_help_data, ins_num, block_matrix_row, rank_offset->data<int>(),
......@@ -110,10 +119,6 @@ class RankAttentionCUDAKernel : public framework::OpKernel<T> {
blas.BatchedGEMM(transA, transB, 1, para_col, block_matrix_row, alpha,
input_help_data, param_help_data, beta, out_data, ins_num,
strideA, strideB);
platform::RecordedCudaFree(param_help_data, param_help_size, device_id);
platform::RecordedCudaFree(input_help_data, input_help_size, device_id);
platform::RecordedCudaFree(ins_rank_data, ins_rank_size, device_id);
}
};
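
With this change the forward kernel sizes its helper tensors by max(ins_num, MaxSize) and obtains them from the framework allocator instead of raw RecordedCudaMalloc calls; the batched GEMM itself is unchanged. A NumPy sketch of that GEMM's shape contract (illustrative sizes, not Paddle code):

import numpy as np

ins_num, max_rank, x_fea_dim, para_col = 4, 3, 5, 8
K = max_rank * x_fea_dim                            # block_matrix_row
input_help = np.random.rand(ins_num, K)             # expanded input, one 1 x K row per instance
param_help = np.random.rand(ins_num, K, para_col)   # expanded params, one K x N block per instance
# BatchedGEMM(M=1, N=para_col, K=block_matrix_row, batch=ins_num):
out = np.einsum('ik,ikn->in', input_help, param_help)
assert out.shape == (ins_num, para_col)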
......@@ -121,10 +126,13 @@ template <typename DeviceContext, typename T>
class RankAttentionGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *X = ctx.Input<Tensor>("X");
auto *rank_offset = ctx.Input<Tensor>("RankOffset");
auto *param = ctx.Input<Tensor>("RankParam");
auto *X = ctx.Input<Tensor>("X"); // not use data
auto *rank_offset = ctx.Input<Tensor>("RankOffset"); // not use data
auto *param = ctx.Input<Tensor>("RankParam"); // not use data
auto *input_help = ctx.Input<Tensor>("InputHelp");
auto *ins_rank = ctx.Input<Tensor>("InsRank");
auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
int64_t max_size = ctx.Attr<int>("MaxSize");
auto *drank_para = ctx.Output<Tensor>(framework::GradVarName("RankParam"));
......@@ -142,38 +150,26 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel<T> {
auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
.eigen_device();
int max_ins = std::max(ins_num, max_size);
// initialize out grad
drank_para->mutable_data<T>(ctx.GetPlace());
auto drank_para_eigen = framework::EigenVector<T>::Flatten(*drank_para);
drank_para_eigen.device(place) =
drank_para_eigen.constant(static_cast<T>(0));
auto stream = ctx.cuda_device_context().stream();
int device_id = platform::GetCurrentDeviceId();
T *param_grad_data;
auto param_grad_size = ins_num * block_matrix_row * para_col * sizeof(T);
platform::RecordedCudaMalloc(reinterpret_cast<void **>(&param_grad_data),
param_grad_size, device_id);
platform::GpuMemsetAsync(param_grad_data, 0, param_grad_size, stream);
T *input_help_data;
auto input_help_size = ins_num * block_matrix_row * sizeof(T);
platform::RecordedCudaMalloc(reinterpret_cast<void **>(&input_help_data),
input_help_size, device_id);
platform::GpuMemsetAsync(input_help_data, 0, input_help_size, stream);
T *ins_rank_data;
auto ins_rank_size = ins_num * sizeof(T);
platform::RecordedCudaMalloc(reinterpret_cast<void **>(&ins_rank_data),
ins_rank_size, device_id);
platform::GpuMemsetAsync(ins_rank_data, -1, ins_rank_size, stream);
// expand input
expand_rank_attention_input(
ctx.cuda_device_context().stream(), X->data<T>(), ins_num, x_fea_dim,
input_help_data, ins_num, block_matrix_row, rank_offset->data<int>(),
rank_offset_dims[0], rank_offset_dims[1], ins_rank_data, max_rank);
// copy data
Tensor param_grad;
param_grad = ctx.AllocateTmpTensor<T, DeviceContext>(
{max_ins * block_matrix_row, para_col}, dev_ctx);
param_grad.mutable_data<T>(ctx.GetPlace());
// initialize
auto param_grad_eigen = framework::EigenVector<T>::Flatten(param_grad);
param_grad_eigen.device(place) =
param_grad_eigen.constant(static_cast<T>(0));
// get data ptr
const T *input_help_data = input_help->data<T>();
const T *ins_rank_data = ins_rank->data<T>();
T *param_grad_data = param_grad.data<T>();
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
T alpha = 1;
......@@ -184,20 +180,14 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel<T> {
CBLAS_TRANSPOSE transB = CblasNoTrans;
int64_t strideA = block_matrix_row;
int64_t strideB = para_col;
blas.BatchedGEMM(transA, transB, block_matrix_row, para_col, 1, alpha,
input_help_data, dout->data<T>(), beta, param_grad_data,
ins_num, strideA, strideB);
// merge param_grad to get drank_para
merge_rank_attention_param_grad(
ctx.cuda_device_context().stream(), param_grad_data,
ins_num * block_matrix_row, para_col, drank_para->data<T>(), para_row,
para_col, ins_rank_data, ins_num, max_rank, x_fea_dim);
platform::RecordedCudaFree(param_grad_data, param_grad_size, device_id);
platform::RecordedCudaFree(input_help_data, input_help_size, device_id);
platform::RecordedCudaFree(ins_rank_data, ins_rank_size, device_id);
}
};
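
The gradient kernel now reads InputHelp and InsRank from the forward pass, so only the temporary param_grad tensor is allocated here. Its per-instance GEMM is an outer product, sketched in NumPy below (illustrative sizes; the merge step is only described in the comment):

import numpy as np

ins_num, max_rank, x_fea_dim, para_col = 4, 3, 5, 8
K = max_rank * x_fea_dim                      # block_matrix_row
input_help = np.random.rand(ins_num, K)       # reused from the forward pass
dout = np.random.rand(ins_num, para_col)      # Out@GRAD
# BatchedGEMM(M=block_matrix_row, N=para_col, K=1, batch=ins_num):
# param_grad[i] = input_help[i]^T (K x 1) @ dout[i] (1 x N)
param_grad = np.einsum('ik,in->ikn', input_help, dout)
assert param_grad.shape == (ins_num, K, para_col)
# merge_rank_attention_param_grad then accumulates these per-instance blocks
# into RankParam@GRAD, using ins_rank to locate each instance's parameter rows.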
......
......@@ -1236,7 +1236,8 @@ def rank_attention(input,
rank_offset,
rank_param_shape,
rank_param_attr,
max_rank=3):
max_rank=3,
max_size=0):
"""
**Rank Attention layer**
This Op can calculate rank attention between input and rank_param, and
......@@ -1266,7 +1267,8 @@ def rank_attention(input,
name="ubm_rank_param.w_0",
initializer=
fluid.initializer.Xavier(uniform=False)),
max_rank=3)
max_rank=3,
max_size=0)
"""
helper = LayerHelper('rank_attention', **locals())
dtype = helper.input_dtype(input_param_name='input')
......@@ -1278,6 +1280,8 @@ def rank_attention(input,
rank_param.stop_gradient = False
output = helper.create_variable_for_type_inference(dtype)
input_help = helper.create_variable_for_type_inference(
dtype=dtype, stop_gradient=True)
ins_rank = helper.create_variable_for_type_inference(
dtype=dtype, stop_gradient=True)
......@@ -1288,7 +1292,9 @@ def rank_attention(input,
"RankOffset": rank_offset,
"RankParam": rank_param
},
outputs={"Out": output},
attrs={"MaxRank": max_rank})
outputs={"Out": output,
"InputHelp": input_help,
"InsRank": ins_rank},
attrs={"MaxRank": max_rank,
"MaxSize": max_size})
return output
......@@ -22,10 +22,11 @@ from op_test import OpTest, skip_check_grad_ci
import paddle.fluid.core as core
def gen_input_help(input, rank_offset, max_rank):
def gen_input_help(input, rank_offset, max_rank, max_size):
input_row, input_col = input.shape
input_help = np.zeros((input_row * max_rank * input_col, ))
ins_rank = np.zeros((input_row, 1))
max_ins = np.max((max_size, input_row))
input_help = np.zeros((max_ins * max_rank * input_col))
ins_rank = np.zeros((max_ins, 1))
ins_rank.fill(-1)
output_col = max_rank * input_col
......@@ -46,7 +47,7 @@ def gen_input_help(input, rank_offset, max_rank):
rank_input_col_idx = output_col_idx % input_col
index = rank_offset[output_row_idx, 2 * k + 2]
input_help[idx] = input[index, rank_input_col_idx]
input_help = input_help.reshape([input_row, max_rank * input_col])
input_help = input_help.reshape([max_ins, max_rank * input_col])
return input_help, ins_rank
......@@ -83,7 +84,7 @@ def gen_param_help(input, rank_offset, param, max_rank):
return output_param
def np_rank_attention(input, rank_offset, rank_para, max_rank):
def np_rank_attention(input, rank_offset, rank_para, max_rank, max_size):
input_row, input_col = input.shape
rank_offset_row, rank_offset_col = rank_offset.shape
rank_para_row, rank_para_col = rank_para.shape
......@@ -92,7 +93,8 @@ def np_rank_attention(input, rank_offset, rank_para, max_rank):
assert (max_rank == ((rank_offset_col - 1) / 2))
assert (rank_para_row == max_rank * max_rank * input_col)
input_help, ins_rank = gen_input_help(input, rank_offset, max_rank)
input_help, ins_rank = gen_input_help(input, rank_offset, max_rank,
max_size)
param_help = gen_param_help(input, rank_offset, rank_para, max_rank)
block_matrix_row = input_col * max_rank
......@@ -159,14 +161,19 @@ class TestRankAttentionOpComplex(OpTest):
]
rank_para = np.random.random(rank_para_shape).astype(self.dtype)
np_out, np_input_help, np_param_help, np_ins_rank = np_rank_attention(
input, np.array(rank_offset), rank_para, self.max_rank)
input,
np.array(rank_offset), rank_para, self.max_rank, self.pv_num * 7)
self.inputs = {
"X": input,
"RankOffset": np.array(rank_offset).astype("int32"),
"RankParam": rank_para
}
self.attrs = {'MaxRank': self.max_rank}
self.outputs = {"Out": np_out}
self.attrs = {'MaxRank': self.max_rank, 'MaxSize': self.pv_num * 7}
self.outputs = {
"Out": np_out,
"InputHelp": np_input_help,
"InsRank": np_ins_rank
}
def test_check_output_gpu(self):
if core.is_compiled_with_cuda():
......@@ -195,14 +202,19 @@ class TestRankAttentionOpCpu(OpTest):
]
rank_para = np.random.random(rank_para_shape).astype(self.dtype)
np_out, np_input_help, np_param_help, np_ins_rank = np_rank_attention(
input, np.array(rank_offset), rank_para, self.max_rank)
input,
np.array(rank_offset), rank_para, self.max_rank, self.pv_num * 7)
self.inputs = {
"X": input,
"RankOffset": np.array(rank_offset).astype("int32"),
"RankParam": rank_para
}
self.attrs = {'MaxRank': self.max_rank}
self.outputs = {"Out": np_out}
self.attrs = {'MaxRank': self.max_rank, 'MaxSize': self.pv_num * 7}
self.outputs = {
"Out": np_out,
"InputHelp": np_input_help,
"InsRank": np_ins_rank
}
def test_check_output_cpu(self):
try:
......