Unverified commit 071708fa, authored by: T taixiurong, committed by: GitHub

xpu-paddlepaddle-41 [Task] ffn and attention test=kunlun (#46658)

Parent b4460eee
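For orientation, the sketch below shows how these new kernels are typically reached from Python; it is an illustrative addition, not part of the commit. It assumes the paddle.incubate.nn layers FusedMultiHeadAttention and FusedFeedForward dispatch to the fused_attention / fused_feedforward XPU kernels when running on a Kunlun device, and that the constructor arguments shown match the incubate API of this release.

import paddle
from paddle.incubate.nn import FusedMultiHeadAttention, FusedFeedForward

# Assumes a PaddlePaddle build with WITH_XPU=ON and a Kunlun device available.
paddle.set_device("xpu")

x = paddle.rand([8, 128, 1024], dtype="float32")  # [batch, seq_len, embed_dim]
attn = FusedMultiHeadAttention(embed_dim=1024, num_heads=16,
                               dropout_rate=0.0, attn_dropout_rate=0.0,
                               normalize_before=True)  # pre_layer_norm path
ffn = FusedFeedForward(d_model=1024, dim_feedforward=4096,
                       dropout_rate=0.0, activation="gelu",
                       normalize_before=True)
out = ffn(attn(x))  # runs fused_attention, then fused_feedforward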
@@ -38,6 +38,8 @@ if(WITH_XPU)
op_library(resnet_basic_block_op)
op_library(resnet_unit_op)
op_library(fused_gemm_epilogue_op)
op_library(fused_attention_op)
op_library(fused_feedforward_op)
endif()
if(WITH_GPU OR WITH_ROCM)
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/fused/xpu_fused_common_function.h"
#include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/fluid/operators/xpu_api_wrapper.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle {
namespace operators {
using Tensor = phi::DenseTensor;
template <typename DeviceContext, typename T>
class FusedAttentionOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
using XPUTypeT = typename XPUTypeTrait<T>::Type;
// input tensors
auto *input_x = ctx.Input<Tensor>("X");
const auto pre_layer_norm = ctx.Attr<bool>("pre_layer_norm");
// shape [3, num_head, dim_head, dim_embed]
auto *qkv_weight = ctx.Input<Tensor>("QKVW");
// shape [3 , num_head, dim_head]
auto *qkv_bias = ctx.Input<Tensor>("QKVBias");
// shape [batch_size, 1, 1, seq_len]
auto *src_mask = ctx.Input<Tensor>("SrcMask");
// shape [dim_embed, dim_embed]
auto *out_linear_weight = ctx.Input<Tensor>("OutLinearW");
// shape [dim_embed]
auto *out_linear_bias = ctx.Input<Tensor>("OutLinearBias");
const Tensor *ln_scale = nullptr;
const Tensor *ln_bias = nullptr;
float epsilon = 0.0f;
if (pre_layer_norm) {
ln_scale = ctx.Input<Tensor>("LnScale");
ln_bias = ctx.Input<Tensor>("LnBias");
epsilon = ctx.Attr<float>("epsilon");
} else {
ln_scale = ctx.Input<Tensor>("Ln2Scale");
ln_bias = ctx.Input<Tensor>("Ln2Bias");
epsilon = ctx.Attr<float>("ln_epsilon");
}
// output tensors
// QKV values after the transpose
// shape [3, batch_size, num_head, seq_len, dim_head]
auto *TransposeOut2 = ctx.Output<Tensor>("TransposeOut2");
// shape [batch_size, num_head, seq_len, seq_len]
auto *softmax_out = ctx.Output<Tensor>("SoftmaxOut");
// shape [batch_size, num_head, seq_len, seq_len]
auto *attn_dropout_mask_out = ctx.Output<Tensor>("AttnDropoutMaskOut");
// shape [batch_size, num_head, seq_len, seq_len]
auto *attn_dropout_out = ctx.Output<Tensor>("AttnDropoutOut");
// shape [batch_size, seq_len, num_head, dim_head]
auto *fmha_out = ctx.Output<Tensor>("FMHAOut");
// shape [batch_size, seq_len, dim_embed]
auto *dropout_mask_out = ctx.Output<Tensor>("DropoutMaskOut");
// final output
// shape [batch_size, seq_len, dim_embed]
auto *out = ctx.Output<Tensor>("Y");
// The following tensors need not be returned, but the new dygraph mode requires them
auto *QKOut = ctx.Output<Tensor>("QKOut");
QKOut->mutable_data<T>(ctx.GetPlace());
auto *QKTVOut = ctx.Output<Tensor>("QKTVOut");
QKTVOut->mutable_data<T>(ctx.GetPlace());
auto *OutLinearOut = ctx.Output<Tensor>("OutLinearOut");
OutLinearOut->mutable_data<T>(ctx.GetPlace());
auto *QKVBiasOut = ctx.Output<Tensor>("QKVBiasOut");
QKVBiasOut->mutable_data<T>(ctx.GetPlace());
auto *SrcMaskOut = ctx.Output<Tensor>("SrcMaskOut");
SrcMaskOut->mutable_data<T>(ctx.GetPlace());
auto *qkv_out = ctx.Output<Tensor>("QKVOut");
qkv_out->mutable_data<T>(ctx.GetPlace());
Tensor *bias_dropout_residual_out = nullptr;
Tensor *ln_mean = nullptr;
Tensor *ln_var = nullptr;
Tensor *ln_out = nullptr;
if (pre_layer_norm) {
ln_mean = ctx.Output<Tensor>("LnMean");
ln_var = ctx.Output<Tensor>("LnVariance");
ln_out = ctx.Output<Tensor>("LnOut");
} else {
ln_mean = ctx.Output<Tensor>("Ln2Mean");
ln_var = ctx.Output<Tensor>("Ln2Variance");
bias_dropout_residual_out = ctx.Output<Tensor>("BiasDropoutResidualOut");
}
// dropout info
float attn_dropout_rate = ctx.Attr<float>("attn_dropout_rate");
bool is_test_1 = ctx.Attr<bool>("is_test");
auto &dropout_implementation_1 =
ctx.Attr<std::string>("attn_dropout_implementation");
bool is_upscale_in_train_1 =
(dropout_implementation_1 == "upscale_in_train");
auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input<Tensor>("Seed1") : nullptr;
bool is_fix_seed_1 = ctx.Attr<bool>("attn_dropout_fix_seed");
int seed_val_1 = ctx.Attr<int>("attn_dropout_seed");
XPUDropoutParam attn_dropout_param;
attn_dropout_param.initXPUDropoutParam(attn_dropout_rate,
is_upscale_in_train_1,
is_test_1,
is_fix_seed_1,
seed_1,
seed_val_1);
XPUDropoutParam dropout_param(ctx, 0);
// compute the dimensions first
const auto input_x_dims = input_x->dims();
const auto qkv_w_dims = qkv_weight->dims();
int batch_size = input_x_dims[0];
int seq_len = input_x_dims[1];
int embed_dims = input_x_dims[2];
int num_heads = qkv_w_dims[1];
int head_dims = qkv_w_dims[2];
// input pointers
const XPUTypeT *input_x_ptr =
reinterpret_cast<const XPUTypeT *>(input_x->data<T>());
const XPUTypeT *qkv_weight_ptr =
reinterpret_cast<const XPUTypeT *>(qkv_weight->data<T>());
const XPUTypeT *qkv_bias_ptr =
reinterpret_cast<const XPUTypeT *>(qkv_bias->data<T>());
const XPUTypeT *src_mask_ptr =
(src_mask == nullptr)
? (nullptr)
: (reinterpret_cast<const XPUTypeT *>(src_mask->data<T>()));
const XPUTypeT *out_linear_weight_ptr =
reinterpret_cast<const XPUTypeT *>(out_linear_weight->data<T>());
const XPUTypeT *out_linear_bias_ptr =
reinterpret_cast<const XPUTypeT *>(out_linear_bias->data<T>());
const float *ln_scale_ptr =
(ln_scale == nullptr) ? (nullptr) : ln_scale->data<float>();
const float *ln_bias_ptr =
(ln_bias == nullptr) ? (nullptr) : ln_bias->data<float>();
// output pointers
XPUTypeT *qkv_transpose_out_ptr = reinterpret_cast<XPUTypeT *>(
TransposeOut2->mutable_data<T>(ctx.GetPlace()));
XPUTypeT *softmax_out_ptr = reinterpret_cast<XPUTypeT *>(
softmax_out->mutable_data<T>(ctx.GetPlace()));
XPUTypeT *attn_dropout_mask_out_ptr = reinterpret_cast<XPUTypeT *>(
attn_dropout_mask_out->mutable_data<T>(ctx.GetPlace()));
XPUTypeT *attn_dropout_out_ptr = reinterpret_cast<XPUTypeT *>(
attn_dropout_out->mutable_data<T>(ctx.GetPlace()));
XPUTypeT *fmha_out_ptr =
reinterpret_cast<XPUTypeT *>(fmha_out->mutable_data<T>(ctx.GetPlace()));
XPUTypeT *dropout_mask_out_ptr = reinterpret_cast<XPUTypeT *>(
dropout_mask_out->mutable_data<T>(ctx.GetPlace()));
XPUTypeT *out_ptr =
reinterpret_cast<XPUTypeT *>(out->mutable_data<T>(ctx.GetPlace()));
XPUTypeT *bias_dropout_residual_out_ptr =
(bias_dropout_residual_out == nullptr)
? (nullptr)
: (reinterpret_cast<XPUTypeT *>(
bias_dropout_residual_out->mutable_data<T>(ctx.GetPlace())));
float *ln_mean_ptr = (ln_mean == nullptr)
? (nullptr)
: ln_mean->mutable_data<float>(ctx.GetPlace());
float *ln_var_ptr = (ln_var == nullptr)
? (nullptr)
: ln_var->mutable_data<float>(ctx.GetPlace());
XPUTypeT *ln_out_ptr = (ln_out == nullptr)
? (nullptr)
: (reinterpret_cast<XPUTypeT *>(
ln_out->mutable_data<T>(ctx.GetPlace())));
auto &dev_ctx = ctx.template device_context<DeviceContext>();
xpu::Context *xpu_ctx = dev_ctx.x_context();
xpu::ctx_guard RAII_GUARD(xpu_ctx);
int l3_total_size = xpu_ctx->_l3_mgr.get_size();
XPUTypeT *qkv_before_transpos_ptr =
NULL; // x2[batch_size, seq_len, 3, num_heads,head_dims]
XPUTypeT *qk_ptr = NULL; // qk [batch_size, num_heads, seq_len, seq_len]
XPUTypeT *qkv_ptr = NULL; // qkv[batch_size, num_heads, seq_len, head_dims]
XPUTypeT *linear_out_ptr =
NULL; // x4, x5 [batch_size, seq_len, embed_dims]
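// Scratch-buffer plan: always allocate one GM buffer large enough for the
// biggest temporary below; if at least one temporary also fits in L3, allocate
// an L3 buffer of the largest such size and point every temporary that fits at
// it, leaving the bigger ones on the GM buffer.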
int temp_size_1 = batch_size * seq_len * 3 * num_heads * head_dims;
int temp_size_2 = batch_size * num_heads * seq_len * seq_len;
int temp_size_3 = batch_size * num_heads * seq_len * head_dims;
int temp_size_4 = batch_size * seq_len * embed_dims;
std::vector<int> temp_vec = {
temp_size_1, temp_size_2, temp_size_3, temp_size_4};
std::sort(temp_vec.begin(), temp_vec.end(), std::greater<int>());
XPUTypeT *max_gm_ptr = RAII_GUARD.alloc<XPUTypeT>(temp_vec[0]);
PADDLE_ENFORCE_XDNN_NOT_NULL(max_gm_ptr);
qkv_before_transpos_ptr = max_gm_ptr;
qk_ptr = max_gm_ptr;
qkv_ptr = max_gm_ptr;
linear_out_ptr = max_gm_ptr;
int sizeof_t = sizeof(XPUTypeT);
for (size_t i = 0; i < temp_vec.size(); ++i) {
if (l3_total_size >= temp_vec[i] * sizeof_t) {
XPUTypeT *l3_ptr = RAII_GUARD.alloc_l3<XPUTypeT>(temp_vec[i]);
qkv_before_transpos_ptr =
(temp_size_1 <= temp_vec[i]) ? l3_ptr : max_gm_ptr;
qk_ptr = (temp_size_2 <= temp_vec[i]) ? l3_ptr : max_gm_ptr;
qkv_ptr = (temp_size_3 <= temp_vec[i]) ? l3_ptr : max_gm_ptr;
linear_out_ptr = (temp_size_4 <= temp_vec[i]) ? l3_ptr : max_gm_ptr;
break;
}
}
int r = 0;
const XPUTypeT *x_cacl_ptr = input_x_ptr;
if (pre_layer_norm) {
r = xpu::layer_norm(xpu_ctx,
input_x_ptr,
ln_out_ptr,
batch_size * seq_len,
embed_dims,
epsilon,
ln_scale_ptr,
ln_bias_ptr,
ln_mean_ptr,
ln_var_ptr);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm");
x_cacl_ptr = ln_out_ptr;
}
// fc
phi::XpuFcInfo qkv_fc_info;
qkv_fc_info.InitFcInfo(0,
batch_size * seq_len,
3 * num_heads * head_dims,
embed_dims,
false,
true,
nullptr,
nullptr,
nullptr);
phi::MatMulXPUFunction<XPUTypeT>(xpu_ctx,
x_cacl_ptr,
qkv_weight_ptr,
qkv_before_transpos_ptr,
qkv_fc_info,
1.0f);
// bias
r = xpu::broadcast_add(xpu_ctx,
qkv_before_transpos_ptr,
qkv_bias_ptr,
qkv_before_transpos_ptr,
{batch_size * seq_len, 3 * num_heads * head_dims},
{3 * num_heads * head_dims});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");
// transpose
r = xpu::transpose(xpu_ctx,
qkv_before_transpos_ptr,
qkv_transpose_out_ptr,
{batch_size, seq_len, 3, num_heads, head_dims},
{2, 0, 3, 1, 4});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
int qkv_every_size = batch_size * seq_len * num_heads * head_dims;
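// Only the Q slice (the first qkv_every_size elements of the transposed
// buffer) is scaled by 1/sqrt(head_dims); K and V are left unscaled.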
{
float alpha = 1.0 / sqrt(head_dims);
r = scale(xpu_ctx,
qkv_transpose_out_ptr,
qkv_transpose_out_ptr,
qkv_every_size,
false,
alpha,
0.0f);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
}
// begin fmha
// 1. qk  2. qk + mask  3. softmax  4. dropout  5. qkv  6. transpose
{
const XPUTypeT *q_ptr = qkv_transpose_out_ptr;
const XPUTypeT *k_ptr = q_ptr + qkv_every_size;
const XPUTypeT *v_ptr = k_ptr + qkv_every_size;
phi::XpuFcInfo qk_fc_info;
qk_fc_info.InitFcInfo(batch_size * num_heads,
seq_len,
seq_len,
head_dims,
false,
true,
nullptr,
nullptr,
nullptr);
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, q_ptr, k_ptr, qk_ptr, qk_fc_info, 1.0f);
if (src_mask_ptr) {
r = xpu::broadcast_add(xpu_ctx,
qk_ptr,
src_mask_ptr,
qk_ptr,
{batch_size, num_heads, seq_len, seq_len},
{batch_size, 1, 1, seq_len});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");
}
// do softmax
r = xpu::softmax(xpu_ctx,
qk_ptr,
softmax_out_ptr,
{batch_size, num_heads, seq_len, seq_len},
3);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax");
// do dropout
Dropout<XPUTypeT>(xpu_ctx,
softmax_out_ptr,
attn_dropout_mask_out_ptr,
attn_dropout_out_ptr,
attn_dropout_param,
batch_size * num_heads * seq_len * seq_len);
phi::XpuFcInfo qktv_fc_info;
qktv_fc_info.InitFcInfo(batch_size * num_heads,
seq_len,
head_dims,
seq_len,
false,
false,
nullptr,
nullptr,
nullptr);
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, attn_dropout_out_ptr, v_ptr, qkv_ptr, qktv_fc_info, 1.0f);
r = xpu::transpose(xpu_ctx,
qkv_ptr,
fmha_out_ptr,
{batch_size, num_heads, seq_len, head_dims},
{0, 2, 1, 3});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
}
// linear_out
phi::XpuFcInfo linear_fc_info;
linear_fc_info.InitFcInfo(0,
batch_size * seq_len,
embed_dims,
embed_dims,
false,
false,
nullptr,
nullptr,
nullptr);
phi::MatMulXPUFunction<XPUTypeT>(xpu_ctx,
fmha_out_ptr,
out_linear_weight_ptr,
linear_out_ptr,
linear_fc_info,
1.0f);
// out_linear_bias_ptr
r = xpu::broadcast_add(xpu_ctx,
linear_out_ptr,
out_linear_bias_ptr,
linear_out_ptr,
{batch_size * seq_len, embed_dims},
{embed_dims});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");
Dropout(xpu_ctx,
linear_out_ptr,
dropout_mask_out_ptr,
linear_out_ptr,
dropout_param,
batch_size * seq_len * embed_dims);
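// pre-LN: the residual sum is written directly to Y; post-LN: it is written to
// BiasDropoutResidualOut and layer-normalized into Y below.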
XPUTypeT *real_out_ptr = out_ptr;
if (pre_layer_norm == false) {
real_out_ptr = bias_dropout_residual_out_ptr;
}
r = xpu::add(xpu_ctx,
linear_out_ptr,
input_x_ptr,
real_out_ptr,
batch_size * seq_len * embed_dims);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "add");
if (pre_layer_norm == false) {
r = xpu::layer_norm(xpu_ctx,
real_out_ptr,
out_ptr,
batch_size * seq_len,
embed_dims,
epsilon,
ln_scale_ptr,
ln_bias_ptr,
ln_mean_ptr,
ln_var_ptr);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm");
}
}
};
// template <typename T>
template <typename DeviceContext, typename T>
class FusedAttentionGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
using XPUTypeT = typename XPUTypeTrait<T>::Type;
const auto pre_layer_norm = ctx.Attr<bool>("pre_layer_norm");
// dropout info
float attn_dropout_prob = ctx.Attr<float>("attn_dropout_rate");
bool is_test_1 = ctx.Attr<bool>("is_test");
auto &dropout_implementation_1 =
ctx.Attr<std::string>("attn_dropout_implementation");
bool is_upscale_in_train_1 =
(dropout_implementation_1 == "upscale_in_train");
auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input<Tensor>("Seed1") : nullptr;
bool is_fix_seed_1 = ctx.Attr<bool>("attn_dropout_fix_seed");
int seed_val_1 = ctx.Attr<int>("attn_dropout_seed");
XPUDropoutParam attn_dropout_param;
attn_dropout_param.initXPUDropoutParam(attn_dropout_prob,
is_upscale_in_train_1,
is_test_1,
is_fix_seed_1,
seed_1,
seed_val_1);
XPUDropoutParam dropout_param(ctx, 0);
// get inputs.
auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const XPUTypeT *d_y_ptr =
reinterpret_cast<const XPUTypeT *>(d_y->data<T>());
// forward-pass tensors required by the backward computation
auto *input_x = ctx.Input<Tensor>("X");
const XPUTypeT *input_x_ptr =
reinterpret_cast<const XPUTypeT *>(input_x->data<T>());
auto *qkv_transpose_out = ctx.Input<Tensor>("TransposeOut2");
const XPUTypeT *qkv_transpose_out_ptr =
reinterpret_cast<const XPUTypeT *>(qkv_transpose_out->data<T>());
auto *qkv_weight = ctx.Input<Tensor>("QKVW");
const XPUTypeT *qkv_weight_ptr =
reinterpret_cast<const XPUTypeT *>(qkv_weight->data<T>());
auto *softmax_out = ctx.Input<Tensor>("SoftmaxOut");
const XPUTypeT *softmax_out_ptr =
reinterpret_cast<const XPUTypeT *>(softmax_out->data<T>());
auto *attn_dropout_out = ctx.Input<Tensor>("AttnDropoutOut");
const XPUTypeT *attn_dropout_out_ptr =
reinterpret_cast<const XPUTypeT *>(attn_dropout_out->data<T>());
auto *attn_dropout_mask = ctx.Input<Tensor>("AttnDropoutMaskOut");
const XPUTypeT *attn_dropout_mask_ptr =
reinterpret_cast<const XPUTypeT *>(attn_dropout_mask->data<T>());
auto *fmha_out = ctx.Input<Tensor>("FMHAOut");
const XPUTypeT *fmha_out_ptr =
reinterpret_cast<const XPUTypeT *>(fmha_out->data<T>());
auto *out_linear_weight = ctx.Input<Tensor>("OutLinearW");
const XPUTypeT *out_linear_weight_ptr =
reinterpret_cast<const XPUTypeT *>(out_linear_weight->data<T>());
auto *dropout_mask_out = ctx.Input<Tensor>("DropoutMaskOut");
const XPUTypeT *dropout_mask_out_ptr =
reinterpret_cast<const XPUTypeT *>(dropout_mask_out->data<T>());
// gradients to be computed
auto *d_qkv_weight = ctx.Output<Tensor>(framework::GradVarName("QKVW"));
XPUTypeT *d_qkv_weight_ptr = reinterpret_cast<XPUTypeT *>(
d_qkv_weight->mutable_data<T>(ctx.GetPlace()));
auto *d_qkv_bias = ctx.Output<Tensor>(framework::GradVarName("QKVBias"));
XPUTypeT *d_qkv_bias_ptr = reinterpret_cast<XPUTypeT *>(
d_qkv_bias->mutable_data<T>(ctx.GetPlace()));
auto *d_out_linear_weight =
ctx.Output<Tensor>(framework::GradVarName("OutLinearW"));
XPUTypeT *d_out_linear_weight_ptr = reinterpret_cast<XPUTypeT *>(
d_out_linear_weight->mutable_data<T>(ctx.GetPlace()));
auto *d_out_linear_bias =
ctx.Output<Tensor>(framework::GradVarName("OutLinearBias"));
XPUTypeT *d_out_linear_bias_ptr = reinterpret_cast<XPUTypeT *>(
d_out_linear_bias->mutable_data<T>(ctx.GetPlace()));
// possibly needed
auto *d_src_mask_out =
ctx.Output<Tensor>(framework::GradVarName("SrcMaskOut"));
XPUTypeT *d_src_mask_out_ptr =
(d_src_mask_out == nullptr)
? (nullptr)
: (reinterpret_cast<XPUTypeT *>(
d_src_mask_out->mutable_data<T>(ctx.GetPlace())));
// output dx
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
XPUTypeT *d_x_ptr =
reinterpret_cast<XPUTypeT *>(d_x->mutable_data<T>(ctx.GetPlace()));
const Tensor *ln_out = nullptr;
const Tensor *bias_dropout_residual_out = nullptr;
const Tensor *ln_scale = nullptr;
const Tensor *ln_mean = nullptr;
const Tensor *ln_var = nullptr;
Tensor *d_ln_scale = nullptr;
Tensor *d_ln_bias = nullptr;
const XPUTypeT *ln_out_ptr = NULL;
const float *ln_scale_ptr = NULL;
const float *ln_mean_ptr = NULL;
const float *ln_var_ptr = NULL;
const XPUTypeT *bias_dropout_residual_out_ptr = NULL;
float *d_ln_scale_ptr = nullptr;
float *d_ln_bias_ptr = nullptr;
float epsilon = 0.0f;
if (pre_layer_norm) {
ln_out = ctx.Input<Tensor>("LnOut");
ln_out_ptr = reinterpret_cast<const XPUTypeT *>(ln_out->data<T>());
ln_scale = ctx.Input<Tensor>("LnScale");
ln_mean = ctx.Input<Tensor>("LnMean");
ln_var = ctx.Input<Tensor>("LnVariance");
epsilon = ctx.Attr<float>("epsilon");
d_ln_scale = ctx.Output<Tensor>(framework::GradVarName("LnScale"));
d_ln_bias = ctx.Output<Tensor>(framework::GradVarName("LnBias"));
} else {
ln_scale = ctx.Input<Tensor>("Ln2Scale");
ln_mean = ctx.Input<Tensor>("Ln2Mean");
ln_var = ctx.Input<Tensor>("Ln2Variance");
epsilon = ctx.Attr<float>("ln_epsilon");
d_ln_scale = ctx.Output<Tensor>(framework::GradVarName("Ln2Scale"));
d_ln_bias = ctx.Output<Tensor>(framework::GradVarName("Ln2Bias"));
bias_dropout_residual_out = ctx.Input<Tensor>("BiasDropoutResidualOut");
bias_dropout_residual_out_ptr = reinterpret_cast<const XPUTypeT *>(
bias_dropout_residual_out->data<T>());
}
ln_scale_ptr = ln_scale->data<float>();
ln_mean_ptr = ln_mean->data<float>();
ln_var_ptr = ln_var->data<float>();
d_ln_scale_ptr = d_ln_scale->mutable_data<float>(ctx.GetPlace());
d_ln_bias_ptr = d_ln_bias->mutable_data<float>(ctx.GetPlace());
const auto input_x_dims = input_x->dims();
const auto qkv_w_dims = qkv_weight->dims();
int batch_size = input_x_dims[0];
int seq_len = input_x_dims[1];
int embed_dims = input_x_dims[2];
int num_heads = qkv_w_dims[1];
int head_dims = qkv_w_dims[2];
auto &dev_ctx = ctx.template device_context<DeviceContext>();
xpu::Context *xpu_ctx = dev_ctx.x_context();
xpu::ctx_guard RAII_GUARD(xpu_ctx);
int r = 0;
// int l3_total_size = xpu_ctx->_l3_mgr.get_size();
XPUTypeT *d_ln_grad_ptr = NULL; // dx5 [batch_size, seq_len, hidden]
XPUTypeT *d_dropout_grad_ptr = NULL; // dx5 [batch_size, seq_len, hidden]
XPUTypeT *d_fmha_out_ptr =
NULL; // d_fmha_out [batch_size, seq_len, num_heads, head_dims]
XPUTypeT *d_fmha_out_transpos_tmp_ptr =
NULL; // d_fmha_out_transpos [batch_size, seq_len, num_heads,
// head_dims]
XPUTypeT *d_qk_ptr =
NULL; // d_qk_ptr[batch_size, num_heads, seq_len, seq_len]
XPUTypeT *d_combination_qkv_ptr =
NULL; // d_combination_qkv_ptr[3, batch_size, num_heads, seq_len,
// head_dims]
XPUTypeT *d_transpos_qkv_ptr =
NULL; // dx2 [batch_size, seq_len, 3, num_heads, head_dims]
XPUTypeT *d_last_layernorm_grad_ptr =
NULL; // d_layer_out [batch_size, seq_len, embed_dims]
const XPUTypeT *dy_input_ptr = d_y_ptr;
d_ln_grad_ptr =
RAII_GUARD.alloc<XPUTypeT>(batch_size * seq_len * embed_dims);
d_dropout_grad_ptr =
RAII_GUARD.alloc_l3_or_gm<XPUTypeT>(batch_size * seq_len * embed_dims);
d_fmha_out_ptr = RAII_GUARD.alloc_l3_or_gm<XPUTypeT>(batch_size * seq_len *
num_heads * head_dims);
d_combination_qkv_ptr =
RAII_GUARD.alloc<XPUTypeT>(batch_size * seq_len * embed_dims * 3);
d_transpos_qkv_ptr = RAII_GUARD.alloc_l3_or_gm<XPUTypeT>(
batch_size * seq_len * embed_dims * 3);
d_fmha_out_transpos_tmp_ptr =
RAII_GUARD.alloc_l3_or_gm<XPUTypeT>(batch_size * seq_len * embed_dims);
d_qk_ptr = RAII_GUARD.alloc_l3_or_gm<XPUTypeT>(batch_size * seq_len *
seq_len * num_heads);
d_last_layernorm_grad_ptr =
RAII_GUARD.alloc_l3_or_gm<XPUTypeT>(batch_size * seq_len * embed_dims);
if (pre_layer_norm == false) {
r = xpu::layer_norm_grad(xpu_ctx,
bias_dropout_residual_out_ptr,
d_y_ptr,
d_ln_grad_ptr,
batch_size * seq_len,
embed_dims,
epsilon,
ln_scale_ptr,
ln_mean_ptr,
ln_var_ptr,
d_ln_scale_ptr,
d_ln_bias_ptr);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad");
dy_input_ptr = d_ln_grad_ptr;
}
// dropout_grad
DropoutGrad<XPUTypeT>(xpu_ctx,
dy_input_ptr,
dropout_mask_out_ptr,
d_dropout_grad_ptr,
dropout_param,
batch_size * num_heads * seq_len * head_dims);
// linear_out
phi::XpuFcInfo linear_fc_info;
linear_fc_info.InitFcInfo(0,
batch_size * seq_len,
embed_dims,
embed_dims,
false,
false,
nullptr,
nullptr,
nullptr);
const XPUTypeT *a_1 = reinterpret_cast<const XPUTypeT *>(NULL);
const XPUTypeT *b_1 = reinterpret_cast<const XPUTypeT *>(NULL);
const XPUTypeT *a_2 = reinterpret_cast<const XPUTypeT *>(NULL);
const XPUTypeT *b_2 = reinterpret_cast<const XPUTypeT *>(NULL);
XPUTypeT *c_1 = d_fmha_out_ptr;
XPUTypeT *c_2 = d_out_linear_weight_ptr;
phi::XpuFcInfo info_dfmha;
phi::XpuFcInfo info_dlinear_w;
std::tuple<phi::XpuFcInfo,
phi::XpuFcInfo,
const XPUTypeT *,
const XPUTypeT *,
const XPUTypeT *,
const XPUTypeT *>
fc_info = phi::MatmulGradFcInfo(xpu_ctx,
&RAII_GUARD,
linear_fc_info,
false,
false,
fmha_out_ptr,
out_linear_weight_ptr,
d_dropout_grad_ptr);
std::tie(info_dfmha, info_dlinear_w, a_1, b_1, a_2, b_2) = fc_info;
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, a_2, b_2, c_2, info_dlinear_w, 1.0f, true);
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, a_1, b_1, c_1, info_dfmha, 1.0f, true);
// dlinear_bias
r = xpu::reduce_sum(xpu_ctx,
d_dropout_grad_ptr,
d_out_linear_bias_ptr,
{batch_size * seq_len, embed_dims},
{0});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
{
int qkv_size = batch_size * seq_len * num_heads * head_dims;
const XPUTypeT *q_out_ptr = qkv_transpose_out_ptr;
const XPUTypeT *k_out_ptr = q_out_ptr + qkv_size;
const XPUTypeT *v_out_ptr = k_out_ptr + qkv_size;
XPUTypeT *d_q_out_ptr = d_combination_qkv_ptr;
XPUTypeT *d_k_out_ptr = d_q_out_ptr + qkv_size;
XPUTypeT *d_v_out_ptr = d_k_out_ptr + qkv_size;
r = xpu::transpose<XPUTypeT>(xpu_ctx,
d_fmha_out_ptr,
d_fmha_out_transpos_tmp_ptr,
{batch_size, seq_len, num_heads, head_dims},
{0, 2, 1, 3});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
phi::XpuFcInfo qktv_fc_info;
qktv_fc_info.InitFcInfo(batch_size * num_heads,
seq_len,
head_dims,
seq_len,
false,
false,
nullptr,
nullptr,
nullptr);
const XPUTypeT *a_1 = reinterpret_cast<const XPUTypeT *>(NULL);
const XPUTypeT *b_1 = reinterpret_cast<const XPUTypeT *>(NULL);
const XPUTypeT *a_2 = reinterpret_cast<const XPUTypeT *>(NULL);
const XPUTypeT *b_2 = reinterpret_cast<const XPUTypeT *>(NULL);
XPUTypeT *c_1 = d_qk_ptr;
XPUTypeT *c_2 = d_v_out_ptr;
phi::XpuFcInfo info_d_qk;
phi::XpuFcInfo info_d_v;
std::tuple<phi::XpuFcInfo,
phi::XpuFcInfo,
const XPUTypeT *,
const XPUTypeT *,
const XPUTypeT *,
const XPUTypeT *>
fc_info = phi::MatmulGradFcInfo(xpu_ctx,
&RAII_GUARD,
qktv_fc_info,
false,
false,
attn_dropout_out_ptr,
v_out_ptr,
d_fmha_out_transpos_tmp_ptr);
std::tie(info_d_qk, info_d_v, a_1, b_1, a_2, b_2) = fc_info;
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, a_1, b_1, c_1, info_d_qk, 1.0f, true);
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, a_2, b_2, c_2, info_d_v, 1.0f, true);
DropoutGrad<XPUTypeT>(xpu_ctx,
d_qk_ptr,
attn_dropout_mask_ptr,
d_qk_ptr,
attn_dropout_param,
batch_size * seq_len * seq_len * num_heads);
r = xpu::softmax_grad<XPUTypeT>(xpu_ctx,
softmax_out_ptr,
d_qk_ptr,
d_qk_ptr,
{batch_size, num_heads, seq_len, seq_len},
3);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax_grad");
if (d_src_mask_out_ptr) {
r = xpu::copy<XPUTypeT>(xpu_ctx,
d_qk_ptr,
d_src_mask_out_ptr,
batch_size * seq_len * seq_len * num_heads);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
}
phi::XpuFcInfo qk_fc_info;
qk_fc_info.InitFcInfo(batch_size * num_heads,
seq_len,
seq_len,
head_dims,
false,
true,
nullptr,
nullptr,
nullptr);
a_1 = reinterpret_cast<const XPUTypeT *>(NULL);
b_1 = reinterpret_cast<const XPUTypeT *>(NULL);
a_2 = reinterpret_cast<const XPUTypeT *>(NULL);
b_2 = reinterpret_cast<const XPUTypeT *>(NULL);
c_1 = d_q_out_ptr;
c_2 = d_k_out_ptr;
phi::XpuFcInfo info_d_q;
phi::XpuFcInfo info_d_k;
fc_info = phi::MatmulGradFcInfo(xpu_ctx,
&RAII_GUARD,
qk_fc_info,
false,
true,
q_out_ptr,
k_out_ptr,
d_qk_ptr);
std::tie(info_d_q, info_d_k, a_1, b_1, a_2, b_2) = fc_info;
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, a_1, b_1, c_1, info_d_q, 1.0f / sqrt(head_dims), true);
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, a_2, b_2, c_2, info_d_k, 1.0f, true);
}
// transpose the combined dq/dk/dv back to [batch_size, seq_len, 3, num_heads, head_dims]
r = xpu::transpose<XPUTypeT>(xpu_ctx,
d_combination_qkv_ptr,
d_transpos_qkv_ptr,
{3, batch_size, num_heads, seq_len, head_dims},
{1, 3, 0, 2, 4});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
// dx and d_qkv_w
phi::XpuFcInfo qkv_fc_info;
qkv_fc_info.InitFcInfo(0,
batch_size * seq_len,
3 * num_heads * head_dims,
embed_dims,
false,
true,
nullptr,
nullptr,
nullptr);
a_1 = reinterpret_cast<const XPUTypeT *>(NULL);
b_1 = reinterpret_cast<const XPUTypeT *>(NULL);
a_2 = reinterpret_cast<const XPUTypeT *>(NULL);
b_2 = reinterpret_cast<const XPUTypeT *>(NULL);
c_1 = (pre_layer_norm == true) ? d_last_layernorm_grad_ptr : d_x_ptr;
c_2 = d_qkv_weight_ptr;
phi::XpuFcInfo info_d_x;
phi::XpuFcInfo info_d_qkv_w;
const XPUTypeT *use_calc_input_x_ptr =
(pre_layer_norm == true) ? ln_out_ptr : input_x_ptr;
fc_info = phi::MatmulGradFcInfo(xpu_ctx,
&RAII_GUARD,
qkv_fc_info,
false,
true,
use_calc_input_x_ptr,
qkv_weight_ptr,
d_transpos_qkv_ptr);
std::tie(info_d_x, info_d_qkv_w, a_1, b_1, a_2, b_2) = fc_info;
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, a_1, b_1, c_1, info_d_x, 1.0f, true);
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, a_2, b_2, c_2, info_d_qkv_w, 1.0f, true);
// d_qkv_bias
r = xpu::reduce_sum(xpu_ctx,
d_transpos_qkv_ptr,
d_qkv_bias_ptr,
{batch_size * seq_len, 3 * embed_dims},
{0});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
if (pre_layer_norm) {
r = xpu::layer_norm_grad(xpu_ctx,
input_x_ptr,
c_1,
d_x_ptr,
batch_size * seq_len,
embed_dims,
epsilon,
ln_scale_ptr,
ln_mean_ptr,
ln_var_ptr,
d_ln_scale_ptr,
d_ln_bias_ptr);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad");
}
// add the residual-path gradient dy into dx
r = xpu::add(xpu_ctx,
dy_input_ptr,
d_x_ptr,
d_x_ptr,
batch_size * seq_len * embed_dims);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "add");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(
fused_attention,
ops::FusedAttentionOpKernel<phi::XPUContext, float>,
ops::FusedAttentionOpKernel<phi::XPUContext, paddle::platform::float16>);
REGISTER_OP_XPU_KERNEL(
fused_attention_grad,
ops::FusedAttentionGradXPUKernel<phi::XPUContext, float>,
ops::FusedAttentionGradXPUKernel<phi::XPUContext,
paddle::platform::float16>);
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/fluid/operators/xpu_api_wrapper.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/fluid/operators/fused/xpu_fused_common_function.h"
namespace paddle {
namespace operators {
using Tensor = phi::DenseTensor;
template <typename DeviceContext, typename T>
class FusedFeedForwardXPUKernel : public framework::OpKernel<T> {
using XPUTypeT = typename XPUTypeTrait<T>::Type;
public:
void FFN(const phi::XPUContext& dev_ctx,
const Tensor* x,
const Tensor* linear1_weight,
const Tensor* linear1_bias,
const Tensor* linear2_weight,
const Tensor* linear2_bias,
const Tensor* ln_scale,
const Tensor* ln_bias,
Tensor* out,
Tensor* dropout1_mask,
Tensor* dropout2_mask,
Tensor* ln_mean,
Tensor* ln_variance,
Tensor* linear1_out,
Tensor* ln1_out,
Tensor* dropout1_out,
Tensor* dropout2_out,
const int bsz_seq,
const int d_model,
const int dim_feedforward,
const std::string& act_method,
const bool pre_layer_norm,
const float epsilon1,
const float epsilon2,
const XPUDropoutParam& dropout_param1,
const XPUDropoutParam& dropout_param2,
int ring_id) const {
xpu::Context* xpu_ctx = dev_ctx.x_context();
xpu::ctx_guard RAII_GUARD(xpu_ctx);
int r = xpu::SUCCESS;
const XPUTypeT* x_ptr = reinterpret_cast<const XPUTypeT*>(x->data<T>());
const XPUTypeT* residual_ptr = x_ptr;
const XPUTypeT* linear1_weight_ptr =
reinterpret_cast<const XPUTypeT*>(linear1_weight->data<T>());
const XPUTypeT* linear1_bias_ptr =
reinterpret_cast<const XPUTypeT*>(linear1_bias->data<T>());
const XPUTypeT* linear2_weight_ptr =
reinterpret_cast<const XPUTypeT*>(linear2_weight->data<T>());
const XPUTypeT* linear2_bias_ptr =
reinterpret_cast<const XPUTypeT*>(linear2_bias->data<T>());
const float* ln_scale_ptr = ln_scale->data<float>();
const float* ln_bias_ptr = ln_bias->data<float>();
// out
XPUTypeT* out_ptr = reinterpret_cast<XPUTypeT*>(out->data<T>());
XPUTypeT* linear1_out_ptr =
reinterpret_cast<XPUTypeT*>(linear1_out->data<T>());
XPUTypeT* dropout1_mask_ptr =
reinterpret_cast<XPUTypeT*>(dropout1_mask->data<T>());
XPUTypeT* dropout2_mask_ptr =
reinterpret_cast<XPUTypeT*>(dropout2_mask->data<T>());
float* ln_mean_ptr = ln_mean->data<float>();
float* ln_variance_ptr = ln_variance->data<float>();
XPUTypeT* dropout1_out_ptr =
reinterpret_cast<XPUTypeT*>(dropout1_out->data<T>());
XPUTypeT* dropout2_out_ptr =
reinterpret_cast<XPUTypeT*>(dropout2_out->data<T>());
size_t l3_total_size = xpu_ctx->_l3_mgr.get_size();
XPUTypeT* linear2_before_tmp_ptr = NULL; // dim_feedforward * bsz_seq
XPUTypeT* linear2_after_tmp_ptr = NULL; // d_model * bsz_seq
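// Temporary-buffer plan: if L3 can hold the big [bsz_seq, dim_feedforward]
// buffer, both temporaries share it; if only the small [bsz_seq, d_model]
// buffer fits, keep that one in L3 and allocate the big one in GM; otherwise
// both share a single GM buffer.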
if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T)) {
XPUTypeT* l3_ptr =
RAII_GUARD.alloc_l3<XPUTypeT>(dim_feedforward * bsz_seq);
PADDLE_ENFORCE_XDNN_NOT_NULL(l3_ptr);
linear2_before_tmp_ptr = linear2_after_tmp_ptr = l3_ptr;
} else if ((l3_total_size < dim_feedforward * bsz_seq * sizeof(T)) &&
(l3_total_size >= d_model * bsz_seq * sizeof(T))) {
XPUTypeT* l3_ptr = RAII_GUARD.alloc_l3<XPUTypeT>(d_model * bsz_seq);
PADDLE_ENFORCE_XDNN_NOT_NULL(l3_ptr);
linear2_after_tmp_ptr = l3_ptr;
linear2_before_tmp_ptr =
RAII_GUARD.alloc<XPUTypeT>(dim_feedforward * bsz_seq);
PADDLE_ENFORCE_XDNN_NOT_NULL(linear2_before_tmp_ptr);
} else {
XPUTypeT* gm_ptr = RAII_GUARD.alloc<XPUTypeT>(dim_feedforward * bsz_seq);
PADDLE_ENFORCE_XDNN_NOT_NULL(gm_ptr);
linear2_before_tmp_ptr = linear2_after_tmp_ptr = gm_ptr;
}
// layernorm
if (pre_layer_norm) {
XPUTypeT* ln1_out_ptr = reinterpret_cast<XPUTypeT*>(ln1_out->data<T>());
r = xpu::layer_norm(xpu_ctx,
x_ptr,
ln1_out_ptr,
bsz_seq,
d_model,
epsilon1,
ln_scale_ptr,
ln_bias_ptr,
ln_mean_ptr,
ln_variance_ptr);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm ");
x_ptr = ln1_out_ptr;
}
// fc
phi::XpuFcInfo linear1_fc_info;
linear1_fc_info.InitFcInfo(0,
bsz_seq,
dim_feedforward,
d_model,
false,
false,
nullptr,
nullptr,
nullptr);
phi::MatMulXPUFunction<XPUTypeT>(xpu_ctx,
x_ptr,
linear1_weight_ptr,
linear2_before_tmp_ptr,
linear1_fc_info,
1.0f);
// bias
r = xpu::broadcast_add(xpu_ctx,
linear2_before_tmp_ptr,
linear1_bias_ptr,
linear1_out_ptr,
{bsz_seq, dim_feedforward},
{dim_feedforward});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");
// act
if (act_method == "gelu") {
r = xpu::gelu(xpu_ctx,
linear1_out_ptr,
linear2_before_tmp_ptr,
linear1_out->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu");
} else if (act_method == "relu") {
r = xpu::relu(xpu_ctx,
linear1_out_ptr,
linear2_before_tmp_ptr,
linear1_out->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu");
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Currently only supports gelu or relu activation functions!"));
}
// dropout1
Dropout<XPUTypeT>(xpu_ctx,
linear2_before_tmp_ptr,
dropout1_mask_ptr,
dropout1_out_ptr,
dropout_param1,
dropout1_out->numel());
// fc
phi::XpuFcInfo linear2_fc_info;
linear2_fc_info.InitFcInfo(0,
bsz_seq,
d_model,
dim_feedforward,
false,
false,
nullptr,
nullptr,
nullptr);
phi::MatMulXPUFunction<XPUTypeT>(xpu_ctx,
dropout1_out_ptr,
linear2_weight_ptr,
dropout2_out_ptr,
linear2_fc_info,
1.0f);
// bias
r = xpu::broadcast_add(xpu_ctx,
dropout2_out_ptr,
linear2_bias_ptr,
dropout2_out_ptr,
{bsz_seq, d_model},
{d_model});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");
// dropout2
Dropout<XPUTypeT>(xpu_ctx,
dropout2_out_ptr,
dropout2_mask_ptr,
dropout2_out_ptr,
dropout_param2,
dropout2_out->numel());
// residual_ptr + dropout_out
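// pre-LN: the residual sum is written directly to Out; post-LN: it is written
// back to dropout2_out and layer-normalized into Out below.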
XPUTypeT* residual_add_out_ptr = out_ptr;
if (pre_layer_norm == false) {
residual_add_out_ptr = dropout2_out_ptr;
}
r = xpu::broadcast_add(xpu_ctx,
residual_ptr,
dropout2_out_ptr,
residual_add_out_ptr,
{bsz_seq, d_model},
{bsz_seq, d_model});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");
if (pre_layer_norm == false) {
r = xpu::layer_norm(xpu_ctx,
residual_add_out_ptr,
out_ptr,
bsz_seq,
d_model,
epsilon2,
ln_scale_ptr,
ln_bias_ptr,
ln_mean_ptr,
ln_variance_ptr);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm");
}
}
void Compute(const framework::ExecutionContext& context) const override {
auto place = context.GetPlace();
auto* x = context.Input<Tensor>("X");
auto* linear1_weight = context.Input<Tensor>("Linear1Weight");
auto* linear1_bias = context.Input<Tensor>("Linear1Bias");
auto* linear2_weight = context.Input<Tensor>("Linear2Weight");
auto* linear2_bias = context.Input<Tensor>("Linear2Bias");
const bool pre_layer_norm = context.Attr<bool>("pre_layer_norm");
const Tensor* ln_scale = nullptr;
const Tensor* ln_bias = nullptr;
Tensor* ln_mean = nullptr;
Tensor* ln_variance = nullptr;
Tensor* ln1_out = nullptr;
if (pre_layer_norm) {
ln_scale = context.Input<Tensor>("Ln1Scale");
ln_bias = context.Input<Tensor>("Ln1Bias");
ln_mean = context.Output<Tensor>("Ln1Mean");
ln_variance = context.Output<Tensor>("Ln1Variance");
ln1_out = context.Output<Tensor>("Ln1Out");
ln1_out->mutable_data<T>(place);
} else {
ln_scale = context.Input<Tensor>("Ln2Scale");
ln_bias = context.Input<Tensor>("Ln2Bias");
ln_mean = context.Output<Tensor>("Ln2Mean");
ln_variance = context.Output<Tensor>("Ln2Variance");
}
auto* out = context.Output<Tensor>("Out");
auto* dropout1_mask = context.Output<Tensor>("Dropout1Mask");
auto* dropout2_mask = context.Output<Tensor>("Dropout2Mask");
auto* linear1_out = context.Output<Tensor>("Linear1Out");
auto* dropout1_out = context.Output<Tensor>("Dropout1Out");
auto* dropout2_out = context.Output<Tensor>("Dropout2Out");
const std::string act_method = context.Attr<std::string>("act_method");
const int ring_id = context.Attr<int>("ring_id");
const float epsilon1 = context.Attr<float>("ln1_epsilon");
const float epsilon2 = context.Attr<float>("ln2_epsilon");
XPUDropoutParam dropout_param1;
dropout_param1.initXPUDropoutParam(context, 1);
XPUDropoutParam dropout_param2;
dropout_param2.initXPUDropoutParam(context, 2);
ln_mean->mutable_data<float>(place);
ln_variance->mutable_data<float>(place);
out->mutable_data<T>(place);
dropout1_mask->mutable_data<T>(place);
dropout2_mask->mutable_data<T>(place);
dropout1_out->mutable_data<T>(place);
dropout2_out->mutable_data<T>(place);
linear1_out->mutable_data<T>(place);
auto x_dim = x->dims();
auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(
RowMatrixFromVector(x_dim), 0, false);
auto dim = linear1_weight->dims();
int d_model = dim[0];
int dim_feedforward = dim[dim.size() - 1];
int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_;
auto& dev_ctx = context.template device_context<phi::XPUContext>();
FFN(dev_ctx,
x,
linear1_weight,
linear1_bias,
linear2_weight,
linear2_bias,
ln_scale,
ln_bias,
out,
dropout1_mask,
dropout2_mask,
ln_mean,
ln_variance,
linear1_out,
ln1_out,
dropout1_out,
dropout2_out,
bsz_seq,
d_model,
dim_feedforward,
act_method,
pre_layer_norm,
epsilon1,
epsilon2,
dropout_param1,
dropout_param2,
ring_id);
}
};
template <typename DeviceContext, typename T>
class FusedFeedForwardGradXPUKernel : public framework::OpKernel<T> {
using XPUTypeT = typename XPUTypeTrait<T>::Type;
public:
void FFNGrad(const phi::XPUContext& dev_ctx,
const Tensor* d_out,
const Tensor* x,
const Tensor* dropout1_mask,
const Tensor* dropout2_mask,
const Tensor* linear1_out,
const Tensor* ln1_out,
const Tensor* dropout1_out,
const Tensor* dropout2_out,
const Tensor* linear1_weight,
const Tensor* linear2_weight,
const Tensor* ln_scale,
const Tensor* ln_mean,
const Tensor* ln_variance,
Tensor* d_x,
Tensor* d_linear1_weight,
Tensor* d_linear1_bias,
Tensor* d_linear2_weight,
Tensor* d_linear2_bias,
Tensor* d_ln_scale,
Tensor* d_ln_bias,
const int bsz_seq,
const int d_model,
const int dim_feedforward,
const XPUDropoutParam& dropout_param1,
const XPUDropoutParam& dropout_param2,
const std::string& act_method,
const bool pre_layer_norm,
const float epsilon,
const int ring_id) const {
xpu::Context* xpu_ctx = dev_ctx.x_context();
xpu::ctx_guard RAII_GUARD(xpu_ctx);
int r = xpu::SUCCESS;
// inputs ptr
const XPUTypeT* d_out_ptr =
reinterpret_cast<const XPUTypeT*>(d_out->data<T>());
const XPUTypeT* x_ptr = reinterpret_cast<const XPUTypeT*>(x->data<T>());
const XPUTypeT* dropout1_mask_ptr =
reinterpret_cast<const XPUTypeT*>(dropout1_mask->data<T>());
const XPUTypeT* dropout2_mask_ptr =
reinterpret_cast<const XPUTypeT*>(dropout2_mask->data<T>());
const XPUTypeT* linear1_out_ptr =
reinterpret_cast<const XPUTypeT*>(linear1_out->data<T>());
const XPUTypeT* dropout1_out_ptr =
reinterpret_cast<const XPUTypeT*>(dropout1_out->data<T>());
const XPUTypeT* linear1_weight_ptr =
reinterpret_cast<const XPUTypeT*>(linear1_weight->data<T>());
const XPUTypeT* linear2_weight_ptr =
reinterpret_cast<const XPUTypeT*>(linear2_weight->data<T>());
const float* ln_scale_ptr = ln_scale->data<float>();
const float* ln_mean_ptr = ln_mean->data<float>();
const float* ln_variance_ptr = ln_variance->data<float>();
// outputs ptr
XPUTypeT* d_x_ptr = reinterpret_cast<XPUTypeT*>(d_x->data<T>());
XPUTypeT* d_linear1_weight_ptr =
reinterpret_cast<XPUTypeT*>(d_linear1_weight->data<T>());
XPUTypeT* d_linear1_bias_ptr =
reinterpret_cast<XPUTypeT*>(d_linear1_bias->data<T>());
XPUTypeT* d_linear2_weight_ptr =
reinterpret_cast<XPUTypeT*>(d_linear2_weight->data<T>());
XPUTypeT* d_linear2_bias_ptr =
reinterpret_cast<XPUTypeT*>(d_linear2_bias->data<T>());
float* d_ln_scale_ptr = d_ln_scale->data<float>();
float* d_ln_bias_ptr = d_ln_bias->data<float>();
size_t l3_total_size = xpu_ctx->_l3_mgr.get_size();
XPUTypeT* big_tmp_l3_ptr = NULL; // dim_feedforward * bsz_seq
XPUTypeT* small_tmp_l3_ptr = NULL; // d_model * bsz_seq
XPUTypeT* big_tmp_gm_ptr = NULL; // dim_feedforward * bsz_seq
XPUTypeT* small_tmp_gm_ptr = NULL; // d_model * bsz_seq
XPUTypeT* d_layernorm_out_ptr = NULL; // dx9
XPUTypeT* d_dropout2_out_ptr = NULL; // dx7
XPUTypeT* d_linear2_out_ptr = NULL; // dx5
XPUTypeT* d_dropout1_out_ptr = NULL; // dx4
XPUTypeT* d_act_out_ptr = NULL; // dx3
XPUTypeT* d_linear1_out_ptr = NULL; // dx1
const XPUTypeT* d_residual_ptr = d_out_ptr;
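// Gradient scratch plan: the "big" temporaries are [bsz_seq, dim_feedforward]
// and the "small" ones [bsz_seq, d_model]; whatever fits in the available L3 is
// placed there and the rest falls back to GM, with buffers reused across the
// dx* temporaries.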
if (l3_total_size >= (dim_feedforward * bsz_seq * sizeof(T) +
d_model * bsz_seq * sizeof(T))) {
big_tmp_l3_ptr = RAII_GUARD.alloc_l3<XPUTypeT>(dim_feedforward * bsz_seq);
PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_l3_ptr);
small_tmp_l3_ptr = RAII_GUARD.alloc_l3<XPUTypeT>(d_model * bsz_seq);
PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_l3_ptr);
d_layernorm_out_ptr = small_tmp_l3_ptr;
d_dropout2_out_ptr = small_tmp_l3_ptr;
d_linear2_out_ptr = big_tmp_l3_ptr;
d_dropout1_out_ptr = big_tmp_l3_ptr;
d_act_out_ptr = big_tmp_l3_ptr;
d_linear1_out_ptr = small_tmp_l3_ptr;
} else if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T)) {
big_tmp_l3_ptr = RAII_GUARD.alloc_l3<XPUTypeT>(dim_feedforward * bsz_seq);
PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_l3_ptr);
small_tmp_l3_ptr = big_tmp_l3_ptr;
big_tmp_gm_ptr = RAII_GUARD.alloc<XPUTypeT>(dim_feedforward * bsz_seq);
PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr);
small_tmp_gm_ptr = RAII_GUARD.alloc<XPUTypeT>(d_model * bsz_seq);
PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_gm_ptr);
d_layernorm_out_ptr = small_tmp_l3_ptr;
d_dropout2_out_ptr = small_tmp_gm_ptr;
d_linear2_out_ptr = big_tmp_l3_ptr;
d_dropout1_out_ptr = big_tmp_l3_ptr;
d_act_out_ptr = big_tmp_gm_ptr;
d_linear1_out_ptr = small_tmp_l3_ptr;
} else if (l3_total_size >= d_model * bsz_seq * sizeof(T)) {
big_tmp_gm_ptr = RAII_GUARD.alloc<XPUTypeT>(dim_feedforward * bsz_seq);
PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr);
small_tmp_l3_ptr = RAII_GUARD.alloc_l3<XPUTypeT>(d_model * bsz_seq);
PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_l3_ptr);
d_layernorm_out_ptr = small_tmp_l3_ptr;
d_dropout2_out_ptr = small_tmp_l3_ptr;
d_linear2_out_ptr = big_tmp_gm_ptr;
d_dropout1_out_ptr = big_tmp_gm_ptr;
d_act_out_ptr = big_tmp_gm_ptr;
d_linear1_out_ptr = small_tmp_l3_ptr;
} else {
big_tmp_gm_ptr = RAII_GUARD.alloc<XPUTypeT>(dim_feedforward * bsz_seq);
PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr);
small_tmp_gm_ptr = RAII_GUARD.alloc<XPUTypeT>(d_model * bsz_seq);
PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_gm_ptr);
d_layernorm_out_ptr = small_tmp_gm_ptr;
d_dropout2_out_ptr = small_tmp_gm_ptr;
d_linear2_out_ptr = big_tmp_gm_ptr;
d_dropout1_out_ptr = big_tmp_gm_ptr;
d_act_out_ptr = big_tmp_gm_ptr;
d_linear1_out_ptr = small_tmp_gm_ptr;
}
if (pre_layer_norm == false) {
const XPUTypeT* dropout2_out_ptr =
reinterpret_cast<const XPUTypeT*>(dropout2_out->data<T>());
r = xpu::layer_norm_grad(xpu_ctx,
dropout2_out_ptr,
d_out_ptr,
d_layernorm_out_ptr,
bsz_seq,
d_model,
epsilon,
ln_scale_ptr,
ln_mean_ptr,
ln_variance_ptr,
d_ln_scale_ptr,
d_ln_bias_ptr);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad");
d_residual_ptr = d_layernorm_out_ptr;
}
DropoutGrad(xpu_ctx,
d_residual_ptr,
dropout2_mask_ptr,
d_dropout2_out_ptr,
dropout_param2,
bsz_seq * d_model);
// linear_grad2
r = xpu::reduce_sum(xpu_ctx,
d_dropout2_out_ptr,
d_linear2_bias_ptr,
{bsz_seq, d_model},
{0});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
phi::XpuFcInfo linear2_fc_info;
linear2_fc_info.InitFcInfo(0,
bsz_seq,
d_model,
dim_feedforward,
false,
false,
nullptr,
nullptr,
nullptr);
const XPUTypeT* a_1 = reinterpret_cast<const XPUTypeT*>(NULL);
const XPUTypeT* b_1 = reinterpret_cast<const XPUTypeT*>(NULL);
const XPUTypeT* a_2 = reinterpret_cast<const XPUTypeT*>(NULL);
const XPUTypeT* b_2 = reinterpret_cast<const XPUTypeT*>(NULL);
XPUTypeT* c_1 = d_linear2_out_ptr;
XPUTypeT* c_2 = d_linear2_weight_ptr;
phi::XpuFcInfo info_d_dropout1;
phi::XpuFcInfo info_dw2;
std::tuple<phi::XpuFcInfo,
phi::XpuFcInfo,
const XPUTypeT*,
const XPUTypeT*,
const XPUTypeT*,
const XPUTypeT*>
fc_info = phi::MatmulGradFcInfo(xpu_ctx,
&RAII_GUARD,
linear2_fc_info,
false,
false,
dropout1_out_ptr,
linear2_weight_ptr,
d_dropout2_out_ptr);
std::tie(info_d_dropout1, info_dw2, a_1, b_1, a_2, b_2) = fc_info;
// if l3_total_size >= dim_feedforward * bsz_seq * sizeof(T), transpose first
if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T) &&
info_dw2.trans_x) {
r = xpu::transpose<XPUTypeT>(xpu_ctx,
dropout1_out_ptr,
big_tmp_l3_ptr,
{bsz_seq, dim_feedforward},
{1, 0});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
a_2 = big_tmp_l3_ptr;
info_dw2.trans_x = !info_dw2.trans_x;
info_dw2.stride_x = info_dw2.k;
}
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, a_1, b_1, c_1, info_d_dropout1, 1.0f, true);
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, a_2, b_2, c_2, info_dw2, 1.0f, true);
// dropout_grad1
DropoutGrad(xpu_ctx,
d_linear2_out_ptr,
dropout1_mask_ptr,
d_dropout1_out_ptr,
dropout_param1,
bsz_seq * dim_feedforward);
// act_grad
if (act_method == "gelu") {
r = xpu::gelu_grad(xpu_ctx,
linear1_out_ptr,
linear1_out_ptr,
d_dropout1_out_ptr,
d_act_out_ptr,
bsz_seq * dim_feedforward);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu_grad");
} else if (act_method == "relu") {
r = xpu::relu_grad(xpu_ctx,
linear1_out_ptr,
linear1_out_ptr,
d_dropout1_out_ptr,
d_act_out_ptr,
bsz_seq * dim_feedforward);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu_grad");
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Currently only supports gelu or relu activation functions!"));
}
// linear1_grad
r = xpu::reduce_sum(xpu_ctx,
d_act_out_ptr,
d_linear1_bias_ptr,
{bsz_seq, dim_feedforward},
{0});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
phi::XpuFcInfo linear1_fc_info;
linear1_fc_info.InitFcInfo(0,
bsz_seq,
dim_feedforward,
d_model,
false,
false,
nullptr,
nullptr,
nullptr);
a_1 = reinterpret_cast<const XPUTypeT*>(NULL);
b_1 = reinterpret_cast<const XPUTypeT*>(NULL);
a_2 = reinterpret_cast<const XPUTypeT*>(NULL);
b_2 = reinterpret_cast<const XPUTypeT*>(NULL);
c_1 = (pre_layer_norm == true ? d_linear1_out_ptr : d_x_ptr);
c_2 = d_linear1_weight_ptr;
phi::XpuFcInfo info_dx;
phi::XpuFcInfo info_dw1;
const XPUTypeT* linear1_x_ptr =
(pre_layer_norm == true
? reinterpret_cast<const XPUTypeT*>(ln1_out->data<T>())
: x_ptr);
if (l3_total_size >= d_model * bsz_seq * sizeof(T) && info_dw1.trans_x) {
r = xpu::transpose<XPUTypeT>(
xpu_ctx, linear1_x_ptr, small_tmp_l3_ptr, {bsz_seq, d_model}, {1, 0});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
a_2 = small_tmp_l3_ptr;
info_dw1.trans_x = !info_dw1.trans_x;
info_dw1.stride_x = info_dw1.k;
}
fc_info = phi::MatmulGradFcInfo(xpu_ctx,
&RAII_GUARD,
linear1_fc_info,
false,
false,
linear1_x_ptr,
linear1_weight_ptr,
d_act_out_ptr);
std::tie(info_dx, info_dw1, a_1, b_1, a_2, b_2) = fc_info;
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f, true);
phi::MatMulXPUFunction<XPUTypeT>(
xpu_ctx, a_2, b_2, c_2, info_dw1, 1.0f, true);
if (pre_layer_norm) {
r = xpu::layer_norm_grad(xpu_ctx,
x_ptr,
c_1,
c_1,
bsz_seq,
d_model,
epsilon,
ln_scale_ptr,
ln_mean_ptr,
ln_variance_ptr,
d_ln_scale_ptr,
d_ln_bias_ptr);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad");
}
r = xpu::add(xpu_ctx, c_1, d_residual_ptr, d_x_ptr, d_model * bsz_seq);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "add");
}
void Compute(const framework::ExecutionContext& context) const override {
auto place = context.GetPlace();
const bool pre_layer_norm = context.Attr<bool>("pre_layer_norm");
// inputs
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
auto* x = context.Input<Tensor>("X");
auto* dropout1_mask = context.Input<Tensor>("Dropout1Mask");
auto* dropout2_mask = context.Input<Tensor>("Dropout2Mask");
auto* linear1_out = context.Input<Tensor>("Linear1Out");
auto* ln1_out = pre_layer_norm ? context.Input<Tensor>("Ln1Out") : nullptr;
auto* dropout1_out = context.Input<Tensor>("Dropout1Out");
auto* dropout2_out = context.Input<Tensor>("Dropout2Out");
auto* linear1_weight = context.Input<Tensor>("Linear1Weight");
auto* linear2_weight = context.Input<Tensor>("Linear2Weight");
const Tensor* ln_mean = nullptr;
const Tensor* ln_variance = nullptr;
const Tensor* ln_scale = nullptr;
if (pre_layer_norm) {
ln_mean = context.Input<Tensor>("Ln1Mean");
ln_variance = context.Input<Tensor>("Ln1Variance");
ln_scale = context.Input<Tensor>("Ln1Scale");
} else {
ln_mean = context.Input<Tensor>("Ln2Mean");
ln_variance = context.Input<Tensor>("Ln2Variance");
ln_scale = context.Input<Tensor>("Ln2Scale");
}
// output
auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
Tensor* d_ln_scale = nullptr;
Tensor* d_ln_bias = nullptr;
if (pre_layer_norm) {
d_ln_scale = context.Output<Tensor>(framework::GradVarName("Ln1Scale"));
d_ln_bias = context.Output<Tensor>(framework::GradVarName("Ln1Bias"));
} else {
d_ln_scale = context.Output<Tensor>(framework::GradVarName("Ln2Scale"));
d_ln_bias = context.Output<Tensor>(framework::GradVarName("Ln2Bias"));
}
auto* d_linear1_weight =
context.Output<Tensor>(framework::GradVarName("Linear1Weight"));
auto* d_linear1_bias =
context.Output<Tensor>(framework::GradVarName("Linear1Bias"));
auto* d_linear2_weight =
context.Output<Tensor>(framework::GradVarName("Linear2Weight"));
auto* d_linear2_bias =
context.Output<Tensor>(framework::GradVarName("Linear2Bias"));
float epsilon = 0.0f;
if (pre_layer_norm) {
epsilon = context.Attr<float>("ln1_epsilon");
} else {
epsilon = context.Attr<float>("ln2_epsilon");
}
const std::string act_method = context.Attr<std::string>("act_method");
XPUDropoutParam dropout_param1(context, 1);
XPUDropoutParam dropout_param2(context, 2);
const int ring_id = context.Attr<int>("ring_id");
d_x->mutable_data<T>(place);
d_ln_scale->mutable_data<float>(place);
d_ln_bias->mutable_data<float>(place);
d_linear1_bias->mutable_data<T>(place);
d_linear2_bias->mutable_data<T>(place);
d_linear1_weight->mutable_data<T>(place);
d_linear2_weight->mutable_data<T>(place);
auto x_dim = x->dims();
auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(
RowMatrixFromVector(x_dim), 0, false);
auto linear1_weight_dim = linear1_weight->dims();
int d_model = linear1_weight_dim[0];
int dim_feedforward = linear1_weight_dim[linear1_weight_dim.size() - 1];
int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_;
auto& dev_ctx = context.template device_context<phi::XPUContext>();
FFNGrad(dev_ctx,
d_out,
x,
dropout1_mask,
dropout2_mask,
linear1_out,
ln1_out,
dropout1_out,
dropout2_out,
linear1_weight,
linear2_weight,
ln_scale,
ln_mean,
ln_variance,
d_x,
d_linear1_weight,
d_linear1_bias,
d_linear2_weight,
d_linear2_bias,
d_ln_scale,
d_ln_bias,
bsz_seq,
d_model,
dim_feedforward,
dropout_param1,
dropout_param2,
act_method,
pre_layer_norm,
epsilon,
ring_id);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
fused_feedforward,
ops::FusedFeedForwardXPUKernel<phi::XPUContext, float>,
ops::FusedFeedForwardXPUKernel<phi::XPUContext, paddle::platform::float16>);
REGISTER_OP_XPU_KERNEL(
fused_feedforward_grad,
ops::FusedFeedForwardGradXPUKernel<phi::XPUContext, float>,
ops::FusedFeedForwardGradXPUKernel<phi::XPUContext,
paddle::platform::float16>);
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle {
namespace operators {
using Tensor = phi::DenseTensor;
struct XPUDropoutParam {
float dropout_prob;
bool is_upscale_in_train;
bool is_test;
bool fix_seed;
const Tensor *tensor_seed;
int seed_val;
XPUDropoutParam() {
fix_seed = false;
is_test = false;
is_upscale_in_train = false;
dropout_prob = 0.5;
tensor_seed = nullptr;
seed_val = 0;
}
XPUDropoutParam(const framework::ExecutionContext &context,
const int dropout_index) {
std::string pre_fix = "dropout";
std::string str_index = std::to_string(dropout_index);
if (dropout_index > 0) {
pre_fix = pre_fix + str_index + "_";
} else {
pre_fix = pre_fix + "_";
}
dropout_prob = context.Attr<float>(pre_fix + "rate");
auto &dropout_implementation =
context.Attr<std::string>(pre_fix + "implementation");
is_upscale_in_train = (dropout_implementation == "upscale_in_train");
is_test = context.Attr<bool>("is_test");
fix_seed = context.Attr<bool>(pre_fix + "fix_seed");
std::string str_seed = "Dropout";
if (dropout_index > 0) {
str_seed = str_seed + str_index + "Seed";
} else {
str_seed = str_seed + "Seed";
}
tensor_seed =
context.HasInput(str_seed) ? context.Input<Tensor>(str_seed) : nullptr;
if (tensor_seed) {
seed_val = *(tensor_seed->data<int>());
} else {
seed_val = fix_seed ? context.Attr<int>(pre_fix + "seed") : 0;
}
}
void initXPUDropoutParam(float dropout_prob_,
bool is_upscale_in_train_,
bool is_test_,
bool fix_seed_,
const Tensor *tensor_seed,
int seed_val_) {
dropout_prob = dropout_prob_;
is_upscale_in_train = is_upscale_in_train_;
is_test = is_test_;
fix_seed = fix_seed_;
if (tensor_seed) {
seed_val = *(tensor_seed->data<int>());
} else {
seed_val = fix_seed ? seed_val_ : 0;
}
}
void initXPUDropoutParam(const framework::ExecutionContext &context,
int dropout_index) {
std::string pre_fix = "dropout";
std::string str_index = std::to_string(dropout_index);
if (dropout_index > 0) {
pre_fix = pre_fix + str_index + "_";
} else {
pre_fix = pre_fix + "_";
}
dropout_prob = context.Attr<float>(pre_fix + "rate");
auto &dropout_implementation =
context.Attr<std::string>(pre_fix + "implementation");
is_upscale_in_train = (dropout_implementation == "upscale_in_train");
is_test = context.Attr<bool>("is_test");
fix_seed = context.Attr<bool>(pre_fix + "fix_seed");
std::string str_seed = "Dropout";
if (dropout_index > 0) {
str_seed = str_seed + str_index + "Seed";
} else {
str_seed = str_seed + "Seed";
}
tensor_seed =
context.HasInput(str_seed) ? context.Input<Tensor>(str_seed) : nullptr;
if (tensor_seed) {
seed_val = *(tensor_seed->data<int>());
} else {
seed_val = fix_seed ? context.Attr<int>(pre_fix + "seed") : 0;
}
}
};
/******************
* check whether an address is in L3
*******************/
static bool is_in_l3(const void *addr) {
int64_t addr_int = (int64_t)addr;
int addr_int_high = addr_int >> 32;
return (addr_int_high == 0);
}
/*************************
* dropout
*************************/
template <typename T>
void Dropout(xpu::Context *xpu_ctx,
const T *x,
T *mask,
T *y,
const XPUDropoutParam &param,
int len) {
using XPUType = typename XPUTypeTrait<T>::Type;
int r = XPU_SUCCESS;
if (param.dropout_prob == 0.0f) {
r = xpu::copy(xpu_ctx,
reinterpret_cast<const XPUType *>(x),
reinterpret_cast<XPUType *>(y),
len);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
return;
}
if (!param.is_test) {
if (param.dropout_prob == 1.0f) {
r = xpu::constant(
xpu_ctx, reinterpret_cast<XPUType *>(y), len, XPUType(0));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
r = xpu::constant(
xpu_ctx, reinterpret_cast<XPUType *>(mask), len, XPUType(0));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
} else {
r = xpu::dropout(xpu_ctx,
reinterpret_cast<const XPUType *>(x),
reinterpret_cast<XPUType *>(y),
reinterpret_cast<XPUType *>(mask),
param.seed_val,
len,
param.is_upscale_in_train,
param.dropout_prob);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "dropout");
}
} else {
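// Inference path: with upscale_in_train the activations pass through unscaled;
// otherwise (downscale_in_infer) they are multiplied by (1 - dropout_prob).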
float scale = (param.is_upscale_in_train)
? (1.0)
: (static_cast<float>(1.0f - param.dropout_prob));
r = xpu::scale(xpu_ctx,
reinterpret_cast<const XPUType *>(x),
reinterpret_cast<XPUType *>(y),
len,
false,
scale,
0.0f);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
}
}
template <typename T>
void DropoutGrad(xpu::Context *xpu_ctx,
const T *dy,
const T *mask,
T *dx,
const XPUDropoutParam &param,
int len) {
using XPUType = typename XPUTypeTrait<T>::Type;
if (param.dropout_prob == 0.0f) {
int r = xpu::copy(xpu_ctx,
reinterpret_cast<const XPUType *>(dy),
reinterpret_cast<XPUType *>(dx),
len);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
return;
}
if (!param.is_upscale_in_train) {
int r = xpu::mul(xpu_ctx,
reinterpret_cast<const XPUType *>(dy),
reinterpret_cast<const XPUType *>(mask),
reinterpret_cast<XPUType *>(dx),
len);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul");
} else {
int r = xpu::dropout_grad(xpu_ctx,
reinterpret_cast<const XPUType *>(mask),
reinterpret_cast<const XPUType *>(dy),
reinterpret_cast<XPUType *>(dx),
param.dropout_prob,
len);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "dropout_grad");
}
}
} // namespace operators
} // namespace paddle
#endif
@@ -704,6 +704,18 @@ XPUOpMap& get_kl2_ops() {
{"fused_gemm_epilogue_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"fused_attention",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"fused_attention_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"fused_feedforward",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"fused_feedforward_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
};
return s_xpu2_kernels;
......
@@ -382,7 +382,8 @@ static void MatMulXPUFunction(xpu::Context* xpu_ctx,
const T* y,
T* out,
const XpuFcInfo& fcinfo,
float alpha) {
float alpha,
bool is_grad = false) {
using XPUType = typename XPUTypeTrait<T>::Type;
int fccal_type = FCCalcType<XPUType>();
@@ -398,6 +399,12 @@ static void MatMulXPUFunction(xpu::Context* xpu_ctx,
};
auto fc_api = fc_api_list[fccal_type];
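// Optional override: when the XPU_PADDLE_FC_GRAD_LOCAL environment variable is
// set, gradient GEMMs (is_grad == true) always use fc_api_list[2] instead of
// the entry selected from the calc type.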
if (std::getenv("XPU_PADDLE_FC_GRAD_LOCAL") != nullptr) {
if (is_grad) {
fc_api = fc_api_list[2];
}
}
auto fc_batch_api = fc_batch_api_list[fccal_type];
int m = fcinfo.m;
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import sys
sys.path.append("..")
import paddle
import paddle.nn.functional as F
import paddle.incubate.nn.functional as incubate_f
from paddle.nn.layer.norm import LayerNorm
from paddle.nn.layer.common import Linear, Dropout
from paddle.nn.layer.transformer import _convert_attention_mask
from paddle import tensor
from paddle.fluid import layers
import unittest
from op_test_xpu import XPUOpTest
from paddle.fluid.framework import default_main_program
from xpu.get_test_cover_info import (
create_test_class,
get_xpu_op_support_types,
XPUOpTestWrapper,
)
default_main_program().random_seed = 42
class XPUTestFusedAttentionOp(XPUOpTestWrapper):
def __init__(self):
self.op_name = 'fused_attention'
self.use_dynamic_create_class = False
class TestFusedAttentionOp(XPUOpTest):
def setUp(self):
self.config()
self.generate_input_data()
self.rtol = 1e-5
self.atol = 1e-3
if self.x_type == np.float16 or str(self.x_type) == "float16":
self.atol = 1e-1
paddle.set_default_dtype(self.x_type)
self.__class__.op_type = "fused_attention"
# use autograd to check grad in this unittest.
self.__class__.no_need_check_grad = True
self.q_proj = Linear(
self.embed_dim,
self.embed_dim,
self.weight_attr,
bias_attr=self.bias_attr,
)
self.k_proj = Linear(
self.kdim,
self.embed_dim,
self.weight_attr,
bias_attr=self.bias_attr,
)
self.v_proj = Linear(
self.vdim,
self.embed_dim,
self.weight_attr,
bias_attr=self.bias_attr,
)
self.out_proj = Linear(
self.embed_dim,
self.embed_dim,
self.weight_attr,
bias_attr=self.bias_attr,
)
paddle.set_default_dtype(np.float32)
self.norm1 = LayerNorm(self.embed_dim)
self.norm2 = LayerNorm(self.embed_dim)
paddle.set_default_dtype(self.x_type)
self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train")
def config(self):
self.x_type = self.in_type
self.attn_mask_type = np.float32
self.pre_layer_norm = True
self.has_attn_mask = False
self.training = True
self.batch_size = 8
self.query_length = 128
self.cache_length = 128
self.head_dim = 64
self.num_heads = 16
self.embed_dim = self.head_dim * self.num_heads
self.dropout_prob = 0.0
self.attn_dropout_prob = 0.0
self.weight_attr = None
self.bias_attr = None
self.kdim, self.vdim = self.embed_dim, self.embed_dim
self.key_length, self.value_length = (
self.query_length,
self.query_length,
)
def generate_input_data(self):
self.query = np.random.rand(
self.batch_size, self.query_length, self.embed_dim
).astype(self.x_type)
out_seq_len = self.key_length
if self.has_attn_mask:
# [B, n_head, seq_len, out_seq_len]
self.attn_mask = np.ones(
(
self.batch_size,
self.num_heads,
self.query_length,
out_seq_len,
),
dtype=self.attn_mask_type,
)
else:
self.attn_mask = None
self.key, self.value = self.query, self.query
self.dout = np.random.random(
(self.batch_size, self.query_length, self.embed_dim)
).astype(self.x_type)
def GetBaselineOut(self):
paddle.disable_static()
tensor_query = paddle.to_tensor(self.query, stop_gradient=False)
if self.has_attn_mask:
attn_mask = paddle.to_tensor(
self.attn_mask, stop_gradient=False
)
else:
attn_mask = None
residual = tensor_query
ln1_out = tensor_query
if self.pre_layer_norm:
ln1_out = self.norm1(tensor_query)
q = self.q_proj(ln1_out)
q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
q_out = tensor.transpose(x=q, perm=[0, 2, 1, 3])
k = self.k_proj(ln1_out)
v = self.v_proj(ln1_out)
k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
k_out = tensor.transpose(x=k, perm=[0, 2, 1, 3])
v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
v_out = tensor.transpose(x=v, perm=[0, 2, 1, 3])
# [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
# --> [B, n_head, seq_len, out_seq_len]
qk_out = layers.matmul(
x=q_out * self.head_dim**-0.5, y=k_out, transpose_y=True
)
if attn_mask is not None:
attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype)
attn_mask_out = qk_out + attn_mask
softmax_out = F.softmax(attn_mask_out)
else:
softmax_out = F.softmax(qk_out)
if self.dropout_prob:
dropout_out = F.dropout(
softmax_out,
self.dropout_prob,
training=self.training,
mode="upscale_in_train",
)
# [B, n_head, seq_len, out_seq_len] * [B, n_head, out_seq_len, head_dim]
# --> [B, n_head, seq_len, head_dim]
qktv_out = tensor.matmul(dropout_out, v_out)
else:
qktv_out = tensor.matmul(softmax_out, v_out)
fmha_out = tensor.transpose(qktv_out, perm=[0, 2, 1, 3])
out_linear_in = tensor.reshape(
x=fmha_out, shape=[0, 0, fmha_out.shape[2] * fmha_out.shape[3]]
)
out = self.out_proj(out_linear_in)
residual_out = residual + self.dropout(out)
if not self.pre_layer_norm:
final_out = self.norm1(residual_out)
else:
final_out = residual_out
paddle.autograd.backward(
[final_out], [paddle.to_tensor(self.dout)], retain_graph=True
)
return final_out, tensor_query.grad
def GetFusedAttentionOut(self):
paddle.disable_static()
q_proj_weight = paddle.to_tensor(
self.q_proj.weight, stop_gradient=False
)
k_proj_weight = paddle.to_tensor(
self.k_proj.weight, stop_gradient=False
)
v_proj_weight = paddle.to_tensor(
self.v_proj.weight, stop_gradient=False
)
out_linear_weight = paddle.to_tensor(
self.out_proj.weight, stop_gradient=False
)
if self.bias_attr is False:
qkv_bias_tensor = None
out_linear_bias = None
else:
q_proj_bias = paddle.to_tensor(
self.q_proj.bias, stop_gradient=False
)
k_proj_bias = paddle.to_tensor(
self.k_proj.bias, stop_gradient=False
)
v_proj_bias = paddle.to_tensor(
self.v_proj.bias, stop_gradient=False
)
qkv_bias = np.concatenate(
(
q_proj_bias.numpy(),
k_proj_bias.numpy(),
v_proj_bias.numpy(),
)
)
qkv_bias = qkv_bias.reshape((3, self.num_heads, self.head_dim))
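                # The reshaped bias matches the [3, num_heads, head_dim] layout of the
                # fused op's QKVBias input.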
qkv_bias_tensor = paddle.to_tensor(
qkv_bias, stop_gradient=False
)
out_linear_bias = paddle.to_tensor(
self.out_proj.bias, stop_gradient=False
)
ln1_scale = paddle.to_tensor(self.norm1.weight, stop_gradient=False)
ln1_bias = paddle.to_tensor(self.norm1.bias, stop_gradient=False)
ln2_scale = paddle.to_tensor(self.norm2.weight, stop_gradient=False)
ln2_bias = paddle.to_tensor(self.norm2.bias, stop_gradient=False)
q_proj_weight = q_proj_weight.numpy().transpose((1, 0))
k_proj_weight = k_proj_weight.numpy().transpose((1, 0))
v_proj_weight = v_proj_weight.numpy().transpose((1, 0))
qkv_weight = np.concatenate(
(q_proj_weight, k_proj_weight, v_proj_weight)
)
qkv_weight = qkv_weight.reshape(
(3, self.num_heads, self.head_dim, self.embed_dim)
)
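            # The fused op expects QKVW as [3, num_heads, head_dim, embed_dim], so each
            # [in, out] projection weight is transposed to [out, in], the three are
            # concatenated, and the result is reshaped to that layout.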
x = paddle.to_tensor(self.query, stop_gradient=False)
cache_kv = None
if self.has_attn_mask:
attn_mask = paddle.to_tensor(
self.attn_mask, stop_gradient=False
)
else:
attn_mask = None
qkv_weight_tensor = paddle.to_tensor(
qkv_weight, stop_gradient=False
)
epsilon = 1e-05
ln2_epsilon = 1e-05
if attn_mask is not None:
attn_mask = _convert_attention_mask(attn_mask, x.dtype)
final_out = incubate_f.fused_multi_head_attention(
x,
qkv_weight_tensor,
out_linear_weight,
self.pre_layer_norm,
ln1_scale,
ln1_bias,
ln2_scale,
ln2_bias,
epsilon,
qkv_bias_tensor,
out_linear_bias,
cache_kv,
attn_mask,
self.dropout_prob,
self.attn_dropout_prob,
ln2_epsilon,
)
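            # The single fused call above covers layer_norm, the QKV projection with
            # bias, scaled dot-product attention with optional mask and attention
            # dropout, the output linear layer, dropout and the residual add -- the
            # same pipeline GetBaselineOut composes from separate layers.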
paddle.autograd.backward(
[final_out], [paddle.to_tensor(self.dout)], retain_graph=True
)
return final_out, x.grad
def test_fused_attention_op(self):
final_out_ref, x_grad_ref = self.GetBaselineOut()
final_out, x_grad = self.GetFusedAttentionOut()
np.testing.assert_allclose(
final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol
)
np.testing.assert_allclose(
x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol
)
class TestFusedAttentionOpPreLn(TestFusedAttentionOp):
def config(self):
super().config()
self.pre_layer_norm = True
class TestFusedAttentionOpNoneAttnMask(TestFusedAttentionOp):
def config(self):
super().config()
self.pre_layer_norm = True
self.has_attn_mask = False
support_types = get_xpu_op_support_types('fused_attention')
for stype in support_types:
create_test_class(globals(), XPUTestFusedAttentionOp, stype)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import sys
sys.path.append("..")
import paddle
from paddle.nn.layer import transformer
import paddle.nn.functional as F
import paddle.incubate.nn.functional as incubate_f
from paddle.nn.layer.norm import LayerNorm
from paddle.nn.layer.common import Linear, Dropout
import unittest
from op_test_xpu import XPUOpTest
from paddle.fluid.framework import default_main_program
from xpu.get_test_cover_info import (
create_test_class,
XPUOpTestWrapper,
)
class XPUTestFusedFFNOp(XPUOpTestWrapper):
def __init__(self):
self.op_name = 'fused_feedforward'
self.use_dynamic_create_class = False
class TestFusedFFNOp(XPUOpTest):
def getDtype(self):
self.dtype = self.in_type
self.layer_norm_dtype = "float32"
def getShape(self):
self.batch_size = np.random.randint(1, 32)
self.query_length = np.random.randint(32, 128)
self.d_model = np.random.randint(32, 512)
self.dim_feedforward = np.random.randint(32, 512)
def getDiff(self):
self.rtol = 1e-2
self.atol = 1e-3
if self.dtype == np.float16 or self.dtype == "float16":
self.atol = 1e-1
def getActivation(self):
self.act_method = "gelu"
def getNormalizeBefore(self):
self.pre_layer_norm = False
def setUp(self):
paddle.disable_static()
self.__class__.op_type = "fused_feedforward"
# check grad in test_out_and_grad()
self.__class__.no_need_check_grad = True
self.getDtype()
self.getShape()
self.getDiff()
self.getActivation()
self.getNormalizeBefore()
paddle.set_default_dtype(self.dtype)
self.weight_attr = None
self.bias_attr = None
self.weight_attrs = transformer._convert_param_attr_to_list(
self.weight_attr, 2
)
self.bias_attrs = transformer._convert_param_attr_to_list(
self.bias_attr, 2
)
self.linear1 = Linear(
self.d_model,
self.dim_feedforward,
self.weight_attrs[1],
bias_attr=self.bias_attrs[1],
)
self.linear2 = Linear(
self.dim_feedforward,
self.d_model,
self.weight_attrs[1],
bias_attr=self.bias_attrs[1],
)
paddle.set_default_dtype(self.layer_norm_dtype)
self.norm1 = LayerNorm(self.d_model)
self.norm2 = LayerNorm(self.d_model)
paddle.set_default_dtype(self.dtype)
self.dropout1 = Dropout(0.0, mode="upscale_in_train")
self.dropout2 = Dropout(0.0, mode="upscale_in_train")
self.activation = getattr(F, self.act_method)
self.src = np.random.random(
(self.batch_size, self.query_length, self.d_model)
).astype(self.dtype)
self.dout = np.random.random(
(self.batch_size, self.query_length, self.d_model)
).astype(self.dtype)
def Base(self):
paddle.disable_static()
tensor_src = paddle.to_tensor(self.src, stop_gradient=False)
residual = tensor_src
if self.pre_layer_norm:
ln1_out = self.norm1(tensor_src)
linear2_out = self.linear2(
self.dropout1(self.activation(self.linear1(ln1_out)))
)
dropout2_out = residual + self.dropout2(linear2_out)
paddle.autograd.backward(
[dropout2_out], [paddle.to_tensor(self.dout)], True
)
return dropout2_out, tensor_src.grad
else:
linear2_out = self.linear2(
self.dropout1(self.activation(self.linear1(tensor_src)))
)
dropout2_out = residual + self.dropout2(linear2_out)
dropout2_out = self.norm2(dropout2_out)
paddle.autograd.backward(
[dropout2_out], [paddle.to_tensor(self.dout)], True
)
return dropout2_out, tensor_src.grad
def FusedFFN(self):
paddle.disable_static()
linear1_weight = paddle.to_tensor(
self.linear1.weight, stop_gradient=False
)
linear1_bias = paddle.to_tensor(
self.linear1.bias, stop_gradient=False
)
linear2_weight = paddle.to_tensor(
self.linear2.weight, stop_gradient=False
)
linear2_bias = paddle.to_tensor(
self.linear2.bias, stop_gradient=False
)
ln1_scale = paddle.to_tensor(self.norm1.weight, stop_gradient=False)
ln1_bias = paddle.to_tensor(self.norm1.bias, stop_gradient=False)
ln2_scale = paddle.to_tensor(self.norm2.weight, stop_gradient=False)
ln2_bias = paddle.to_tensor(self.norm2.bias, stop_gradient=False)
x = paddle.to_tensor(self.src, stop_gradient=False)
out = incubate_f.fused_feedforward(
x,
linear1_weight,
linear2_weight,
linear1_bias,
linear2_bias,
ln1_scale,
ln1_bias,
ln2_scale,
ln2_bias,
0.0,
0.0,
activation=self.act_method,
pre_layer_norm=self.pre_layer_norm,
)
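            # fused_feedforward above should reproduce Base():
            # residual + dropout2(linear2(dropout1(act(linear1(...))))), with
            # layer_norm applied before (pre_layer_norm=True) or after the residual add.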
paddle.autograd.backward([out], [paddle.to_tensor(self.dout)])
return out, x.grad
def test_out_and_grad(self):
default_main_program().random_seed = 42
base_out, base_grad = self.Base()
fused_out, fused_grad = self.FusedFFN()
np.testing.assert_allclose(
base_out.numpy(),
fused_out.numpy(),
rtol=self.rtol,
atol=self.atol,
)
np.testing.assert_allclose(
base_grad.numpy(),
fused_grad.numpy(),
rtol=self.rtol,
atol=self.atol,
)
class TestFusedFFNOpActivation(TestFusedFFNOp):
def getActivation(self):
self.act_method = "relu"
class TestFusedFFNOpNormalizeBefore(TestFusedFFNOp):
def getNormalizeBefore(self):
self.pre_layer_norm = True
def getShape(self):
self.batch_size = 1
self.query_length = 1
self.d_model = 8
self.dim_feedforward = 8
class APITestStaticFusedFFN(unittest.TestCase):
def test_static(self):
paddle.enable_static()
default_main_program().random_seed = 42
dtype = "float32"
layer_norm_dtype = "float32"
batch_size = 1
d_model = 8
dim_feedforward = 8
x = paddle.static.data(
name='x', shape=[batch_size, d_model, dim_feedforward], dtype=dtype
)
linear1_weight = paddle.static.data(
name='linear1_weight', shape=[d_model, dim_feedforward], dtype=dtype
)
linear1_bias = paddle.static.data(
name='linear1_bias', shape=[dim_feedforward], dtype=dtype
)
linear2_weight = paddle.static.data(
name='linear2_weight', shape=[dim_feedforward, d_model], dtype=dtype
)
linear2_bias = paddle.static.data(name='linear2_bias', shape=[d_model])
        ln1_scale = paddle.static.data(name='ln1_scale', shape=[d_model])
        ln1_bias = paddle.static.data(name='ln1_bias', shape=[d_model])
        ln2_scale = paddle.static.data(name='ln2_scale', shape=[d_model])
        ln2_bias = paddle.static.data(name='ln2_bias', shape=[d_model])
fused_out = incubate_f.fused_feedforward(
x,
linear1_weight,
linear2_weight,
linear1_bias,
linear2_bias,
ln1_scale,
ln1_bias,
ln2_scale,
ln2_bias,
0.0,
0.0,
activation="relu",
pre_layer_norm=False,
)
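        # Hand-written reference for pre_layer_norm=False: linear1 -> relu -> dropout
        # -> linear2 -> residual add -> layer_norm; its output should match fused_out.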
linear1_out = F.linear(x, linear1_weight, linear1_bias)
act_out = F.relu(linear1_out)
dropout1_out = F.dropout(x=act_out, p=0.0, training=False)
linear2_out = F.linear(dropout1_out, linear2_weight, linear2_bias)
dropout2_out = x + F.dropout(x=linear2_out, p=0.0, training=False)
ln_out = F.layer_norm(
dropout2_out,
normalized_shape=list([d_model]),
weight=ln2_scale,
bias=ln2_bias,
)
exe = paddle.static.Executor(paddle.XPUPlace(0))
x_data = np.random.random(
(batch_size, d_model, dim_feedforward)
).astype(dtype)
linear1_weight_data = np.random.random(
(d_model, dim_feedforward)
).astype(dtype)
linear1_bias_data = np.zeros((dim_feedforward)).astype(dtype)
linear2_weight_data = np.random.random(
(dim_feedforward, d_model)
).astype(dtype)
linear2_bias_data = np.zeros((d_model)).astype(dtype)
ln1_scale_data = np.ones((d_model)).astype(layer_norm_dtype)
ln1_bias_data = np.zeros((d_model)).astype(layer_norm_dtype)
ln2_scale_data = np.ones((d_model)).astype(layer_norm_dtype)
ln2_bias_data = np.zeros((d_model)).astype(layer_norm_dtype)
res_list = [fused_out, ln_out]
real_res = []
for res in res_list:
fetch = exe.run(
feed={
'x': x_data,
'linear1_weight': linear1_weight_data,
'linear1_bias': linear1_bias_data,
'linear2_weight': linear2_weight_data,
'linear2_bias': linear2_bias_data,
'ln1_scale': ln1_scale_data,
'ln1_bias': ln1_bias_data,
'ln2_scale': ln2_scale_data,
'ln2_bias': ln2_bias_data,
},
fetch_list=[res],
)
real_res.append(fetch)
np.testing.assert_allclose(
real_res[0], real_res[1], rtol=1e-05, atol=0.001
)
class TestFusedFFNOpError(unittest.TestCase):
def test_errors(self):
paddle.enable_static()
with paddle.static.program_guard(
paddle.static.Program(), paddle.static.Program()
):
def test_dtype():
x = paddle.static.data(
name='x', shape=[1, 10, 10], dtype="int32"
)
linear1_weight = paddle.static.data(
name='linear1_weight', shape=[1, 10, 10], dtype="float32"
)
linear2_weight = paddle.static.data(
name='linear2_weight', shape=[1, 10, 10], dtype="float32"
)
incubate_f.fused_feedforward(x, linear1_weight, linear2_weight)
self.assertRaises(TypeError, test_dtype)
def test_dropout_rate_type():
x = paddle.static.data(
name='x1', shape=[1, 10, 10], dtype="float32"
)
linear1_weight = paddle.static.data(
name='linear1_weight1', shape=[10, 10], dtype="float32"
)
linear2_weight = paddle.static.data(
name='linear2_weight1', shape=[10, 10], dtype="float32"
)
incubate_f.fused_feedforward(
x, linear1_weight, linear2_weight, dropout1_rate="a"
)
self.assertRaises(TypeError, test_dropout_rate_type)
def test_dropout_rate_value():
x = paddle.static.data(
name='x2', shape=[1, 10, 10], dtype="float32"
)
linear1_weight = paddle.static.data(
name='linear1_weight2', shape=[10, 10], dtype="float32"
)
linear2_weight = paddle.static.data(
name='linear2_weight2', shape=[10, 10], dtype="float32"
)
incubate_f.fused_feedforward(
x, linear1_weight, linear2_weight, dropout2_rate=-1
)
self.assertRaises(ValueError, test_dropout_rate_value)
def test_dropout_mode():
x = paddle.static.data(
name='x3', shape=[1, 10, 10], dtype="float32"
)
linear1_weight = paddle.static.data(
name='linear1_weight3', shape=[10, 10], dtype="float32"
)
linear2_weight = paddle.static.data(
name='linear2_weight3', shape=[10, 10], dtype="float32"
)
incubate_f.fused_feedforward(
x, linear1_weight, linear2_weight, mode='test'
)
self.assertRaises(ValueError, test_dropout_mode)
support_types = {"float32"}  # restricted to float32 instead of get_xpu_op_support_types('fused_feedforward')
for stype in support_types:
create_test_class(globals(), XPUTestFusedFFNOp, stype)
if __name__ == "__main__":
unittest.main()