diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 104298e037319c6fbbfc8da830543fe06eb4dcac..541e5afdf9b71e4f087adcc4fe58cacdc54f4f61 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -15,7 +15,8 @@ register_operators(EXCLUDES fusion_group_op fusion_gru_op fusion_lstm_op - fused_bn_add_activation_op) + fused_bn_add_activation_op + fused_transformer_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) diff --git a/paddle/fluid/operators/fused/fused_transformer_op.cc b/paddle/fluid/operators/fused/fused_transformer_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9e5fc42fc76dd1c0aa8a9e44fe522be548140288 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_transformer_op.cc @@ -0,0 +1,161 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fused/fused_transformer_op.h" +#include + +namespace paddle { +namespace operators { + +// constructor and init +template +FusedTransformerEncoderLayer::FusedTransformerEncoderLayer( + int batch_size_, int max_seq_len_, int dim_embed_, int dim_feedforward_, + int num_head_, float dropout_, float act_dropout_, float attn_dropout_, + std::string act_method_, bool normalize_pre_or_post_) { + // configurations + batch_size = batch_size_; + max_seq_len = max_seq_len_; + dim_embed = dim_embed_; + dim_feedforward = dim_feedforward_; + num_head = num_head_; + head_size = dim_embed_ / num_head; + + dropout = dropout_; + act_dropout = act_dropout_; + attn_dropout = attn_dropout_; + + act_method = act_method_; + normalize_pre_or_post = normalize_pre_or_post_; + + // init attn + fused_attn = + new FusedAttention(batch_size, max_seq_len, dim_embed, num_head, + dropout, attn_dropout, normalize_pre_or_post); + + // init ffn + fused_ffn = + new FusedFFN(batch_size, max_seq_len, dim_embed, dim_feedforward_, + act_dropout, act_method, normalize_pre_or_post); +} + +// deconstructor +template +FusedTransformerEncoderLayer::~FusedTransformerEncoderLayer() { + delete fused_attn; + delete fused_ffn; +} + +// compute forward +template +void FusedTransformerEncoderLayer::ComputeForward(T* src, T* output) { + T* output_attn; // todo + + fused_attn->ComputeForward(src, output_attn); + fused_ffn->ComputeForward(output_attn, output); +} + +// compute backward +template +void FusedTransformerEncoderLayer::ComputeBackward() {} + +// constructor and init +template +FusedAttention::FusedAttention(int batch_size_, int max_seq_len_, + int dim_embed_, int num_head_, float dropout_, + float attn_dropout_, + bool normalize_pre_or_post_) { + // configurations + batch_size = batch_size_; + max_seq_len = max_seq_len_; + dim_embed = dim_embed_; + num_head = num_head_; + head_size = dim_embed_ / num_head; + + dropout = dropout_; + attn_dropout = attn_dropout_; + + 
normalize_pre_or_post = normalize_pre_or_post_; + + // init fmha + fmha = new FusedMHA(); +} + +// compute forward +template +void FusedAttention::ComputeForward(T* src, T* output) {} + +template +FusedAttention::~FusedAttention() { + delete fmha; +} + +// compute backward +template +void FusedAttention::ComputeBackward() {} + +// constructor and init +template +FusedFFN::FusedFFN(int batch_size_, int max_seq_len_, int dim_embed_, + int dim_feedforward_, float act_dropout_, + std::string act_method_, bool normalize_pre_or_post_) { + batch_size = batch_size_; + max_seq_len = max_seq_len_; + dim_embed = dim_embed_; + dim_feedforward = dim_feedforward_; + act_dropout = act_dropout_; + + act_method = act_method_; + normalize_pre_or_post = normalize_pre_or_post_; +} + +template +FusedFFN::~FusedFFN() {} + +// compute forward +template +void FusedFFN::ComputeForward(T* src, T* output) {} + +// compute backward +template +void FusedFFN::ComputeBackward() {} + +// init +template +FusedMHA::FusedMHA(int batch_size_, int max_seq_len_, int dim_embed_, + int num_head_, float dropout_, bool is_test_, + uint64_t seed_, uint64_t* seqlen_, uint64_t* cu_seqlen_) { + batch_size = batch_size_; + max_seq_len = max_seq_len_; + dim_embed = dim_embed_; + num_head = num_head_; + head_size = dim_embed_ / num_head; + + dropout = dropout_; + is_test = is_test_; + seed = seed_; + seqlen = seqlen_; + cu_seqlen = cu_seqlen_; +} + +// compute forward +template +void FusedMHA::ComputeForward(T* output, T* softmax_mask) {} + +// compute backward +template +void FusedMHA::ComputeBackward(const T* grad_output, T* softmax_mask, + T* grad_x) {} +} +} \ No newline at end of file diff --git a/paddle/fluid/operators/fused/fused_transformer_op.cu b/paddle/fluid/operators/fused/fused_transformer_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..43bf3acdc6156bd8d469beb8f9989523a3c53388 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_transformer_op.cu @@ -0,0 +1,13 @@ +/* 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ \ No newline at end of file diff --git a/paddle/fluid/operators/fused/fused_transformer_op.h b/paddle/fluid/operators/fused/fused_transformer_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2d2d390d243e5aeee6219f755dde53ba08c76e75 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_transformer_op.h @@ -0,0 +1,155 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
// */  <- closes the license block comment that begins before this chunk.

#pragma once

#include <cstdint>
#include <string>

namespace paddle {
namespace operators {

/// Fused multi-head attention helper.
///
/// Holds the configuration passed to its constructor. The two sequence-length
/// pointers are stored as given (non-owning).
template <typename T>
class FusedMHA {
 public:
  // Public so that FusedAttention, which holds a FusedMHA<T>*, can create
  // and destroy the helper.
  FusedMHA(int batch_size_, int max_seq_len_, int dim_embed_, int num_head_,
           float dropout_, bool is_test_, uint64_t seed_, uint64_t* seqlen_,
           uint64_t* cu_seqlen_);
  ~FusedMHA();

  void ComputeForward(T* output, T* softmax_mask);
  void ComputeBackward(const T* grad_output, T* softmax_mask, T* grad_x);

 private:
  int batch_size = 0;
  int max_seq_len = 0;
  int dim_embed = 0;

  int num_head = 0;
  int head_size = 0;  // dim_embed / num_head (integer division)

  float dropout = 0.0f;

  bool is_test = false;
  uint64_t seed = 0;

  // Same pointer types as the constructor parameters they are assigned from.
  // NOTE(review): presumably per-sequence lengths and their cumulative sums --
  // confirm against the kernel that will consume them.
  uint64_t* seqlen = nullptr;
  uint64_t* cu_seqlen = nullptr;
};

/// Attention sub-layer of the fused transformer encoder layer.
///
/// Owns a FusedMHA<T> through a raw pointer, so copying is disabled to avoid
/// two objects deleting the same helper.
template <typename T>
class FusedAttention {
 public:
  FusedAttention(int batch_size_, int max_seq_len_, int dim_embed_,
                 int num_head_, float dropout_, float attn_dropout_,
                 bool normalize_pre_or_post_);
  ~FusedAttention();

  FusedAttention(const FusedAttention&) = delete;
  FusedAttention& operator=(const FusedAttention&) = delete;

  void ComputeForward(T* src, T* output);
  void ComputeBackward();

 private:
  FusedMHA<T>* fmha = nullptr;  // fused multihead attention (owned)

  int batch_size = 0;
  int max_seq_len = 0;
  int dim_embed = 0;

  int num_head = 0;
  int head_size = 0;  // dim_embed / num_head (integer division)

  float dropout = 0.0f;
  // A dropout value supplied as float by the constructor, so it is stored as
  // float rather than as the tensor element type T.
  float attn_dropout = 0.0f;

  // NOTE(review): presumably selects pre- vs post-layer-norm; which value
  // means which is not visible here -- confirm.
  bool normalize_pre_or_post = false;

  // weights and bias used in attention (not set by the constructor)
  T* fattn_qkv_w = nullptr;
  T* fattn_qkv_b = nullptr;
  T* fattn_o_w = nullptr;
  T* fattn_o_b = nullptr;
  T* fattn_n_w = nullptr;
  T* fattn_n_b = nullptr;
  T* fattn_norm_w = nullptr;
  T* fattn_norm_b = nullptr;

  // gradient counterparts of the buffers above (by name)
  T* fattn_grad_qkv_w = nullptr;
  T* fattn_grad_qkv_b = nullptr;
  T* fattn_grad_o_w = nullptr;
  T* fattn_grad_o_b = nullptr;
  T* fattn_grad_n_w = nullptr;
  T* fattn_grad_n_b = nullptr;
  T* fattn_grad_norm_w = nullptr;
  T* fattn_grad_norm_b = nullptr;
};

/// Feed-forward sub-layer of the fused transformer encoder layer.
template <typename T>
class FusedFFN {
 public:
  // Public so that FusedTransformerEncoderLayer, which holds a FusedFFN<T>*,
  // can create and destroy the sub-layer.
  FusedFFN(int batch_size_, int max_seq_len_, int dim_embed_,
           int dim_feedforward_, float act_dropout_, std::string act_method_,
           bool normalize_pre_or_post_);
  ~FusedFFN();

  void ComputeForward(T* src, T* output);
  void ComputeBackward();

 private:
  int batch_size = 0;
  int max_seq_len = 0;
  int dim_embed = 0;
  int dim_feedforward = 0;

  float attn_dropout = 0.0f;  // not set by the constructor
  float act_dropout = 0.0f;

  bool normalize_pre_or_post = false;

  std::string act_method;

  // weights and bias used in ffn (not set by the constructor)
  T* fffn_inter_w = nullptr;
  T* fffn_inter_b = nullptr;
  T* fffn_output_w = nullptr;
  T* fffn_output_b = nullptr;

  // gradient counterparts of the buffers above (by name)
  T* fffn_grad_inter_w = nullptr;
  T* fffn_grad_inter_b = nullptr;
  T* fffn_grad_output_w = nullptr;
  T* fffn_grad_output_b = nullptr;
};

/// One fused transformer encoder layer: an attention sub-layer followed by a
/// feed-forward sub-layer.
///
/// Owns both sub-layers through raw pointers, so copying is disabled to avoid
/// two objects deleting the same sub-layers.
template <typename T>
class FusedTransformerEncoderLayer {
 public:
  FusedTransformerEncoderLayer(int batch_size_, int max_seq_len_,
                               int dim_embed_, int dim_feedforward_,
                               int num_head_, float dropout_,
                               float act_dropout_, float attn_dropout_,
                               std::string act_method_,
                               bool normalize_pre_or_post_);
  ~FusedTransformerEncoderLayer();

  FusedTransformerEncoderLayer(const FusedTransformerEncoderLayer&) = delete;
  FusedTransformerEncoderLayer& operator=(
      const FusedTransformerEncoderLayer&) = delete;

  void ComputeForward(T* src, T* output);
  void ComputeBackward();

 private:
  FusedAttention<T>* fused_attn = nullptr;  // owned
  FusedFFN<T>* fused_ffn = nullptr;         // owned

  int batch_size = 0;
  int max_seq_len = 0;
  int dim_embed = 0;
  int dim_feedforward = 0;

  int num_head = 0;
  int head_size = 0;  // dim_embed / num_head (integer division)

  float dropout = 0.0f;
  float attn_dropout = 0.0f;
  float act_dropout = 0.0f;

  bool normalize_pre_or_post = false;

  std::string act_method;
};

}  // namespace operators
}  // namespace paddle