未验证 提交 8a9bef70 编写于 作者: N Netpunk 提交者: GitHub

[PHI decoupling] migrate transpose_op.cu.h and gpu_utils.h to phi (#48286)

* migrate transpose_op.cu.h and gpu_utils.h

* format code style

* fix some problems

* format code

* reset tranpose_op.cc

* test commit

* recover transpose_op.h

* delete transpose_op.h

* adjust header files order in transpose_op.cc
上级 612b81c5
...@@ -16,12 +16,12 @@ limitations under the License. */ ...@@ -16,12 +16,12 @@ limitations under the License. */
#include "paddle/fluid/operators/dropout_impl.cu.h" #include "paddle/fluid/operators/dropout_impl.cu.h"
#include "paddle/fluid/operators/fused/fused_softmax_mask.cu.h" #include "paddle/fluid/operators/fused/fused_softmax_mask.cu.h"
#include "paddle/fluid/operators/transpose_op.cu.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/functors.h" #include "paddle/phi/kernels/funcs/functors.h"
#include "paddle/phi/kernels/funcs/transpose_functor.cu.h"
#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
namespace paddle { namespace paddle {
...@@ -98,7 +98,7 @@ class FMHARef { ...@@ -98,7 +98,7 @@ class FMHARef {
// transpose with perm [2, 0, 3, 1, 4], // transpose with perm [2, 0, 3, 1, 4],
// output_shape: [3, bs, num_head, seq_len, head_dim] // output_shape: [3, bs, num_head, seq_len, head_dim]
std::vector<int> perm_1 = {2, 0, 3, 1, 4}; std::vector<int> perm_1 = {2, 0, 3, 1, 4};
TransposeGPUKernelDriver<T>( phi::funcs::TransposeGPUKernelDriver<T>(
dev_ctx_, qkv_input_tensor, perm_1, transpose_2_out_tensor); dev_ctx_, qkv_input_tensor, perm_1, transpose_2_out_tensor);
T* qkv_data = transpose_2_out_tensor->data<T>(); T* qkv_data = transpose_2_out_tensor->data<T>();
T* qk_out_data = qk_out_tensor->data<T>(); T* qk_out_data = qk_out_tensor->data<T>();
...@@ -254,7 +254,7 @@ class FMHARef { ...@@ -254,7 +254,7 @@ class FMHARef {
// transpose: [0, 2, 1, 3] // transpose: [0, 2, 1, 3]
// output shape: [batch_size, seq_len, num_heads, head_dim] // output shape: [batch_size, seq_len, num_heads, head_dim]
std::vector<int> perm_3 = {0, 2, 1, 3}; std::vector<int> perm_3 = {0, 2, 1, 3};
TransposeGPUKernelDriver<T>( phi::funcs::TransposeGPUKernelDriver<T>(
dev_ctx_, *qktv_out_tensor, perm_3, fmha_out_tensor); dev_ctx_, *qktv_out_tensor, perm_3, fmha_out_tensor);
} }
...@@ -428,7 +428,7 @@ class FMHARef { ...@@ -428,7 +428,7 @@ class FMHARef {
// transpose: [0, 2, 1, 3] // transpose: [0, 2, 1, 3]
// output shape: [batch_size, seq_len, num_heads, head_dim] // output shape: [batch_size, seq_len, num_heads, head_dim]
std::vector<int> perm_3 = {0, 2, 1, 3}; std::vector<int> perm_3 = {0, 2, 1, 3};
TransposeGPUKernelDriver<T>( phi::funcs::TransposeGPUKernelDriver<T>(
dev_ctx_, *qktv_out_tensor, perm_3, fmha_out_tensor); dev_ctx_, *qktv_out_tensor, perm_3, fmha_out_tensor);
} }
...@@ -470,7 +470,7 @@ class FMHARef { ...@@ -470,7 +470,7 @@ class FMHARef {
// transpose bw // transpose bw
std::vector<int> perm_3 = {0, 2, 1, 3}; std::vector<int> perm_3 = {0, 2, 1, 3};
TransposeGPUKernelDriver<T>( phi::funcs::TransposeGPUKernelDriver<T>(
dev_ctx_, fmha_out_grad_tensor, perm_3, qktv_out_grad_tensor); dev_ctx_, fmha_out_grad_tensor, perm_3, qktv_out_grad_tensor);
// recall batchedgemm(nn) fw: softmax_out_data(x) * v_ptr(y) = // recall batchedgemm(nn) fw: softmax_out_data(x) * v_ptr(y) =
...@@ -648,7 +648,7 @@ class FMHARef { ...@@ -648,7 +648,7 @@ class FMHARef {
// transpose bw // transpose bw
std::vector<int> perm_1 = {1, 3, 0, 2, 4}; std::vector<int> perm_1 = {1, 3, 0, 2, 4};
TransposeGPUKernelDriver<T>( phi::funcs::TransposeGPUKernelDriver<T>(
dev_ctx_, *transpose_2_out_grad_tensor, perm_1, qkv_input_grad_tensor); dev_ctx_, *transpose_2_out_grad_tensor, perm_1, qkv_input_grad_tensor);
} }
......
...@@ -14,11 +14,11 @@ limitations under the License. */ ...@@ -14,11 +14,11 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/transpose_op.cu.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/funcs/transpose_functor.cu.h"
#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
namespace paddle { namespace paddle {
...@@ -626,9 +626,12 @@ class FMHAGateRef { ...@@ -626,9 +626,12 @@ class FMHAGateRef {
phi::DenseTensor* k_transpose_out, phi::DenseTensor* k_transpose_out,
phi::DenseTensor* v_transpose_out) { phi::DenseTensor* v_transpose_out) {
std::vector<int> perm = {0, 1, 3, 2, 4}; std::vector<int> perm = {0, 1, 3, 2, 4};
TransposeGPUKernelDriver<T>(dev_ctx_, q_out, perm, q_transpose_out); phi::funcs::TransposeGPUKernelDriver<T>(
TransposeGPUKernelDriver<T>(dev_ctx_, k_out, perm, k_transpose_out); dev_ctx_, q_out, perm, q_transpose_out);
TransposeGPUKernelDriver<T>(dev_ctx_, v_out, perm, v_transpose_out); phi::funcs::TransposeGPUKernelDriver<T>(
dev_ctx_, k_out, perm, k_transpose_out);
phi::funcs::TransposeGPUKernelDriver<T>(
dev_ctx_, v_out, perm, v_transpose_out);
} }
void ComputeQKVTransposeBackward(const phi::DenseTensor& q_transpose_out_grad, void ComputeQKVTransposeBackward(const phi::DenseTensor& q_transpose_out_grad,
...@@ -638,11 +641,11 @@ class FMHAGateRef { ...@@ -638,11 +641,11 @@ class FMHAGateRef {
phi::DenseTensor* k_out_grad, phi::DenseTensor* k_out_grad,
phi::DenseTensor* v_out_grad) { phi::DenseTensor* v_out_grad) {
std::vector<int> perm = {0, 1, 3, 2, 4}; std::vector<int> perm = {0, 1, 3, 2, 4};
TransposeGPUKernelDriver<T>( phi::funcs::TransposeGPUKernelDriver<T>(
dev_ctx_, q_transpose_out_grad, perm, q_out_grad); dev_ctx_, q_transpose_out_grad, perm, q_out_grad);
TransposeGPUKernelDriver<T>( phi::funcs::TransposeGPUKernelDriver<T>(
dev_ctx_, k_transpose_out_grad, perm, k_out_grad); dev_ctx_, k_transpose_out_grad, perm, k_out_grad);
TransposeGPUKernelDriver<T>( phi::funcs::TransposeGPUKernelDriver<T>(
dev_ctx_, v_transpose_out_grad, perm, v_out_grad); dev_ctx_, v_transpose_out_grad, perm, v_out_grad);
} }
...@@ -651,14 +654,15 @@ class FMHAGateRef { ...@@ -651,14 +654,15 @@ class FMHAGateRef {
void ComputeQKVTransposeForward(const phi::DenseTensor& qkv_out, void ComputeQKVTransposeForward(const phi::DenseTensor& qkv_out,
phi::DenseTensor* qkv_transpose_out) { phi::DenseTensor* qkv_transpose_out) {
std::vector<int> perm = {3, 0, 1, 4, 2, 5}; std::vector<int> perm = {3, 0, 1, 4, 2, 5};
TransposeGPUKernelDriver<T>(dev_ctx_, qkv_out, perm, qkv_transpose_out); phi::funcs::TransposeGPUKernelDriver<T>(
dev_ctx_, qkv_out, perm, qkv_transpose_out);
} }
void ComputeQKVTransposeBackward( void ComputeQKVTransposeBackward(
const phi::DenseTensor& qkv_transpose_out_grad, const phi::DenseTensor& qkv_transpose_out_grad,
phi::DenseTensor* qkv_out_grad) { phi::DenseTensor* qkv_out_grad) {
std::vector<int> perm = {1, 2, 4, 0, 3, 5}; std::vector<int> perm = {1, 2, 4, 0, 3, 5};
TransposeGPUKernelDriver<T>( phi::funcs::TransposeGPUKernelDriver<T>(
dev_ctx_, qkv_transpose_out_grad, perm, qkv_out_grad); dev_ctx_, qkv_transpose_out_grad, perm, qkv_out_grad);
} }
...@@ -667,13 +671,14 @@ class FMHAGateRef { ...@@ -667,13 +671,14 @@ class FMHAGateRef {
void ComputeQKTVTransposeForward(const phi::DenseTensor& qktv_out, void ComputeQKTVTransposeForward(const phi::DenseTensor& qktv_out,
phi::DenseTensor* fmha_out) { phi::DenseTensor* fmha_out) {
std::vector<int> perm = {0, 1, 3, 2, 4}; std::vector<int> perm = {0, 1, 3, 2, 4};
TransposeGPUKernelDriver<T>(dev_ctx_, qktv_out, perm, fmha_out); phi::funcs::TransposeGPUKernelDriver<T>(dev_ctx_, qktv_out, perm, fmha_out);
} }
void ComputeQKTVTransposeBackward(const phi::DenseTensor& fmha_out_grad, void ComputeQKTVTransposeBackward(const phi::DenseTensor& fmha_out_grad,
phi::DenseTensor* qktv_out_grad) { phi::DenseTensor* qktv_out_grad) {
std::vector<int> perm = {0, 1, 3, 2, 4}; std::vector<int> perm = {0, 1, 3, 2, 4};
TransposeGPUKernelDriver<T>(dev_ctx_, fmha_out_grad, perm, qktv_out_grad); phi::funcs::TransposeGPUKernelDriver<T>(
dev_ctx_, fmha_out_grad, perm, qktv_out_grad);
} }
// qk_out = qk_out + nonbatched_bias + src_mask // qk_out = qk_out + nonbatched_bias + src_mask
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/platform/mkldnn_reuse.h"
#include "paddle/phi/kernels/funcs/transpose_functor.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/transpose_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -21,6 +19,8 @@ limitations under the License. */ ...@@ -21,6 +19,8 @@ limitations under the License. */
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/transpose_functor.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/transpose_op.h" #include "paddle/phi/kernels/funcs/transpose_functor.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -23,8 +23,8 @@ limitations under the License. */ ...@@ -23,8 +23,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/transpose_functor.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -251,7 +251,7 @@ static void UniqueDim(const framework::ExecutionContext& context, ...@@ -251,7 +251,7 @@ static void UniqueDim(const framework::ExecutionContext& context,
in_trans.Resize(in_trans_dims); in_trans.Resize(in_trans_dims);
in_trans.mutable_data<InT>(context.GetPlace()); in_trans.mutable_data<InT>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
TransCompute<DeviceContext, InT>( phi::funcs::TransCompute<DeviceContext, InT>(
in.dims().size(), dev_ctx, in, &in_trans, permute); in.dims().size(), dev_ctx, in, &in_trans, permute);
// reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
framework::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); framework::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1);
...@@ -315,7 +315,7 @@ static void UniqueDim(const framework::ExecutionContext& context, ...@@ -315,7 +315,7 @@ static void UniqueDim(const framework::ExecutionContext& context,
out->Resize(phi::make_ddim(out_trans_dims_vec)); out->Resize(phi::make_ddim(out_trans_dims_vec));
out->mutable_data<InT>(context.GetPlace()); out->mutable_data<InT>(context.GetPlace());
concat_functor(dev_ctx, input_unbind, 0, &out_trans); concat_functor(dev_ctx, input_unbind, 0, &out_trans);
TransCompute<DeviceContext, InT>( phi::funcs::TransCompute<DeviceContext, InT>(
out_trans.dims().size(), dev_ctx, out_trans, out, permute); out_trans.dims().size(), dev_ctx, out_trans, out, permute);
if (return_inverse) { if (return_inverse) {
......
...@@ -18,11 +18,11 @@ ...@@ -18,11 +18,11 @@
#include <array> #include <array>
#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/enforce.h"
#include "unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
namespace paddle { namespace phi {
namespace framework { namespace funcs {
template <typename T, int Size, T DefaultValue> template <typename T, int Size, T DefaultValue>
struct DeviceArray { struct DeviceArray {
...@@ -110,16 +110,16 @@ IntType CeilOrFloor(IntType x, IntType deviser) { ...@@ -110,16 +110,16 @@ IntType CeilOrFloor(IntType x, IntType deviser) {
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
deviser, deviser,
0, 0,
platform::errors::InvalidArgument("deviser should be greater than 0, " phi::errors::InvalidArgument("deviser should be greater than 0, "
"but received is:%d", "but received is:%d",
deviser)); deviser));
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
x, x,
0, 0,
platform::errors::InvalidArgument("input should be greater than 0, " phi::errors::InvalidArgument("input should be greater than 0, "
"but received is:%d", "but received is:%d",
x)); x));
const IntType round_to_zero = x / deviser; const IntType round_to_zero = x / deviser;
const IntType inte_result = round_to_zero * deviser; const IntType inte_result = round_to_zero * deviser;
...@@ -140,5 +140,5 @@ IntType CeilOrFloor(IntType x, IntType deviser) { ...@@ -140,5 +140,5 @@ IntType CeilOrFloor(IntType x, IntType deviser) {
} }
} }
} // namespace framework } // namespace funcs
} // namespace paddle } // namespace phi
...@@ -14,20 +14,18 @@ limitations under the License. */ ...@@ -14,20 +14,18 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/gpu_utils.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_utils.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/autotune/auto_tune_base.h" #include "paddle/phi/kernels/autotune/auto_tune_base.h"
#include "paddle/phi/kernels/funcs/transpose_functor.h"
#include "paddle/phi/kernels/primitive/datamover_primitives.h"
namespace paddle { namespace phi {
namespace operators { namespace funcs {
using Tensor = phi::DenseTensor; using Tensor = phi::DenseTensor;
using Dim3 = framework::Dim3;
using Index3 = framework::Index3;
struct EqualTo { struct EqualTo {
constexpr bool operator()(int a, int b) const { return a == b; } constexpr bool operator()(int a, int b) const { return a == b; }
...@@ -118,8 +116,8 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, ...@@ -118,8 +116,8 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input,
}; };
// Converts block idx to tile index, each block process a tile // Converts block idx to tile index, each block process a tile
Index3 input_block_tile_index = framework::ConvertTensorIndex<IndexType>( Index3 input_block_tile_index =
blockIdx.x, tile_aligned_input_dim); ConvertTensorIndex<IndexType>(blockIdx.x, tile_aligned_input_dim);
// Compute real index align to tile:0, 32, 64... // Compute real index align to tile:0, 32, 64...
Index3 block_tile_index_in_input = { Index3 block_tile_index_in_input = {
...@@ -130,8 +128,7 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, ...@@ -130,8 +128,7 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input,
// Compute block flat index against input dims. // Compute block flat index against input dims.
IndexType input_origin_block_flat_index = IndexType input_origin_block_flat_index =
framework::FlatTensorIndex<IndexType>(block_tile_index_in_input, FlatTensorIndex<IndexType>(block_tile_index_in_input, input_dims);
input_dims);
bool full_tile = true; bool full_tile = true;
IndexType tile_width = TileY; IndexType tile_width = TileY;
...@@ -193,8 +190,7 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, ...@@ -193,8 +190,7 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input,
}; };
IndexType output_origin_block_flat_index = IndexType output_origin_block_flat_index =
framework::FlatTensorIndex<IndexType>(block_tile_index_in_output, FlatTensorIndex<IndexType>(block_tile_index_in_output, output_dims);
output_dims);
constexpr IndexType out_effective_thread_num = NumThreads / TileX * TileX; constexpr IndexType out_effective_thread_num = NumThreads / TileX * TileX;
...@@ -230,13 +226,13 @@ bool SelectProperTileSize(std::vector<std::pair<int, int>>* tiles) { ...@@ -230,13 +226,13 @@ bool SelectProperTileSize(std::vector<std::pair<int, int>>* tiles) {
PADDLE_ENFORCE_LE( PADDLE_ENFORCE_LE(
TSIZE, TSIZE,
16, 16,
platform::errors::InvalidArgument( phi::errors::InvalidArgument(
"The tile size should smaller than 16, but received is:%d.", TSIZE)); "The tile size should smaller than 16, but received is:%d.", TSIZE));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
(TSIZE & (TSIZE - 1)), (TSIZE & (TSIZE - 1)),
0, 0,
platform::errors::InvalidArgument( phi::errors::InvalidArgument(
"Data types should be powers of 2, but reived size is:%d.", TSIZE)); "Data types should be powers of 2, but reived size is:%d.", TSIZE));
const int kMaxLongSideLen = 1024; const int kMaxLongSideLen = 1024;
...@@ -316,7 +312,7 @@ struct NarrowDims2TransposeDispatch { ...@@ -316,7 +312,7 @@ struct NarrowDims2TransposeDispatch {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
(tile_long & (tile_long - 1)), (tile_long & (tile_long - 1)),
0, 0,
platform::errors::InvalidArgument( phi::errors::InvalidArgument(
"The length of the longer side of the tile should be power of 2." "The length of the longer side of the tile should be power of 2."
" But received value is:%d.", " But received value is:%d.",
tile_long)); tile_long));
...@@ -381,7 +377,7 @@ struct NarrowDims2TransposeDispatch< ...@@ -381,7 +377,7 @@ struct NarrowDims2TransposeDispatch<
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
(tile_long & (tile_long - 1)), (tile_long & (tile_long - 1)),
0, 0,
platform::errors::InvalidArgument( phi::errors::InvalidArgument(
"The length of the longer side of the tile should be power of 2." "The length of the longer side of the tile should be power of 2."
" But received value is:%d.", " But received value is:%d.",
tile_long)); tile_long));
...@@ -431,7 +427,7 @@ struct NarrowDims2TransposeDispatch< ...@@ -431,7 +427,7 @@ struct NarrowDims2TransposeDispatch<
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
(tile_long & (tile_long - 1)), (tile_long & (tile_long - 1)),
0, 0,
platform::errors::InvalidArgument( phi::errors::InvalidArgument(
"The length of the longer side of the tile should be power of 2," "The length of the longer side of the tile should be power of 2,"
" but received is:%d.", " but received is:%d.",
tile_long)); tile_long));
...@@ -459,7 +455,7 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d, ...@@ -459,7 +455,7 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d,
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ret, ret,
true, true,
platform::errors::InvalidArgument( phi::errors::InvalidArgument(
"SelectProperTileSize should return true, but return value is:%d.", "SelectProperTileSize should return true, but return value is:%d.",
ret)); ret));
...@@ -475,12 +471,12 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d, ...@@ -475,12 +471,12 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d,
// to find least wasted threads, which means we need to find tile // to find least wasted threads, which means we need to find tile
// can split input properly, in another words: num_wasted_threads=0. // can split input properly, in another words: num_wasted_threads=0.
int num_wasted_threads = int num_wasted_threads =
input_long_edge - framework::CeilOrFloor<int, false>( input_long_edge -
input_long_edge, proposed_tile_long_edge) * CeilOrFloor<int, false>(input_long_edge, proposed_tile_long_edge) *
proposed_tile_long_edge; proposed_tile_long_edge;
int num_full_tiles = framework::CeilOrFloor<int, false>( int num_full_tiles =
input_long_edge, proposed_tile_long_edge); CeilOrFloor<int, false>(input_long_edge, proposed_tile_long_edge);
float cost = num_wasted_threads; float cost = num_wasted_threads;
...@@ -514,8 +510,8 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d, ...@@ -514,8 +510,8 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d,
// Here finally get proper long X short tile size. // Here finally get proper long X short tile size.
Dim3 input_dims_aligned = { Dim3 input_dims_aligned = {
input_dims[0], input_dims[0],
framework::CeilOrFloor<int, true>(input_dims[1], select_tile_size_i), CeilOrFloor<int, true>(input_dims[1], select_tile_size_i),
framework::CeilOrFloor<int, true>(input_dims[2], select_tile_size_j), CeilOrFloor<int, true>(input_dims[2], select_tile_size_j),
}; };
IndexType total_tiles_count = input_dims_aligned[0]; IndexType total_tiles_count = input_dims_aligned[0];
...@@ -549,7 +545,7 @@ __global__ void TransposeSimpleKernel(IndexType nthreads, ...@@ -549,7 +545,7 @@ __global__ void TransposeSimpleKernel(IndexType nthreads,
CUDA_KERNEL_LOOP_TYPE(output_index, nthreads, IndexType) { CUDA_KERNEL_LOOP_TYPE(output_index, nthreads, IndexType) {
Index3 output_tensor_index = Index3 output_tensor_index =
framework::ConvertTensorIndex<IndexType>(output_index, output_dims); ConvertTensorIndex<IndexType>(output_index, output_dims);
Index3 input_tensor_index; Index3 input_tensor_index;
input_tensor_index[0] = output_tensor_index[pos0]; input_tensor_index[0] = output_tensor_index[pos0];
...@@ -557,7 +553,7 @@ __global__ void TransposeSimpleKernel(IndexType nthreads, ...@@ -557,7 +553,7 @@ __global__ void TransposeSimpleKernel(IndexType nthreads,
input_tensor_index[2] = output_tensor_index[pos2]; input_tensor_index[2] = output_tensor_index[pos2];
IndexType input_index = IndexType input_index =
framework::FlatTensorIndex<IndexType>(input_tensor_index, input_dims); FlatTensorIndex<IndexType>(input_tensor_index, input_dims);
output[output_index] = input[input_index]; output[output_index] = input[input_index];
} }
...@@ -585,8 +581,8 @@ void SendSwapDim1And2InTranspose(const phi::GPUContext& d, ...@@ -585,8 +581,8 @@ void SendSwapDim1And2InTranspose(const phi::GPUContext& d,
Dim3 input_dims_aligned = { Dim3 input_dims_aligned = {
input_dims[0], input_dims[0],
framework::CeilOrFloor<int, true>(input_dims[1], kTileSize), CeilOrFloor<int, true>(input_dims[1], kTileSize),
framework::CeilOrFloor<int, true>(input_dims[2], kTileSize), CeilOrFloor<int, true>(input_dims[2], kTileSize),
}; };
IndexType total_tiles_count = input_dims_aligned[0]; IndexType total_tiles_count = input_dims_aligned[0];
...@@ -653,13 +649,13 @@ struct SwapDim0And2InTranspose { ...@@ -653,13 +649,13 @@ struct SwapDim0And2InTranspose {
// This function is to combine dimension. fox example: // This function is to combine dimension. fox example:
// (0, 1, 3, 2) --> (0, 2, 1) // (0, 1, 3, 2) --> (0, 2, 1)
inline void CombineTransposeDim3(const framework::DDim& shape, inline void CombineTransposeDim3(const DDim& shape,
const std::vector<int>& perm, const std::vector<int>& perm,
std::vector<int>* new_perm, std::vector<int>* new_perm,
framework::DDim* new_dims) { DDim* new_dims) {
PADDLE_ENFORCE_EQ(shape.size(), PADDLE_ENFORCE_EQ(shape.size(),
perm.size(), perm.size(),
platform::errors::InvalidArgument( phi::errors::InvalidArgument(
" shape should have the save dim with perm, but" " shape should have the save dim with perm, but"
" received shape size is:%d, perm size is:%d.", " received shape size is:%d, perm size is:%d.",
shape.size(), shape.size(),
...@@ -717,7 +713,7 @@ struct TransposeSimple { ...@@ -717,7 +713,7 @@ struct TransposeSimple {
phi::DenseTensor* out) { phi::DenseTensor* out) {
// First reduce the dimensions of the input tensor if possible. // First reduce the dimensions of the input tensor if possible.
std::vector<int> new_perm; std::vector<int> new_perm;
framework::DDim new_dims; DDim new_dims;
CombineTransposeDim3(in.dims(), perm, &new_perm, &new_dims); CombineTransposeDim3(in.dims(), perm, &new_perm, &new_dims);
// Only use tile copy GPU kernel when dimension is 2 or 3. // Only use tile copy GPU kernel when dimension is 2 or 3.
...@@ -796,7 +792,7 @@ class IdxHelper<N, uint32_t> { ...@@ -796,7 +792,7 @@ class IdxHelper<N, uint32_t> {
explicit IdxHelper(const uint32_t* dims) { explicit IdxHelper(const uint32_t* dims) {
for (int i = N - 1; i >= 0; --i) { for (int i = N - 1; i >= 0; --i) {
uint32_t value = i < (N - 1) ? dims[i + 1] * stride_[i + 1] : 1; uint32_t value = i < (N - 1) ? dims[i + 1] * stride_[i + 1] : 1;
divmoder_[i] = paddle::platform::FastDivMod(value); divmoder_[i] = phi::kps::details::FastDivMod(value);
stride_[i] = value; stride_[i] = value;
} }
} }
...@@ -817,7 +813,7 @@ class IdxHelper<N, uint32_t> { ...@@ -817,7 +813,7 @@ class IdxHelper<N, uint32_t> {
private: private:
uint32_t stride_[N]; uint32_t stride_[N];
paddle::platform::FastDivMod divmoder_[N]; phi::kps::details::FastDivMod divmoder_[N];
}; };
// Transform index between memory offset and shape coodinate. // Transform index between memory offset and shape coodinate.
...@@ -1188,8 +1184,8 @@ void TransposeGPUKernelDriver(const phi::GPUContext& ctx, ...@@ -1188,8 +1184,8 @@ void TransposeGPUKernelDriver(const phi::GPUContext& ctx,
ret = TransposeSimple<T>::run(ctx, in, perm, out); ret = TransposeSimple<T>::run(ctx, in, perm, out);
} }
if (!ret) { if (!ret) {
auto* tuner = auto* tuner = phi::autotune::MakeTransposeTuner<T>(
phi::autotune::MakeTransposeTuner<T>(TransCompute<phi::GPUContext, T>); funcs::TransCompute<phi::GPUContext, T>);
tuner->AddCallBack(PermuteAndTranspose<phi::GPUContext, T>); tuner->AddCallBack(PermuteAndTranspose<phi::GPUContext, T>);
size_t key = phi::autotune::TransposeKey( size_t key = phi::autotune::TransposeKey(
...@@ -1208,5 +1204,5 @@ void TransposeGPUKernelDriver(const phi::GPUContext& ctx, ...@@ -1208,5 +1204,5 @@ void TransposeGPUKernelDriver(const phi::GPUContext& ctx,
} }
} }
} // namespace operators } // namespace funcs
} // namespace paddle } // namespace phi
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -16,57 +16,15 @@ limitations under the License. */ ...@@ -16,57 +16,15 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle { namespace phi {
namespace operators { namespace funcs {
enum { kTransposeMKLDNNFP32 = 1, kTransposeMKLDNNINT8 = 2 }; enum { kTransposeMKLDNNFP32 = 1, kTransposeMKLDNNINT8 = 2 };
template <typename DeviceContext, typename T>
inline void TransCompute(const int dim,
const DeviceContext& dev_ctx,
const phi::DenseTensor& in,
phi::DenseTensor* out,
const std::vector<int>& axis) {
switch (dim) {
case 0:
phi::Copy<DeviceContext>(dev_ctx, in, dev_ctx.GetPlace(), false, out);
break;
case 1:
phi::funcs::Transpose<DeviceContext, T, 1> trans1;
trans1(dev_ctx, in, out, axis);
break;
case 2:
phi::funcs::Transpose<DeviceContext, T, 2> trans2;
trans2(dev_ctx, in, out, axis);
break;
case 3:
phi::funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(dev_ctx, in, out, axis);
break;
case 4:
phi::funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(dev_ctx, in, out, axis);
break;
case 5:
phi::funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, in, out, axis);
break;
case 6:
phi::funcs::Transpose<DeviceContext, T, 6> trans6;
trans6(dev_ctx, in, out, axis);
break;
default:
// for dim >= 7 situation
phi::funcs::TransposeNormal<DeviceContext, T> trans_normal;
trans_normal(dev_ctx, in, out, axis);
}
}
enum PermuteType { enum PermuteType {
kCopy = 1, kCopy = 1,
kTranspose = 2, kTranspose = 2,
...@@ -227,5 +185,5 @@ class TranposeTypeClassifier { ...@@ -227,5 +185,5 @@ class TranposeTypeClassifier {
} }
}; };
} // namespace operators } // namespace funcs
} // namespace paddle } // namespace phi
...@@ -16,12 +16,12 @@ ...@@ -16,12 +16,12 @@
#include <vector> #include <vector>
#include "paddle/fluid/operators/transpose_op.cu.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/transpose_functor.cu.h"
#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" #include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h"
namespace phi { namespace phi {
...@@ -38,7 +38,7 @@ void TransposeKernel(const Context& ctx, ...@@ -38,7 +38,7 @@ void TransposeKernel(const Context& ctx,
phi::Copy<Context>(ctx, x, ctx.GetPlace(), false, out); phi::Copy<Context>(ctx, x, ctx.GetPlace(), false, out);
return; return;
} }
paddle::operators::TransposeGPUKernelDriver<T>(ctx, x, axis, out); phi::funcs::TransposeGPUKernelDriver<T>(ctx, x, axis, out);
} }
} // namespace phi } // namespace phi
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册