未验证 提交 21aa3adc 编写于 作者: 王明冬 提交者: GitHub

move fc_functor from fluid to phi.test=develop (#41856)

上级 f3753b7f
......@@ -166,7 +166,7 @@ lod_tensor maxouting unpooling pooling lod_rank_table context_project
sequence_pooling executor device_memory_aligment generator)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function)
......
......@@ -14,10 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/attention_lstm_op.h"
#include <string>
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
namespace paddle {
namespace operators {
......@@ -377,7 +377,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
// x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
math::FCFunctor<DeviceContext, T> fc;
phi::funcs::FCFunctor<DeviceContext, T> fc;
fc(dev_ctx, total_T, 1, M, x_data, atten_w_data, atted_x_data,
atten_b_data);
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
namespace paddle {
namespace operators {
......@@ -80,7 +80,7 @@ class FCOpKernel : public framework::OpKernel<T> {
T* output_data = output->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::FCFunctor<DeviceContext, T> fc;
phi::funcs::FCFunctor<DeviceContext, T> fc;
fc(dev_ctx, M, w_dims1, w_dims0, input_data, w_data, output_data,
bias ? bias->data<T>() : NULL, with_relu, padding_weights);
}
......
......@@ -18,8 +18,8 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
#include "paddle/phi/kernels/funcs/sequence2batch.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
......@@ -298,7 +298,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::FCFunctor<DeviceContext, T> fc;
phi::funcs::FCFunctor<DeviceContext, T> fc;
fc(dev_ctx, total_T, D3, M, x_data, wx_data, xx_data,
bias ? bias->data<T>() : nullptr);
......@@ -370,7 +370,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
phi::funcs::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
math::FCFunctor<DeviceContext, T> fc;
phi::funcs::FCFunctor<DeviceContext, T> fc;
if (M > D3) {
fc(dev_ctx, total_T, D3, M, x_data, wx_data, xx_data,
bias ? bias->data<T>() : nullptr);
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_lstm_op.h"
#include <string>
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
#include "paddle/phi/kernels/funcs/sequence2batch.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
......@@ -346,7 +346,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::FCFunctor<DeviceContext, T> fc;
phi::funcs::FCFunctor<DeviceContext, T> fc;
fc(dev_ctx, total_T, D4, M, x_data, wx_data, xx_data, bias->data<T>());
int xx_offset = D4;
......@@ -424,7 +424,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
phi::funcs::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
math::FCFunctor<DeviceContext, T> fc;
phi::funcs::FCFunctor<DeviceContext, T> fc;
if (M > D4) {
fc(dev_ctx, x_dims[0], D4, M, x_data, wx_data, xx_data, bias->data<T>());
to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h"
#include <algorithm> // for min, max
#include <string>
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
namespace paddle {
namespace operators {
......@@ -244,7 +244,7 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
}
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::FCFunctor<DeviceContext, T> fc;
phi::funcs::FCFunctor<DeviceContext, T> fc;
fc(dev_ctx, x_dims[0], w_dims[1], w_dims[0], col_data, w_data, y_data,
b_data, true);
}
......
......@@ -14,10 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h"
#include <string>
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
namespace paddle {
namespace operators {
......@@ -212,7 +212,7 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
math::FCFunctor<DeviceContext, T> fc;
phi::funcs::FCFunctor<DeviceContext, T> fc;
fc(dev_ctx, total_T, D, M0, ref_in_data, w_data, out_data,
b ? b->data<T>() : NULL);
w_data = w_data + M0 * D;
......
......@@ -18,8 +18,8 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
#include "paddle/phi/kernels/funcs/sequence2batch.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
......
......@@ -36,7 +36,6 @@ if (WITH_ASCEND_CL)
else()
math_library(beam_search DEPS math_function)
endif()
math_library(fc DEPS blas jit_kernel_helper)
math_library(matrix_bit_code)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class FCFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context, const int M,
const int N, const int K, const T* X, const T* W, T* Y,
const T* B = nullptr, bool relu = false,
bool padding_weights = false) {
auto blas = phi::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
framework::Tensor Y1;
T* Y1_data = nullptr;
if (padding_weights) {
const int NN = N + 4;
const int KK = K + 4;
framework::Tensor X1;
T* X1_data = X1.mutable_data<T>({M * KK}, platform::CPUPlace());
Y1_data = Y1.mutable_data<T>({M * (N + 4)}, platform::CPUPlace());
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (int i = 0; i < M; i++) {
memcpy(X1_data + i * KK, X + i * K, K * sizeof(T));
}
blas.GEMM(false, false, M, N, K, static_cast<T>(1.0), X1_data, KK, W, NN,
static_cast<T>(0.0), Y1_data, NN);
} else {
blas.MatMul(M, N, K, X, W, Y);
}
if (B == NULL) {
if (padding_weights) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (int i = 0; i < M; i++) {
memcpy(Y + i * N, Y1_data + i * (N + 4), N * sizeof(T));
}
}
PADDLE_ENFORCE_EQ(relu, false,
platform::errors::PermissionDenied(
"When bias is NULL, relu can not be true."));
return;
}
auto compute =
relu
? jit::KernelFuncs<jit::VAddReluTuple<T>,
platform::CPUPlace>::Cache()
.At(N)
: jit::KernelFuncs<jit::VAddTuple<T>, platform::CPUPlace>::Cache()
.At(N);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (int i = 0; i < M; i++) {
T* dst = Y + i * N;
T* src = (padding_weights) ? Y1_data + i * (N + 4) : dst;
compute(B, src, dst, N);
}
}
};
template class FCFunctor<platform::CPUDeviceContext, float>;
template class FCFunctor<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -5,6 +5,7 @@ add_subdirectory(detail)
math_library(deformable_conv_functor DEPS dense_tensor)
math_library(concat_and_split_functor DEPS dense_tensor)
math_library(fc_functor DEPS blas jit_kernel_helper)
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
math_library(math_function DEPS blas dense_tensor tensor)
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/funcs/fc_functor.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace phi {
namespace funcs {
template <typename DeviceContext, typename T>
void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
const int M,
const int N,
const int K,
const T* X,
const T* W,
T* Y,
const T* B,
bool relu,
bool padding_weights) {
auto blas = GetBlas<DeviceContext, T>(context);
paddle::framework::Tensor Y1;
T* Y1_data = nullptr;
if (padding_weights) {
const int NN = N + 4;
const int KK = K + 4;
paddle::framework::Tensor X1;
T* X1_data = X1.mutable_data<T>({M * KK}, paddle::platform::CPUPlace());
Y1_data = Y1.mutable_data<T>({M * (N + 4)}, paddle::platform::CPUPlace());
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (int i = 0; i < M; i++) {
memcpy(X1_data + i * KK, X + i * K, K * sizeof(T));
}
blas.GEMM(false,
false,
M,
N,
K,
static_cast<T>(1.0),
X1_data,
KK,
W,
NN,
static_cast<T>(0.0),
Y1_data,
NN);
} else {
blas.MatMul(M, N, K, X, W, Y);
}
if (B == NULL) {
if (padding_weights) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (int i = 0; i < M; i++) {
memcpy(Y + i * N, Y1_data + i * (N + 4), N * sizeof(T));
}
}
PADDLE_ENFORCE_EQ(
relu,
false,
errors::PermissionDenied("When bias is NULL, relu can not be true."));
return;
}
auto compute = relu
? paddle::operators::jit::KernelFuncs<
paddle::operators::jit::VAddReluTuple<T>,
paddle::platform::CPUPlace>::Cache()
.At(N)
: paddle::operators::jit::KernelFuncs<
paddle::operators::jit::VAddTuple<T>,
paddle::platform::CPUPlace>::Cache()
.At(N);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (int i = 0; i < M; i++) {
T* dst = Y + i * N;
T* src = (padding_weights) ? Y1_data + i * (N + 4) : dst;
compute(B, src, dst, N);
}
}
template class FCFunctor<paddle::platform::CPUDeviceContext, float>;
template class FCFunctor<paddle::platform::CPUDeviceContext, double>;
template class FCFunctor<CPUContext, float>;
template class FCFunctor<CPUContext, double>;
} // namespace funcs
} // namespace phi
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
namespace paddle {
namespace operators {
namespace math {
namespace phi {
namespace funcs {
template <typename T>
struct FcTypeTraits;
......@@ -74,60 +74,76 @@ __global__ void InplaceAddReluKernel(const int N, const T* bias, T* data) {
}
}
template <typename T>
class FCFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context, const int M,
const int N, const int K, const T* X, const T* W, T* Y,
const T* B = nullptr, bool relu = false,
bool padding_weights = false) {
PADDLE_ENFORCE_EQ(
padding_weights, false,
platform::errors::PermissionDenied(
"Weight padding in fc can not be used in GPU scope."));
auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
blas.GEMM(false, false, M, N, K, static_cast<T>(1.0), X, K, W, N,
static_cast<T>(0.0), Y, N);
if (B == NULL) {
return;
}
template <typename DeviceContext, typename T>
void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
const int M,
const int N,
const int K,
const T* X,
const T* W,
T* Y,
const T* B,
bool relu,
bool padding_weights) {
PADDLE_ENFORCE_EQ(padding_weights,
false,
errors::PermissionDenied(
"Weight padding in fc can not be used in GPU scope."));
auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
blas.GEMM(false,
false,
M,
N,
K,
static_cast<T>(1.0),
X,
K,
W,
N,
static_cast<T>(0.0),
Y,
N);
if (B == NULL) {
return;
}
// M * N
if (N % 4 == 0) {
const int threads = 256;
const int num = M * N / 4;
const int blocks = (num + threads - 1) / threads;
typedef typename FcTypeTraits<T>::Type trans_type;
auto* bias_ptr_v4 = reinterpret_cast<const trans_type*>(B);
auto* data_ptr_v4 = reinterpret_cast<trans_type*>(Y);
if (relu) {
bias_relu_v4<trans_type,
true><<<blocks, threads, 0, context.stream()>>>(
num, bias_ptr_v4, data_ptr_v4, N / 4);
} else {
bias_relu_v4<trans_type,
false><<<blocks, threads, 0, context.stream()>>>(
num, bias_ptr_v4, data_ptr_v4, N / 4);
}
// M * N
if (N % 4 == 0) {
const int threads = 256;
const int num = M * N / 4;
const int blocks = (num + threads - 1) / threads;
typedef typename FcTypeTraits<T>::Type trans_type;
auto* bias_ptr_v4 = reinterpret_cast<const trans_type*>(B);
auto* data_ptr_v4 = reinterpret_cast<trans_type*>(Y);
if (relu) {
bias_relu_v4<trans_type, true><<<blocks, threads, 0, context.stream()>>>(
num, bias_ptr_v4, data_ptr_v4, N / 4);
} else {
bias_relu_v4<trans_type, false><<<blocks, threads, 0, context.stream()>>>(
num, bias_ptr_v4, data_ptr_v4, N / 4);
}
} else {
const int threads = 256;
const int blocks = M;
if (relu) {
InplaceAddReluKernel<T,
true,
threads><<<blocks, threads, 0, context.stream()>>>(
N, B, Y);
} else {
const int threads = 256;
const int blocks = M;
if (relu) {
InplaceAddReluKernel<T, true,
threads><<<blocks, threads, 0, context.stream()>>>(
N, B, Y);
} else {
InplaceAddReluKernel<T, false,
threads><<<blocks, threads, 0, context.stream()>>>(
N, B, Y);
}
InplaceAddReluKernel<T,
false,
threads><<<blocks, threads, 0, context.stream()>>>(
N, B, Y);
}
}
};
}
template class FCFunctor<paddle::platform::CUDADeviceContext, float>;
template class FCFunctor<paddle::platform::CUDADeviceContext, double>;
template class FCFunctor<platform::CUDADeviceContext, float>;
template class FCFunctor<platform::CUDADeviceContext, double>;
template class FCFunctor<GPUContext, float>;
template class FCFunctor<GPUContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -17,19 +17,23 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
namespace phi {
namespace funcs {
template <typename DeviceContext, typename T>
class FCFunctor {
public:
void operator()(const DeviceContext& context, const int M, const int N,
const int K, const T* X, const T* W, T* Y,
const T* B = nullptr, bool relu = false,
void operator()(const DeviceContext& context,
const int M,
const int N,
const int K,
const T* X,
const T* W,
T* Y,
const T* B = nullptr,
bool relu = false,
bool weight_pass = false);
};
} // namespace math
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册