diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp
index 7c784ce474bbb2588dcf78ecded740777445fc80..fd84cb8e46c974d23816c0dd4c99a545d996c409 100644
--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -25,21 +25,22 @@ namespace operators {
 template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   const Tensor *input = param->InputX();
-  if (input->type() == typeid(half)) {
-    auto input_ptr = input->data();
-    auto output_ptr = param->Out();
-    fpga::BypassArgs args;
-    args.input_layout_type = fpga::LAYOUT_HWC;
-    args.output_layout_type = fpga::LAYOUT_CHW;
-    args.input_data_type = fpga::DATA_TYPE_FP16;
-    args.output_data_type = fpga::DATA_TYPE_FP32;
-    args.image.address = (void *)(input_ptr);
-    args.image.height = (uint32_t)input->dims()[0];
-    args.image.width = (uint32_t)input->dims()[1];
-    args.image.channels = 1;
-    args.output.address = output_ptr;
-    param->SetFpgaArgs(args);
-  }
+  auto input_ptr = input->data();
+  auto output_ptr = param->Out();
+  Tensor *floatInput = new Tensor(*input);
+  fpga::BypassArgs args;
+  args.input_layout_type = fpga::LAYOUT_HWC;
+  args.output_layout_type = fpga::LAYOUT_CHW;
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.image.address = (void *)(input_ptr);
+  args.image.height = (uint32_t)input->dims()[0];
+  args.image.width = (uint32_t)input->dims()[1];
+  args.image.channels = 1;
+  args.output.address = (void *)floatInput->mutable_data<float>();
+
+  param->SetFloatInput(floatInput);
+  param->SetFpgaArgs(args);
   return true;
 }
 
@@ -48,8 +49,12 @@ void SoftmaxKernel<FPGA, float>::Compute(
     const SoftmaxParam<FPGA> &param) const {
   DLOG << "======================================= FPGA SoftMAX "
           "===============================================";
-  const Tensor *in_x = param.InputX();
+  const Tensor *in_x = param.FloatInput();
   Tensor *out = param.Out();
+  fpga::fpga_flush((void *)in_x->data<float>(), in_x->memory_size());
+  fpga::PerformBypass(param.FpgaArgs());
+  fpga::fpga_invalidate(out->data<float>(), out->memory_size());
+
   auto x_dims = in_x->dims();
   out->Resize(x_dims);
   math::SoftmaxFuntor<CPU, float>()(in_x, out);
diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp
index 6ef9fb2a8252e82014ebebc22f82066eeb324c0d..14269817ededd097c4c9ade20be5ee773c02d692 100644
--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
@@ -36,13 +36,35 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
   int N = dim_out[1];
   int K = (!trans_a) ? dim_a[1] : dim_a[0];
 
+  if (trans_a) {
+    int numel = matrix_a.numel();
+    int m = matrix_a.dims()[0];
+    int n = matrix_a.dims()[1];
+    float *tmp = (float *)(matrix_a.data<float>());
+    float *a = static_cast<float *>(
+        paddle_mobile::memory::Alloc(sizeof(float) * numel));
+    int index = 0;
+    for (int j = 0; j < n; j++) {
+      for (int i = 0; i < m; i++) {
+        a[index++] = tmp[i * n + j];
+      }
+    }
+#ifdef _OPENMP
+    Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
+              matrix_out->data<float>(), N, relu, bias);
+#else
+    Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
+          matrix_out->data<float>(), N, relu, bias);
+#endif
+  } else {
 #ifdef _OPENMP
-  Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
-            N, beta, matrix_out->data<float>(), N, relu, bias);
+    Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
+              N, beta, matrix_out->data<float>(), N, relu, bias);
 #else
-  Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
-        beta, matrix_out->data<float>(), N, relu, bias);
+    Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
+          beta, matrix_out->data<float>(), N, relu, bias);
 #endif
+  }
 }
 
 template <>
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index 1728e6a6cc778ec223c3f14c971404ba3a5cc0f7..1c5815c64236f1b67fb6ab7752d0c4caef7c2646 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -785,7 +785,7 @@ class SoftmaxParam : public OpParam {
   fpga::BypassArgs fpga_bypass_args;
 
  public:
-  RType *FloatInput() {
+  RType *FloatInput() const {
     return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
   }
   void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
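
Note on the math_function.cpp hunk: when `trans_a` is set, the patch materializes A^T in a scratch buffer (the j-outer / i-inner copy) and then calls the ordinary row-major Sgemm/Sgemm_omp path with `lda = K`. Below is a minimal, self-contained sketch of why that copy produces the right layout; `naive_sgemm` and every other name in it are stand-ins for illustration, not paddle-mobile APIs.

```cpp
// Illustration of the trans_a path: copy the m x n matrix A into a scratch
// buffer laid out as its transpose (n x m, row-major), then run a plain
// row-major GEMM with lda = K = m.
#include <cstdio>
#include <vector>

// C = alpha * A * B + beta * C, all matrices row-major (stand-in for Sgemm).
static void naive_sgemm(int M, int N, int K, float alpha, const float *A,
                        int lda, const float *B, int ldb, float beta, float *C,
                        int ldc) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) acc += A[i * lda + k] * B[k * ldb + j];
      C[i * ldc + j] = alpha * acc + beta * C[i * ldc + j];
    }
  }
}

int main() {
  const int m = 2, n = 3;         // A is m x n
  const int M = n, K = m, N = 2;  // C = A^T * B is M x N, B is K x N

  std::vector<float> A = {1, 2, 3,
                          4, 5, 6};  // 2 x 3, row-major
  std::vector<float> B = {1, 2,
                          3, 4};     // 2 x 2, row-major
  std::vector<float> C(M * N, 0.f);

  // Same j-outer / i-inner copy as in the patch: `a` ends up holding A^T
  // (n x m) in row-major order, so its leading dimension is K = m.
  std::vector<float> a(m * n);
  int index = 0;
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) a[index++] = A[i * n + j];

  naive_sgemm(M, N, K, 1.f, a.data(), K, B.data(), N, 0.f, C.data(), N);

  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) printf("%6.1f", C[i * N + j]);
    printf("\n");
  }
  // Expected: A^T * B = [[13, 18], [17, 24], [21, 30]]
  return 0;
}
```

The extra O(m*n) copy and temporary allocation (`paddle_mobile::memory::Alloc` in the patch) keep the hot Sgemm/Sgemm_omp kernels unchanged instead of teaching them a transposed-A layout.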
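Note on the op_param.h hunk: `FloatInput()` is the accessor the FPGA softmax `Compute` path now calls on a const param, hence the added `const` qualifier. It returns the FP32 staging tensor installed by `SetFloatInput()` in `Init()`, or falls back to the original input when none was set. A minimal sketch of that contract with stand-in types (`StagedInput`, `Blob`, and the `unique_ptr` member are assumptions for illustration; in the patch the real types are SoftmaxParam and Tensor):

```cpp
#include <memory>

struct Blob {};  // stand-in for Tensor

class StagedInput {  // stand-in for the SoftmaxParam members touched above
 public:
  explicit StagedInput(Blob *raw_input) : input_x_(raw_input) {}

  // Return the FP32 staging blob if one was attached (FPGA path),
  // otherwise fall back to the original input (CPU path).
  Blob *FloatInput() const {
    return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
  }

  // Take ownership of the heap-allocated staging blob, so the
  // `new Tensor(*input)` done in Init() is released together with the param.
  void SetFloatInput(Blob *input) { float_input_x_.reset(input); }

 private:
  Blob *input_x_ = nullptr;
  std::unique_ptr<Blob> float_input_x_;
};
```

In the patch, `Init()` calls `SetFloatInput(new Tensor(*input))` and points the bypass output at that tensor, so `Compute()` can read the converted FP32 data through `FloatInput()` even though the op's own input stays FP16.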