diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc
index ac5a0681f1bbf8417ccb154d9bc82a353a4acb83..6f4b548155ca91ab01a6426cca6ba92ce4f9340e 100644
--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -190,6 +190,9 @@ void PaddleMobilePredictor<Device, T>::FetchPaddleTensors(PaddleTensor *output,
   } else if (tensor_ptr.get()->type() == type_id<float>().hash_code()) {
     data_addr = tensor_ptr.get()->data<float>();
     data_sizeof = sizeof(float);
+  } else if (tensor_ptr.get()->type() == type_id<int8_t>().hash_code()) {
+    data_addr = tensor_ptr.get()->data<int8_t>();
+    data_sizeof = sizeof(int8_t);
   } else {
     PADDLE_MOBILE_ENFORCE(0, "output typeid is not supported");
   }
diff --git a/src/operators/kernel/fpga/V2/fetch_kernel.cpp b/src/operators/kernel/fpga/V2/fetch_kernel.cpp
index 790c8dbd53bfa728d26b28422a776517ebd167e2..c6b8f9e85247865fd344bc86a365cdd26d3f5ec0 100644
--- a/src/operators/kernel/fpga/V2/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fetch_kernel.cpp
@@ -73,7 +73,7 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
   int unalignedCW = outC * outW;
   int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
   if (input->type() == type_id<float>()) {
-    if (unalignedCW == alignedCW) {
+    if ((output->dims().size() != 4) || (unalignedCW == alignedCW)) {
       output->ShareDataWith(*input);
     } else {
       auto input_address = input->data<float>();
@@ -90,7 +90,7 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
   fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(int8_t));
   if (input->fpga_data_num < num_th) {
     for (int idx = 0; idx < product(input->dims()); ++idx) {
-      outdata_ptr[idx] = input_address[idx] * Si;
+      outdata_ptr[idx] = input_address[idx] / 127.0 * Si;
     }
     fpga::fpga_flush(outdata_ptr, product(input->dims()) * sizeof(float));
     return;
@@ -101,14 +101,14 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
     auto aligned_ptr = aligned_out->data<float>();
     fpga::fpga_invalidate(aligned_ptr, (input->fpga_data_num) * sizeof(float));
     for (int idx = 0; idx < input->fpga_data_num; ++idx) {
-      aligned_ptr[idx] = input_address[idx] * Si;
+      aligned_ptr[idx] = input_address[idx] / 127.0 * Si;
     }
     dealign(aligned_ptr, outdata_ptr, outC, outH, outW);
     fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
     return;
   }
   for (int idx = 0; idx < input->fpga_data_num; ++idx) {
-    outdata_ptr[idx] = input_address[idx] * Si;
+    outdata_ptr[idx] = input_address[idx] / 127.0 * Si;
   }
   fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
 }
diff --git a/src/operators/kernel/fpga/V2/softmax_kernel.cpp b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
index 07bf3bb807eb3d23300f77403847fb0e0e4ff3aa..4f75e0f30b2e9f57d94941e972d012016b55251e 100755
--- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
@@ -27,8 +27,6 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
 
   auto out = param->Out();
   out->Resize(framework::make_ddim(dims));
-  out->mutable_data<int8_t>(framework::make_ddim(dims));
-  fpga::format_ofm(out);
 
   PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
                         "Softmax should have 4-order input");
@@ -44,6 +42,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
     auto input_ptr = input->data<int8_t>();
     float Si = input->scale[0];
     int16_t slope = fpga::fp32_2_fp16(Si / 127);
+    out->mutable_data<int8_t>(framework::make_ddim(dims));
     fpga::format_ofm(out);
     fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
     args.input_layout_type = fpga::LAYOUT_HWC;
@@ -65,17 +64,11 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
     float_input_x->Resize(input->dims());
     float_input_x->init(type_id<float>().hash_code());
     fpga::format_ofm(float_input_x.get());
-    auto float_out = param->float_out;
-    float_out = std::make_shared<Tensor>();
-    float_out->Resize(input->dims());
-    float_out->init(type_id<float>().hash_code());
-    fpga::format_ofm(float_out.get());
+    out->mutable_data<float>(framework::make_ddim(dims));
+    fpga::format_ofm(out);
   } else {
-    auto float_out = param->float_out;
-    float_out = std::make_shared<Tensor>();
-    float_out->Resize(input->dims());
-    float_out->init(type_id<float>().hash_code());
-    fpga::format_ofm(float_out.get());
+    out->mutable_data<float>(framework::make_ddim(dims));
+    fpga::format_ofm(out);
   }
 
   return true;
@@ -97,41 +90,24 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
     Tensor *out = param.Out();
     out->Resize(
         {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
-    auto out_data = out->data<int8_t>();
+
     auto float_input_x = param.float_input_x_;
     auto float_input_x_data = float_input_x->data<float>();
     int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT);
     for (int i = 0; i < dataNum; i++) {
       float_input_x_data[i] = in_data[i] * Si / 127;
     }
-    auto float_out = param.float_out;
-    auto float_out_data = float_out->data<float>();
-    math::SoftmaxFuntor<CPU, float>()(float_input_x.get(), float_out.get());
-    for (int i = 0; i < dataNum; i++) {
-      float tmp_out = float_out_data[i] * 127;
-      out_data[i] = tmp_out < 0 ? (signed char)(tmp_out - 0.5)
-                                : (signed char)(tmp_out + 0.5);
-    }
-    fpga::fpga_flush(out_data, dataNum * sizeof(int8_t));
+    math::SoftmaxFuntor<CPU, float>()(float_input_x.get(), out);
+    auto out_data = out->data<float>();
+    fpga::fpga_flush(out_data, dataNum * sizeof(float));
   } else {
     Tensor *out = param.Out();
     out->Resize(
         {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
-    auto out_data = out->data<int8_t>();
-    auto float_out = param.float_out;
-    float_out = std::make_shared<Tensor>();
-    float_out->Resize(in_x->dims());
-    float_out->init(type_id<float>().hash_code());
-    fpga::format_ofm(float_out.get());
-    math::SoftmaxFuntor<CPU, float>()(in_x, float_out.get());
-    auto float_out_data = float_out->data<float>();
+    math::SoftmaxFuntor<CPU, float>()(in_x, out);
     int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT);
-    for (int i = 0; i < dataNum; i++) {
-      float tmp_out = float_out_data[i] * 127;
-      out_data[i] = tmp_out < 0 ? (signed char)(tmp_out - 0.5)
-                                : (signed char)(tmp_out + 0.5);
-    }
-    fpga::fpga_flush(out_data, dataNum * sizeof(int8_t));
+    auto out_data = out->data<float>();
+    fpga::fpga_flush(out_data, dataNum * sizeof(float));
   }
 }