diff --git a/src/fpga/api/fpga_api.cpp b/src/fpga/api/fpga_api.cpp
index 779c846d1f3c465e5113f805b2b3856a1a7894c5..1a0fb3839e753d77aa13e24b900be893e7ab52c9 100644
--- a/src/fpga/api/fpga_api.cpp
+++ b/src/fpga/api/fpga_api.cpp
@@ -36,7 +36,11 @@ static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
 
 static inline int do_ioctl(int req, const void *arg) {
+#ifdef PADDLE_MOBILE_OS_LINUX
   return ioctl(req, (unsigned int64_t)arg);
+#else
+  return -1;
+#endif
 }
 
 int open_device() {
@@ -48,8 +52,12 @@ int open_device() {
 
 // memory management;
 void *fpga_malloc(size_t size) {
+#ifdef PADDLE_MOBILE_OS_LINUX
   return reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
+#else
+  return NULL;
+#endif
 }
 
 void fpga_free(void *ptr) { munmap(ptr, 0); }
diff --git a/src/fpga/fpga_quantilization.cpp b/src/fpga/fpga_quantilization.cpp
index 8b351f1a81e0a92f0e2f12a3f61dd2a7d3948c85..e8faf792b9b3050ff2d5b82978154004c1d78bfa 100644
--- a/src/fpga/fpga_quantilization.cpp
+++ b/src/fpga/fpga_quantilization.cpp
@@ -19,15 +19,13 @@ namespace paddle_mobile {
 namespace fpga {
 
 template <typename Dtype>
-static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
-                       int height, int width) {
-  int offset_height = 0;
-
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num,
+                       int64_t channel, int64_t height, int64_t width) {
   for (int n = 0; n < num; n++) {
-    int amount_per_row = width * channel;
+    int64_t amount_per_row = width * channel;
     for (int c = 0; c < channel; c++) {
       for (int h = 0; h < height; h++) {
-        int offset_height = h * amount_per_row;
+        int64_t offset_height = h * amount_per_row;
         for (int w = 0; w < width; w++) {
           *(data_out + offset_height + w * channel + c) = *(data_in++);
         }
@@ -38,10 +36,12 @@ static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
 
 template <typename Dtype>
-static Dtype find_max(Dtype* data, int num) {
+static Dtype find_max(Dtype* data, int64_t num) {
   Dtype max = 0;
   for (int i = 0; i < num; ++i) {
-    max = std::max(max, data[i]);
+    Dtype value = data[i];
+    Dtype abs = value > 0 ? value : -value;
+    max = std::max(max, abs);
   }
   return max;
 }
 
@@ -51,40 +51,36 @@
 void quantify_filter(framework::Tensor* filter) {
   DLOG << "quantilize_filter........";
   float scale = 0;
-  float fix_range = static_cast<float>((1 << (8 - 1)) - 1);
+  auto fix_range = static_cast<float>(std::pow(2, 8 - 1) - 1);
 
-  const int batch_size = filter->dims()[0];
-  const int channel = filter->dims()[1];
-  const int height = filter->dims()[2];
-  const int width = filter->dims()[3];
+  const auto batch_size = filter->dims()[0];
+  const auto channel = filter->dims()[1];
+  const auto height = filter->dims()[2];
+  const auto width = filter->dims()[3];
 
-  int8_t* int_data = nullptr;
-  int8_t* tmp_data = new int8_t[filter->numel()];
+  auto* tmp_data = new int8_t[filter->numel()];
 
   // 32bit filter -> 8bit filter;
   if (filter->type() == typeid(float)) {
-    float* float_data = filter->data<float>();
-    float max = find_max(float_data, filter->numel());
+    auto* float_data = filter->data<float>();
+    auto max = find_max(float_data, filter->numel());
 
-    scale = (max / fix_range);
+    scale = (fix_range / max);
+    DLOG << "scale:" << scale;
 
     for (int i = 0; i < filter->numel(); ++i) {
-      tmp_data[i] = (int8_t)float_data[i] * scale;
+      tmp_data[i] = (int8_t)(float_data[i] * scale);
     }
-    int_data = filter->mutable_data<int8_t>();
   } else {
-    int8_t max = find_max(filter->data<int8_t>(), filter->numel());
-    scale = (max / fix_range);
-
-    for (int i = 0; i < filter->numel(); ++i) {
-      tmp_data[i] = filter->data<int8_t>()[i];
-    }
-    int_data = filter->mutable_data<int8_t>();
+    auto max = find_max(filter->data<int8_t>(), filter->numel());
+    scale = (fix_range / max);
+    std::memcpy(tmp_data, filter->data<int8_t>(), (size_t)filter->numel());
   }
 
   // NCHW -> NHWC;
-  chw_to_hwc(tmp_data, int_data, batch_size, channel, height, width);
+  chw_to_hwc(tmp_data, filter->mutable_data<int8_t>(), batch_size,
+             channel, height, width);
 
   delete tmp_data;
-  *(filter->fpga_args().scale_pointer()) = scale;
+  filter->SetFpgaScale(scale);
 }
 
 }  // namespace fpga
diff --git a/src/fpga/fpga_quantilization.h b/src/fpga/fpga_quantilization.h
index 4f1f6ad402a3ff4df773ecbd2121820f4c7dc265..04cb2ce7c0a6df0df2c49431e49d2c5e73d44209 100644
--- a/src/fpga/fpga_quantilization.h
+++ b/src/fpga/fpga_quantilization.h
@@ -21,10 +21,9 @@ namespace paddle_mobile {
 namespace fpga {
 
 template <typename Dtype>
-static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
-                       int height, int width);
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num,
+                       int64_t channel, int64_t height, int64_t width);
 
-// template <typename Dtype>
 void quantify_filter(framework::Tensor* filter);
 
 }  // namespace fpga
diff --git a/src/framework/tensor.h b/src/framework/tensor.h
index 797fcf5bffbe5e738fe352d1ca84602f0e5d86a0..721d4ea5e93cf305880ea124a58769f4fa99db62 100644
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -64,7 +64,8 @@ struct SizeOfTypeFunctor {
 };
 
 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor
+  SizeOfTypeFunctor
       functor;
   size_t size = functor(type);
@@ -255,14 +256,26 @@ class Tensor {
 
 #ifdef PADDLE_MOBILE_FPGA
   struct FPGAArgs {
-    float scale;
+    friend class Tensor;
 
-    inline float *scale_pointer() { return &scale; }
+    inline float *scale_pointer() { return scale_; }
+    inline float scale() { return *scale_; }
+
+   private:
+    float *scale_;
   };
 
   struct FPGAArgs fpga_args() const {
-    return fpgaArgs_;
+    FPGAArgs args;
+    args.scale_ = scale.get();
+    return args;
   }
+
+  void SetFpgaScale(float s) { *(scale.get()) = s; }
+
+ private:
+  std::shared_ptr<float> scale = std::make_shared<float>(0);
+
 #endif
 
  private:
@@ -331,10 +344,6 @@ class Tensor {
    * begins.
    */
   size_t offset_;
-
-#ifdef PADDLE_MOBILE_FPGA
-  FPGAArgs fpgaArgs_;
-#endif
 };
 
 #ifdef PADDLE_MOBILE_DEBUG
diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
index 93bbfe9c1a8ae3d9930c759ba0efcef04e5e572f..e624104acf0561470e8aac827c233d0d2d1d9f66 100644
--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
@@ -61,7 +61,7 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) {
   param->SetNewBias(new_bias);
 
   fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
index d5e79a39b79494d543e6e9485497a540a15152aa..d6fee838390f0efe38a539c3a9e8fc09d07a68d0 100644
--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
@@ -57,7 +57,7 @@ bool ConvAddBNReluKernel::Init(FusionConvAddBNReluParam *param) {
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
   fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
index 3b44506f65cc6700323c3d5f7d0765c9e52f7e0a..fd6379d8f3021d9d859d81f75aaba9ad761dd6ca 100644
--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
@@ -41,7 +41,7 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) {
   }
 
   fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp
index fd95f47a1fcb8c444172909abc67ad7f5e0de632..559b948b7b268181dcf75a4eaa40cfd9c78ef0d6 100644
--- a/src/operators/kernel/fpga/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp
@@ -56,7 +56,7 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) {
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
   fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
index fbb3ca512ea863c49ca4da3f9a133f8c91897b53..cfdc85b091017aebaf99d806e6e9104cbcbe05bd 100644
--- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
@@ -53,7 +53,7 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) {
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
   fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..95bcb28f9c30481bd234d83ab44b415d59388475
--- /dev/null
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SOFTMAX_OP
+
+#include "../softmax_kernel.h"
+#include "../central-arm-func/softmax_arm_func.h"
+#include "common/types.h"
+#include "fpga/api/fpga_api.h"
+#include "operators/math/softmax.h"
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam *param) {
+  const Tensor *input = param->InputX();
+  if (input->type() == typeid(half)) {
+    auto input_ptr = input->data<half>();
+    auto output_ptr = param->Out();
+    fpga::BypassArgs args;
+    args.convert_type = fpga::DATA_FP16_TO_FP32;
+    args.layout_type = fpga::LAYOUT_HWC_TO_CHW;
+    args.image.address = (void *)(input_ptr);
+    args.image.height = input->dims()[1];
+    args.image.width = input->dims()[2];
+    args.image.channels = input->dims()[3];
+    args.output.address = output_ptr;
+    param->SetFpgaArgs(args);
+  }
+
+  return true;
+}
+
+template <>
+void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam &param) const {
+  // SoftmaxCompute(param);
+}
+
+template class SoftmaxKernel<FPGA, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index af8d35c9ecfb217c71fc024722608d8df28b5090..c39d9657bc50c6dd708f0cd9fd5573642d417f21 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -580,6 +580,21 @@ class SoftmaxParam : public OpParam {
  private:
   Tensor *input_x_;
   Tensor *out_;
+
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  std::shared_ptr<Tensor> float_input_x_;
+  fpga::BypassArgs fpga_bypass_args;
+
+ public:
+  Tensor *FloatInput() {
+    return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
+  }
+  void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
+  const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
+  void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
+#endif
 };
 
 #endif
diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp
index c9edfccf4ff08e5a12d735526c3d63c689711357..e85edc69c3291c794f2eeb8119b91b2926c4d870 100644
--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -34,6 +34,7 @@ REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
 REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp);
 #endif
 
 #endif
diff --git a/src/operators/softmax_op.h b/src/operators/softmax_op.h
index f645d7edf7a3b9f7a92cf286feec58e960a5e3b7..bacae23b522daf1cc689a2d7af6b14cd2bc794bb 100644
--- a/src/operators/softmax_op.h
+++ b/src/operators/softmax_op.h
@@ -55,6 +55,7 @@ USE_OP_CPU(softmax);
 USE_OP_MALI_GPU(softmax);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(softmax);
 #endif
 
 #endif
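
For reference, the filter quantization that quantify_filter performs in this patch boils down to a symmetric int8 scheme plus an NCHW -> NHWC reorder: the scale stored on the tensor via SetFpgaScale is 127 / max|w|, each weight is truncated to (int8_t)(w * scale), and the conv kernels then read the quantized filter directly. The sketch below is illustrative only; it is not part of the patch or of the paddle-mobile API, and the names find_abs_max and quantize_nchw_to_nhwc are hypothetical.

// Standalone sketch of symmetric int8 filter quantization with an
// NCHW -> NHWC reorder, mirroring quantify_filter above (illustrative names).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Largest absolute value in the filter (same role as find_max above).
static float find_abs_max(const float* data, int64_t num) {
  float max = 0;
  for (int64_t i = 0; i < num; ++i) {
    max = std::max(max, std::fabs(data[i]));
  }
  return max;
}

// scale = 127 / max|w|, q = (int8_t)(w * scale), then reorder so that the
// channel becomes the innermost (fastest-varying) dimension.
static std::vector<int8_t> quantize_nchw_to_nhwc(const std::vector<float>& w,
                                                 int64_t n, int64_t c,
                                                 int64_t h, int64_t wd,
                                                 float* scale_out) {
  const float fix_range = 127.0f;  // (1 << (8 - 1)) - 1
  const float scale = fix_range / find_abs_max(w.data(), (int64_t)w.size());
  *scale_out = scale;

  std::vector<int8_t> out(w.size());
  for (int64_t in = 0; in < n; ++in)
    for (int64_t ic = 0; ic < c; ++ic)
      for (int64_t ih = 0; ih < h; ++ih)
        for (int64_t iw = 0; iw < wd; ++iw) {
          const int64_t src = ((in * c + ic) * h + ih) * wd + iw;  // NCHW
          const int64_t dst = ((in * h + ih) * wd + iw) * c + ic;  // NHWC
          out[dst] = static_cast<int8_t>(w[src] * scale);
        }
  return out;
}

int main() {
  std::vector<float> filter = {0.5f, -1.0f, 0.25f, 0.75f};  // 1x1x2x2 toy filter
  float scale = 0;
  auto q = quantize_nchw_to_nhwc(filter, 1, 1, 2, 2, &scale);
  std::printf("scale=%f q=[%d %d %d %d]\n", scale, q[0], q[1], q[2], q[3]);
  return 0;
}

Dequantization is the inverse, w roughly equals q / scale, which is why the patch flips the stored value from max / fix_range to fix_range / max and keeps it per tensor through SetFpgaScale.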