diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp
index 01c610ce5b445bc603da3c0dc43ad21c35d95ae6..ca3c80b14748c0b7fe0493f71b2cbdcdfd6f19bb 100644
--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -26,8 +26,14 @@ limitations under the License. */
 #include <cmath>
 #include <cstdio>
 #include <cstring>
+#include <vector>
 
 #include "api.h"
+#include "bias_scale.h"
+#include "common/enforce.h"
+#include "common/types.h"
+#include "filter.h"
+#include "image.h"
 
 #define FPGA_TEST_MODE
 #ifdef FPGA_TEST_MODE
@@ -164,5 +170,59 @@ int PerformBypass(const struct BypassArgs &args) {
   return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
 }
 
+void format_image(framework::Tensor *image_tensor) {  // reformat via a copy
+  auto shape = image_tensor->dims();
+  int c = shape[1], h = shape[2], w = shape[3];
+  auto src = image_tensor->mutable_data<float>();
+  size_t byte_count = c * h * w * sizeof(float);
+  float *staging = (float *)fpga_malloc(byte_count);
+  fpga_copy(staging, src, byte_count);
+  image::format_image(&staging, c, h, w);  // may replace the staging buffer
+  image_tensor->reset_data_ptr(staging);   // tensor now points at the result
+}
+
+void format_ofm(framework::Tensor *ofm_tensor) {  // allocates only, no copy
+  auto shape = ofm_tensor->dims();
+  int c = shape[1], rows = shape[2], w = shape[3];
+  int padded_row = align_to_x(c * w, IMAGE_ALIGNMENT);  // elements per row
+  size_t byte_count = rows * padded_row * sizeof(half);
+  void *buffer = fpga_malloc(byte_count);
+  ofm_tensor->reset_data_ptr(buffer);
+}
+
+void format_filter(framework::Tensor *filter_tensor, int group_num) {
+  auto shape = filter_tensor->dims();
+  int n = shape[0], c = shape[1], h = shape[2], w = shape[3];
+  auto src = filter_tensor->mutable_data<float>();
+  int element_count = n * c * h * w;
+  size_t byte_count = element_count * sizeof(float);
+  float *staging = (float *)fpga_malloc(byte_count);
+  fpga_copy(staging, src, byte_count);
+  float peak = filter::find_max(staging, element_count);  // largest |value|
+  filter::format_filter(&staging, n, c, h, w, group_num, peak);
+  filter_tensor->reset_data_ptr(staging);
+}
+
+// Treats an FC weight as dims[1] filters of dims[0] values and formats it.
+void format_fc_matrix(framework::Tensor *filter_tensor, int group_num,
+                      int height, int width) {
+  auto dims = filter_tensor->dims();
+  PADDLE_MOBILE_ENFORCE(dims[0] % (height * width) == 0,
+                        "dims[0] should be divisible by height * width");
+  int num = dims[1], channel = dims[0] / height / width;
+  auto data_ptr = filter_tensor->mutable_data<float>();
+  size_t memory_size = num * channel * height * width * sizeof(float);
+  float *new_data = (float *)fpga_malloc(memory_size);
+  fpga_copy(new_data, data_ptr, memory_size);
+  float max = filter::find_max(new_data, num * channel * height * width);
+  filter::format_filter(&new_data, num, channel, height, width, group_num, max);
+  filter_tensor->reset_data_ptr(new_data);
+}
+
+void format_bias_scale_array(float **bias_scale_array,
+                             int element_num_per_division, int num) {
+  bias_scale::format_bias_scale_array(bias_scale_array,  // forwards unchanged
+                                      element_num_per_division, num);
+}
+
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/api.h b/src/fpga/api.h
index 73db2bfbdac50c123bb0204ff63d4412a2dadfe7..968e5db356823a8951d8c2de9031e25597a7e998 100644
--- a/src/fpga/api.h
+++ b/src/fpga/api.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cstddef>
 #include <iostream>
 #include <limits>
+#include "framework/tensor.h"
 
 // memory management;
 
@@ -175,6 +176,13 @@ int ComputeFpgaPool(const struct PoolingArgs& args);
 int ComputeFpgaEWAdd(const struct EWAddArgs& args);
 
 static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
+void format_image(framework::Tensor* image_tensor);
+void format_ofm(framework::Tensor* ofm_tensor);  // only allocate memory
+void format_filter(framework::Tensor* filter_tensor, int group_num);
+void format_fc_matrix(framework::Tensor* filter_tensor, int group_num,
+                      int height = 1, int width = 1);
+void format_bias_scale_array(float** bias_scale_array,
+                             int element_num_per_division, int num);
 
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/bias_scale.cpp b/src/fpga/bias_scale.cpp
index 51c4a0ac73869c974332f37c7bae8186c28e63c3..a1b0c8577b9100f69f823a39e9e136c46b7e09ff 100644
--- a/src/fpga/bias_scale.cpp
+++ b/src/fpga/bias_scale.cpp
@@ -21,6 +21,7 @@ namespace fpga {
 namespace bias_scale {
 
 void align_element(float **data_in, int num_per_div_before_alignment, int num) {
+  int copynum = 0;
   float *ptr_unaligned = *data_in;
   int div_num =
       (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
@@ -33,8 +34,20 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
   memset(ptr_aligned, 0, num_element * sizeof(float));
 
   for (int i = 0; i < div_num; i++) {
-    memcpy(ptr_aligned + i * num_per_div_after_alignment, ptr_unaligned,
-           num_per_div_before_alignment * sizeof(float));
+    if (i == div_num - 1) {
+      // The last division may be partial: it holds only what is left after
+      // the preceding divisions each took num_per_div_before_alignment.
+      copynum = num - num_per_div_before_alignment * (div_num - 1);
+    } else {
+      copynum = num_per_div_before_alignment;
+    }
+    // Copy this division from both num-element halves of the input buffer.
+    memcpy(ptr_aligned + i * num_per_div_after_alignment,
+           ptr_unaligned + num_per_div_before_alignment * i,
+           copynum * sizeof(float));
+    memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
+           ptr_unaligned + num_per_div_before_alignment * i + num,
+           copynum * sizeof(float));
   }
 
   fpga_free(ptr_unaligned);
@@ -52,14 +65,22 @@ void interleave(float **data_in, int num_after_alignment) {
     memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
            4 * sizeof(float));
     memcpy(ptr_interleaved + 8 * i + 4,
-           ptr_uninterleaved + num_after_alignment * sizeof(float) + 4 * i,
-           4 * sizeof(float));
+           ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float));
   }
 
   fpga_free(ptr_uninterleaved);
   *data_in = ptr_interleaved;
 }
 
+void format_bias_scale_array(float **bias_scale_array,
+                             int element_num_per_division, int num) {
+  const int per_div = element_num_per_division;
+  align_element(bias_scale_array, per_div, num);  // pads every division
+  const int division_count = (num + per_div - 1) / per_div;  // ceiling
+  const int padded_per_div = align_to_x(per_div, BS_NUM_ALIGNMENT);
+  interleave(bias_scale_array, division_count * padded_per_div);
+}
+
 }  // namespace bias_scale
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/bias_scale.h b/src/fpga/bias_scale.h
index ff56d4bcae374424b1fc1eabf59f4f0015256c7c..2d1e44c5470dae02fde6956a3744edc2e371a87b 100644
--- a/src/fpga/bias_scale.h
+++ b/src/fpga/bias_scale.h
@@ -22,6 +22,8 @@ namespace bias_scale {
 
 void align_element(float** data_in, int num_per_div_before_alignment, int num);
 void interleave(float** data_in, int num_after_alignment);
+void format_bias_scale_array(float** bias_scale_array,
+                             int element_num_per_division, int num);
 
 }  // namespace bias_scale
 }  // namespace fpga
diff --git a/src/fpga/filter.cpp b/src/fpga/filter.cpp
index c71d75ca9b4383287e36e783ce8d3bc6760752cd..c37d07d40e7789ed1f7012abe556bd5bc3e04f28 100644
--- a/src/fpga/filter.cpp
+++ b/src/fpga/filter.cpp
@@ -19,21 +19,190 @@ namespace paddle_mobile {
 namespace fpga {
 namespace filter {
 
-void convert_to_hwc(float** data_in, int num, int channel, int height,
-                    int width) {}
+int calc_division_capacity(int chw) { return 32 * (2048 / ((chw + 15) / 16)); }
 
-float find_max(float* data_in, int num) { return 0; }
+int calc_split_num(int num, int division_capacity) {
+  return (division_capacity - 1 + num) / division_capacity;  // rounds up
+}
 
-void quantize(float* data_in, int num) {}
+int calc_division_number(int num, int group_num, int division_capacity) {
+  PADDLE_MOBILE_ENFORCE(num % group_num == 0,
+                        "Filter number should be divisible by group number");
+  int split_num = calc_split_num(num, division_capacity);  // rounded up
+  PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
+                        "Split number or group number should be 1");
+  return group_num * split_num;  // the check above requires one factor be 1
+}
 
-void align_element(float** data_in, int num, int chw) {}
+// Number of filters placed in each division (before alignment padding).
+int calc_num_per_div(int num, int group_num, int division_capacity) {
+  // More than one group: spread the filters evenly across the groups,
+  // rounding up.
+  if (group_num != 1) {
+    return (num + group_num - 1) / group_num;
+  }
+  // Single group: a division takes every filter, capped at its capacity.
+  const bool over_capacity = num > division_capacity;
+  return over_capacity ? division_capacity : num;
+}
 
-void align_num(float** data_in, int num_per_div_before_alignment, int num,
-               int chw) {}
+void convert_to_hwc(char **data_in, int num, int channel, int height,
+                    int width) {
+  char *chw_data = *data_in;  // source, read sequentially in N,C,H,W order
+  const char *reader = chw_data;
+  int filter_size = channel * height * width;
+  char *hwc_data = (char *)fpga_malloc(filter_size * num * sizeof(char));
+  int64_t row_stride = width * channel;  // elements in one output row
+  for (int n = 0; n < num; n++) {
+    char *filter_out = hwc_data + n * filter_size;
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        for (int w = 0; w < width; w++) {
+          filter_out[h * row_stride + w * channel + c] = *reader++;
+        }
+      }
+    }
+  }
 
-void reorder(float** data_in, int num_after_alignment, int chw) {}
+  *data_in = hwc_data;
+  fpga_free(chw_data);
+}
 
-void interleave(float** data_in, int num_after_alignment, int chw) {}
+float find_max(float *data_in, int data_size) {
+  float largest = 0.0;  // also the result for an empty buffer
+  for (int idx = 0; idx < data_size; ++idx) {
+    const float magnitude = data_in[idx] > 0 ? data_in[idx] : -data_in[idx];
+    if (largest < magnitude) largest = magnitude;
+  }
+  // largest now holds the biggest absolute value seen in data_in.
+  return largest;
+}
+
+void quantize(float **data_in, int data_size, float max) {  // floats -> int8
+  float *tmp = *data_in;
+  float fix_range = 127;
+  // An all-zero input has max == 0; avoid 0 * inf = NaN by using scale 0.
+  float scale = (max != 0) ? fix_range / max : 0;
+  char *tmp_data = (char *)fpga_malloc(data_size * sizeof(char));
+  for (int i = 0; i < data_size; i++) {
+    tmp_data[i] = (char)((*data_in)[i] * scale);
+  }
+  *data_in = (float *)tmp_data;  // buffer now holds data_size char values
+  fpga_free(tmp);
+}
+
+void align_element(char **data_in, int num, int chw) {
+  const int padded_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  if (padded_chw == chw) {
+    return;  // chw is already aligned, nothing to pad
+  }
+  printf("align %d \n", padded_chw);
+  // Each filter gets a zero-filled slot of padded_chw bytes.
+  char *packed = *data_in;
+  const int padded_bytes = num * padded_chw;
+  char *padded = (char *)fpga_malloc(padded_bytes * sizeof(char));
+  memset(padded, 0, padded_bytes);
+  for (int f = 0; f < num; f++) {
+    memcpy(padded + f * padded_chw, packed + f * chw, chw);
+  }
+  *data_in = padded;
+  fpga_free(packed);
+}
+
+void align_num(char **data_in, int num_per_div_before_alignment, int num,
+               int chw) {  // pads each division to FILTER_NUM_ALIGNMENT
+  const int before = num_per_div_before_alignment;
+  const int after = align_to_x(before, FILTER_NUM_ALIGNMENT);
+  if (after == before) {
+    return;  // divisions already hold an aligned number of filters
+  }
+  int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  char *tmp = *data_in;
+  int div_num = (num + before - 1) / before;
+  int num_element = div_num * after * align_chw;
+  char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char));
+  memset(data_tmp, 0, num_element * sizeof(char));
+
+  for (int i = 0; i < div_num; i++) {
+    // The input holds num filters (see format_filter), so the last division
+    // may be short; copying a full division would read past the buffer.
+    int left = num - before * i;
+    int copy_num = left < before ? left : before;
+    memcpy(data_tmp + after * align_chw * i, *data_in + before * align_chw * i,
+           copy_num * align_chw);
+  }
+  *data_in = data_tmp;
+  fpga_free(tmp);
+}
+
+void reorder(char **data_in, int num_after_alignment, int chw) {
+  const int stride = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  char *source = *data_in;
+  char *shuffled =
+      (char *)fpga_malloc(stride * num_after_alignment * sizeof(char));
+  // Filters are permuted inside blocks of 32: output slot `dst` takes the
+  // filter found at the source slot computed below.
+  for (int dst = 0; dst < num_after_alignment; dst++) {
+    const int block_base = dst / 32 * 32;
+    const int in_half = dst % 16;          // position within a 16-filter half
+    const int half_select = dst / 16 % 2;  // which half of the block
+    const int src =
+        block_base + (in_half / 4 * 8) + (in_half % 4) + (half_select * 4);
+    memcpy(shuffled + dst * stride, source + src * stride, stride);
+  }
+  *data_in = shuffled;
+  fpga_free(source);
+}
+
+// Interleaves each adjacent pair of filters in 16-byte chunks and replaces
+// *data_in with the result; the previous buffer is released.
+// num_after_alignment is presumably even (pairs are read unconditionally).
+void interleave(char **data_in, int num_after_alignment, int chw) {
+  const int chunk = 16;  // bytes taken from one filter at a time
+  const int stride = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  char *mixed =
+      (char *)fpga_malloc(stride * num_after_alignment * sizeof(char));
+  char *source = *data_in;
+  const int slots_per_pair = stride * 2 / chunk;
+  // Filters are handled in adjacent pairs: the pair's output region
+  // alternates one chunk from the even filter, one from the odd filter.
+  for (int pair = 0; pair < num_after_alignment; pair += 2) {
+    char *out = mixed + pair * stride;
+    const char *even = source + pair * stride;
+    const char *odd = source + (pair + 1) * stride;
+    for (int slot = 0, k = 0; slot < slots_per_pair; slot += 2, k++) {
+      memcpy(out + chunk * slot, even + chunk * k, chunk);
+      memcpy(out + chunk * (slot + 1), odd + chunk * k, chunk);
+    }
+  }
+  *data_in = mixed;
+  fpga_free(source);
+}
+
+// Converts float filter data into the quantized, padded and reordered byte
+// layout produced by the steps below; *data_in is replaced along the way.
+void format_filter(float **data_in, int num, int channel, int height, int width,
+                   int group_num, float max) {
+  const int chw = channel * height * width;
+  const int element_count = chw * num;
+
+  // Work out how the filters are divided and padded.
+  const int capacity = calc_division_capacity(chw);
+  const int per_div = calc_num_per_div(num, group_num, capacity);
+  const int per_div_padded = align_to_x(per_div, FILTER_NUM_ALIGNMENT);
+  const int div_count = (num + per_div - 1) / per_div;
+  const int padded_num = per_div_padded * div_count;
+
+  // After quantize() the buffer holds one char per element, hence the cast.
+  quantize(data_in, element_count, max);
+  char **bytes = (char **)data_in;
+
+  convert_to_hwc(bytes, num, channel, height, width);
+  align_element(bytes, num, chw);
+  align_num(bytes, per_div, num, chw);
+  reorder(bytes, padded_num, chw);
+  interleave(bytes, padded_num, chw);
+}
 
 }  // namespace filter
 }  // namespace fpga
diff --git a/src/fpga/filter.h b/src/fpga/filter.h
index 1936c225e10ae3e7cfdb673de14fdce791fe8d69..23e6d60ac1e82c4cde9e533f201aa2f2e46dc2c0 100644
--- a/src/fpga/filter.h
+++ b/src/fpga/filter.h
@@ -22,14 +22,17 @@ namespace fpga {
 namespace filter {
-void convert_to_hwc(float** data_in, int num, int channel, int height,
+// Every stage after quantize() works on the char buffer that quantize()
+// leaves behind, so those stages take char** rather than float**.
+void convert_to_hwc(char** data_in, int num, int channel, int height,
                     int width);
-float find_max(float* data_in, int num);
-void quantize(float* data_in, int num);
-void align_element(float** data_in, int num, int chw);
-void align_num(float** data_in, int num_per_div_before_alignment, int num,
+float find_max(float* data_in, int data_size);
+void quantize(float** data_in, int data_size, float max);
+void align_element(char** data_in, int num, int chw);
+void align_num(char** data_in, int num_per_div_before_alignment, int num,
                int chw);
-void reorder(float** data_in, int num_after_alignment, int chw);
-void interleave(float** data_in, int num_after_alignment, int chw);
-
+void reorder(char** data_in, int num_after_alignment, int chw);
+void interleave(char** data_in, int num_after_alignment, int chw);
+void format_filter(float** data_in, int num, int channel, int height, int width,
+                   int group_num, float max);
 }  // namespace filter
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/image.cpp b/src/fpga/image.cpp
index 6bbf34eae933d69d00517c723326111901444ab0..88168ee2125619ed0ae509d16e4fa81e5730d766 100644
--- a/src/fpga/image.cpp
+++ b/src/fpga/image.cpp
@@ -11,3 +11,57 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
+#include "image.h"
+#include <memory.h>
+#include "api.h"
+
+namespace paddle_mobile {
+namespace fpga {
+namespace image {
+
+void convert_to_hwc(float **data_in, int channel, int height, int width) {
+  float *chw_data = *data_in;  // source, laid out channel-major (C,H,W)
+  float *hwc_data =
+      (float *)fpga_malloc(channel * height * width * sizeof(float));
+  const int64_t row_stride = width * channel;
+  for (int c = 0; c < channel; c++) {
+    for (int h = 0; h < height; h++) {
+      float *row = hwc_data + h * row_stride;
+      for (int w = 0; w < width; w++) {
+        row[w * channel + c] = chw_data[(c * height + h) * width + w];
+      }
+    }
+  }
+  *data_in = hwc_data;
+  fpga_free(chw_data);
+}
+
+// Pads every image row from cw to a multiple of IMAGE_ALIGNMENT floats,
+// zero-filling the tail; *data_in is replaced only when padding is needed.
+void align_element_conv(float **data_in, int height, int cw) {
+  const int padded_cw = align_to_x(cw, IMAGE_ALIGNMENT);
+  if (padded_cw == cw) {
+    return;
+  }
+
+  float *packed = *data_in;
+  const size_t padded_bytes = height * padded_cw * sizeof(float);
+  float *padded = (float *)fpga_malloc(padded_bytes);
+  memset(padded, 0, padded_bytes);
+  for (int row = 0; row < height; row++) {
+    memcpy(padded + row * padded_cw, packed + row * cw, cw * sizeof(float));
+  }
+
+  *data_in = padded;
+  fpga_free(packed);
+}
+
+void format_image(float **data_in, int channel, int height, int width) {
+  convert_to_hwc(data_in, channel, height, width);  // CHW -> HWC
+  align_element_conv(data_in, height, channel * width);  // pad each row
+}
+
+}  // namespace image
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/image.h b/src/fpga/image.h
index 6bbf34eae933d69d00517c723326111901444ab0..83ba5bc4d04ce4facaf9441cebe15534bf200f91 100644
--- a/src/fpga/image.h
+++ b/src/fpga/image.h
@@ -11,3 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
+#pragma once
+#define IMAGE_ALIGNMENT 16  // Aligned to 16
+namespace paddle_mobile {
+namespace fpga {
+namespace image {
+
+void convert_to_hwc(float** data_in, int channel, int height, int width);
+void align_element_conv(float** data_in, int height, int cw);
+void format_image(float** data_in, int channel, int height, int width);
+}  // namespace image
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/framework/tensor.h b/src/framework/tensor.h
index 6fc16a01a2874f04ecea3edb89774f4deea93dd5..ea2ae9b991248144b90a44a165cb322b9c21716e 100644
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -254,30 +254,6 @@ class Tensor {
                           "Tensor's dims_ is out of bound. ");
   }
 
-#ifdef PADDLE_MOBILE_FPGA
-  struct FPGAArgs {
-    friend class Tensor;
-
-    inline float *scale_pointer() { return scale_; }
-    inline float scale() { return *scale_; }
-
-   private:
-    float *scale_;
-  };
-
-  struct FPGAArgs fpga_args() const {
-    FPGAArgs args;
-    args.scale_ = scale.get();
-    return args;
-  }
-
-  void SetFpgaScale(float s) { *(scale.get()) = s; }
-
- private:
-  std::shared_ptr<float> scale = std::make_shared<float>(0);
-
-#endif
-
  private:
   /**
    * @note    Placeholder hides type T, so it doesn't appear as a
@@ -313,9 +289,12 @@ class Tensor {
     virtual std::type_index type() const { return type_; }
 
     virtual void set_type(std::type_index type) { type_ = type; }
-
+#ifndef PADDLE_MOBILE_FPGA
     /*! the pointer of memory block. */
     std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t>> ptr_;
+#else
+    std::shared_ptr<uint8_t> ptr_;
+#endif
 
     /*! the size of memory block. */
     size_t size_;
@@ -344,6 +323,34 @@ class Tensor {
    * begins.
    */
   size_t offset_;
+#ifdef PADDLE_MOBILE_FPGA
+ public:
+  inline void reset_data_ptr(void *p) {  // holder's ptr_ takes ownership of p
+    ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p);
+  }  // NOTE(review): reset(p) installs the default deleter; confirm it fits p
+
+  struct FPGAArgs {  // non-owning view of the tensor's scale value
+    friend class Tensor;
+
+    inline float *scale_pointer() { return scale_; }
+    inline float scale() { return *scale_; }
+
+   private:
+    float *scale_;  // set by Tensor::fpga_args(); points into Tensor::scale
+  };
+
+  struct FPGAArgs fpga_args() const {
+    FPGAArgs args;
+    args.scale_ = scale.get();  // raw pointer; valid while `scale` is alive
+    return args;
+  }
+
+  void SetFpgaScale(float s) { *(scale.get()) = s; }  // overwrite in place
+
+ private:
+  std::shared_ptr<float> scale = std::make_shared<float>(0);
+
+#endif
 };
 
 #ifdef PADDLE_MOBILE_DEBUG