diff --git a/src/fpga/api/fpga_api.h b/src/fpga/api/fpga_api.h
index 42e99f4e4238d6974d23c1fb33bf238ca8a8626d..3db15c6ddc1843a5b6e91b0267ddbc0e606d87f8 100644
--- a/src/fpga/api/fpga_api.h
+++ b/src/fpga/api/fpga_api.h
@@ -31,90 +31,110 @@ void* fpga_malloc(size_t size);
 void fpga_free(void* ptr);
 void fpga_copy(void* dst, const void* src, size_t num);
 
-struct FpgaVersionArgs {
-  void* buf;
-};
-
-struct MemoryToPhysicalArgs {
-  const void* src;
-  uint64_t physical;
+struct VersionArgs {  // argument payload of IOCTL_VERSION
+  void* buffer;  // NOTE(review): contents are defined by the driver — confirm
 };
 
 struct MemoryCopyArgs {
   void* src;
-  void* dst;
+  void* dest;
   size_t size;
 };
 
-struct FpgaQuantArgs {
-  float scale;
-};
-
-struct FpgaBNArgs {
-  bool enabled = false;
-  void* bias_addr;
-  void* scale_addr;
+struct BNArgs {  // embedded in ConvArgs as the bn member
+  bool enabled;  // no default initializer; callers must set this before use
+  void* bias_address;
+  void* scale_address;
 };
 
-struct FpgaKernelArgs {
+/**
+ * Kernel size and strides; embedded in both ConvArgs and PoolingArgs.
+ */
+struct KernelArgs {
   uint32_t width;
   uint32_t height;
-  uint32_t stride_h;
   uint32_t stride_w;
+  uint32_t stride_h;  // declared after stride_w; field order fixes the layout
 };
 
-struct FpgaImageArgs {
-  uint32_t width;
-  uint32_t height;
+struct ImageInputArgs {
+  void* address;  // virtual address of the input feature map
   uint32_t channels;
-  uint32_t pad_h;
-  uint32_t pad_w;
+  uint32_t width;  // feature-map width
+  uint32_t height;  // feature-map height
+  uint32_t pad_width;  // padding along the width
+  uint32_t pad_height;  // padding along the height
+};
+
+struct ImageOutputArgs {
+  void* address;         // address of the output result
+  float* scale_address;  // address of the output scale
+};
 
-struct FpgaConvArgs {
+struct ConvArgs {  // argument payload of IOCTL_CONFIG_CONV
   bool relu_enabled;
-  struct FpgaBNArgs BNargs;
-  void* image_addr;
-  void* filter_addr;
-  void* bias_addr;
-  void* output_addr;
-  float quant_scale;
-  struct FpgaImageArgs image;
+  float scale;  // input scale
+  void* bias_address;
+  void* filter_address;
   uint32_t filter_num;
   uint32_t group_num;
 
-  struct FpgaKernelArgs kernel;
+  struct BNArgs bn;
+  struct ImageInputArgs image;  // input image
+  struct ImageOutputArgs output;
+  struct KernelArgs kernel;
 };
 
-struct FpgaPoolArgs {
-  void* image_addr;
-  void* output_addr;
-  struct FpgaImageArgs image;
-  struct FpgaKernelArgs kernel;
+struct PoolingArgs {  // argument payload of IOCTL_CONFIG_POOLING
+  float scale;
+  struct ImageInputArgs image;  // input image
+  struct ImageOutputArgs output;
+  struct KernelArgs kernel;
 };
 
-struct FpgaEWAddArgs {
+// Element-wise add arguments; argument payload of IOCTL_CONFIG_EW.
+struct EWAddArgs {
   bool relu_enabled;
-  void* image0_addr;
-  void* image1_addr;
-  void* result_addr;
-  uint32_t const0;
-  uint32_t const1;
-  uint32_t data_len;  // aligned element count
+  float scale;
+
+  float const0;  // output = const0 * image0 + const1 * image1
+  float const1;
+  struct ImageInputArgs image0;
+  struct ImageInputArgs image1;
+  struct ImageOutputArgs output;
 };
 
+struct FpgaRegWriteArgs {  // argument payload of IOCTL_FPGA_REG_WRITE
+  uint64_t address;  // NOTE(review): presumably the register address — confirm
+  uint64_t value;
+};
+
+struct FpgaRegReadArgs {  // argument payload of IOCTL_FPGA_REG_READ
+  uint64_t address;
+  uint64_t value;
+};
+
+#define IOCTL_FPGA_MAGIC 'FPGA'
+// NOTE(review): 'FPGA' is a multi-character constant; its value is implementation-defined.
+#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
+#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 02, struct FpgaRegReadArgs)
+#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 03, struct FpgaRegWriteArgs)
+// NOTE(review): REG_READ above is encoded with _IOW like the write commands — confirm direction.
+#define IOCTL_SEPARATOR_0 10
+
+#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
+
+#define IOCTL_SEPARATOR_1 20
+
+#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
+#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
+#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
+
+//============================== API =============================
+// Each entry point takes the matching argument struct declared above.
-int ComputeFpgaConv(struct FpgaConvArgs args);
-int ComputeFpgaPool(struct FpgaPoolArgs args);
-int ComputeFpgaEWAdd(struct FpgaEWAddArgs args);
+int ComputeFpgaConv(struct ConvArgs args);
+int ComputeFpgaPool(struct PoolingArgs args);
+int ComputeFpgaEWAdd(struct EWAddArgs args);
 
-#define IOCTL_FPGA_MAGIC 'CNN'
-#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs)
-#define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs)
-#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaQuantArgs)
-#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
-#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
-#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs)
-#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs)
-
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/framework/tensor.h b/src/framework/tensor.h
index 954a65a3605c4d0204890d9414aeb074371b0d69..388788216fe45b66441a0390e2ef09c2d51c16dc 100644
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -253,6 +253,18 @@ class Tensor {
                           "Tensor's dims_ is out of bound. ");
   }
 
+#ifdef PADDLE_MOBILE_FPGA
+  struct FPGAArgs {  // FPGA metadata held by Tensor in its fpgaArgs_ member
+    float scale;
+    // Returns the address of the scale member above.
+    inline float *scale_pointer() { return &scale; }
+  };
+
+  // Mutable and read-only access to this tensor's FPGA arguments.
+  FPGAArgs &fpga_args() { return fpgaArgs_; }
+  const FPGAArgs &fpga_args() const { return fpgaArgs_; }
+#endif
+
  private:
   /**
    * @note    Placeholder hides type T, so it doesn't appear as a
@@ -319,6 +331,10 @@ class Tensor {
    * begins.
    */
   size_t offset_;
+
+#ifdef PADDLE_MOBILE_FPGA
+  FPGAArgs fpgaArgs_;
+#endif
 };
 
 #ifdef PADDLE_MOBILE_DEBUG
diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp
index c691988f4a388c7835a7016602d7a1ac9cb5f9b6..627a94242ca4638640a7961120b36c9f763a0e85 100644
--- a/src/operators/kernel/fpga/concat_kernel.cpp
+++ b/src/operators/kernel/fpga/concat_kernel.cpp
@@ -39,7 +39,7 @@ void ConcatKernel<FPGA, half>::Compute(const ConcatParam &param) const {
 
   for (int i = 0; i < inputs.size(); ++i) {
     auto input = inputs[i];
-    auto channels = input[3];
+    auto channels = input->dims()[3];
     out_offset += channels;
     auto src = input->data<half>();
     for (int j = 0; j < pixels; ++j) {