diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp
index 477d241e93fc05a7197f84b495f0faf0b3badbef..0cb872366938ee37de1c6ec4c362152949710151 100644
--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -56,11 +56,17 @@ void *fpga_malloc(size_t size) {
   return reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
 #else
-  return NULL;
+  return malloc(size);
 #endif
 }
 
-void fpga_free(void *ptr) { munmap(ptr, 0); }
+void fpga_free(void *ptr) {
+#ifdef PADDLE_MOBILE_OS_LINUX
+  munmap(ptr, 0);
+#else
+  free(ptr);
+#endif
+}
 
 void fpga_copy(void *dest, const void *src, size_t num) {
   memcpy(dest, src, num);
diff --git a/src/fpga/quantization.cpp b/src/fpga/quantization.cpp
index 560a1aa0059cbaffe36dd570e3f2f38ab8943379..44994d4c353490b533110d0965fb63b4fb5c7aa2 100644
--- a/src/fpga/quantization.cpp
+++ b/src/fpga/quantization.cpp
@@ -48,16 +48,11 @@ static Dtype find_max(Dtype* data, int64_t num) {
 // template <typename Dtype>
 void quantize_filter(framework::Tensor* filter) {
-  DLOG << "quantilize_filter........";
+  DLOG << "quantize_filter........" << filter->dims();
   float scale = 0;
   auto fix_range = static_cast<float>(std::pow(2, 8 - 1) - 1);
 
-  const auto batch_size = filter->dims()[0];
-  const auto channel = filter->dims()[1];
-  const auto height = filter->dims()[2];
-  const auto width = filter->dims()[3];
-
   auto* tmp_data = new int8_t[filter->numel()];
 
   // 32bit filter -> 8bit filter;
@@ -76,9 +71,19 @@ void quantize_filter(framework::Tensor* filter) {
     scale = (fix_range / max);
     std::memcpy(tmp_data, filter->data<int8_t>(), (size_t)filter->numel());
   }
-  // NCHW -> NHWC;
-  chw_to_hwc<int8_t>(tmp_data, filter->mutable_data<int8_t>(), batch_size,
-                     channel, height, width);
+
+  if (filter->dims().size() == 4) {
+    const auto batch_size = filter->dims()[0];
+    const auto channel = filter->dims()[1];
+    const auto height = filter->dims()[2];
+    const auto width = filter->dims()[3];
+    chw_to_hwc<int8_t>(tmp_data, filter->mutable_data<int8_t>(), batch_size,
+                       channel, height, width);
+  } else if (filter->dims().size() == 2) {
+    std::memcpy(filter->mutable_data<int8_t>(), tmp_data,
+                (size_t)filter->numel());
+  }
+
   delete tmp_data;
   filter->SetFpgaScale(scale);
 }
diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp
index 09bff80af723161dfaf31d58f3ec24528ef1ccc4..2bd4c0ac6ba3c7b066cc7ad2439ab6bebb7c3cd9 100644
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -26,7 +26,7 @@ namespace paddle_mobile {
 namespace memory {
 const int MALLOC_ALIGN = 64;
 
-#ifdef PADDLE_MOBILE_FPGA__VV
+#ifdef PADDLE_MOBILE_FPGA
 namespace fpga = paddle_mobile::fpga;
 
 void Copy(void *dst, const void *src, size_t num) {
diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp
index 6424de8afe705e13534a3452fc04890c0f750b9f..d58ab0f751eeb584f286a0920d08e9473be38402 100644
--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -32,9 +32,9 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam *param) {
     args.convert_type = fpga::DATA_FP16_TO_FP32;
     args.layout_type = fpga::LAYOUT_HWC_TO_CHW;
     args.image.address = (void *)(input_ptr);
-    args.image.height = input->dims()[1];
-    args.image.width = input->dims()[2];
-    args.image.channels = input->dims()[3];
+    args.image.height = input->dims()[0];
+    args.image.width = input->dims()[1];
+    args.image.channels = 1;
     args.output.address = output_ptr;
     param->SetFpgaArgs(args);
   }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a144e553236e300ad24501420183df01dd15aad5..dab8bcc977054f90a2ec82899b9ab64c426d1fb6 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -31,6 +31,9 @@ elseif("FPGAnets" IN_LIST NET)
     # target_link_libraries(test-resnet paddle-mobile)
     ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-tensor-quant paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-concat-op fpga/test_concat_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-fpga-concat-op paddle-mobile)
 elseif("mobilenetssd" IN_LIST NET)
     # gen test
     ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
diff --git a/test/fpga/test_concat_op.cpp b/test/fpga/test_concat_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5d1a5828b36b3d9ed371a271af6db82657ff1596
--- /dev/null
+++ b/test/fpga/test_concat_op.cpp
@@ -0,0 +1,87 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+#include "operators/concat_op.h"
+
+int main() {
+  paddle_mobile::Loader<paddle_mobile::FPGA> loader;
+  auto program = loader.Load(g_googlenet);
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");
+
+  Executor4Test<paddle_mobile::FPGA,
+                paddle_mobile::operators::ConcatOp<paddle_mobile::FPGA, float>>
+      executor(program, "concat");
+
+  // 1. input_tensors;
+  vector<Tensor> input_tensors;
+
+  Tensor input1;
+  auto input1_data = CreateInput<float>(&input1, {4, 10, 2, 2}, 0, 1);
+  input_tensors.push_back(input1);
+  Tensor input2;
+  auto input2_data = CreateInput<float>(&input2, {4, 20, 2, 2}, 0, 1);
+  input_tensors.push_back(input2);
+  Tensor input3;
+  auto input3_data = CreateInput<float>(&input3, {4, 30, 2, 2}, 0, 1);
+  input_tensors.push_back(input3);
+  Tensor input4;
+  auto input4_data = CreateInput<float>(&input4, {4, 40, 2, 2}, 0, 1);
+  input_tensors.push_back(input4);
+  // 2. input_names
+  vector<string> input_names({
+      "conv2d_3.tmp_1",
+      "conv2d_5.tmp_1",
+      "conv2d_7.tmp_1",
+      "conv2d_8.tmp_1",
+  });
+
+  // 3. output_names
+  vector<string> output_names({"concat_0.tmp_0"});
+
+  // 4. out_dims;
+  vector<DDim> out_ddims;
+  auto out_ddim = paddle_mobile::framework::make_ddim({4, 100, 2, 2});
+  out_ddims.push_back(out_ddim);
+
+  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
+                                            output_names, out_ddims);
+
+  auto output0_data = output[0]->data<float>();
+
+  // 5. test one example.
+  int input_n = 1;
+  int input_c = 2;
+  int input_h = 0;
+  int input_w = 1;
+  int stride0 = input3.numel() / input3.dims()[0];
+  int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1];
+  int stride2 = input3.dims()[3];
+  /// inputx1 (4,10,2,2),
+  /// inputx2 (4,20,2,2),
+  /// inputx3 (4,30,2,2),
+  /// inputx4 (4,40,2,2),
+  /// axis = 1
+  /// output (4,100,2,2)
+  int input_index =
+      input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w;
+  int output_index = input_n * 100 * 2 * 2 +
+                     (input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 +
+                     input_h * 2 + input_w;
+
+  DLOG << " input3 [1,2,0,1] = " << input3_data[input_index];
+  DLOG << " output [1,32,0,1] = " << output0_data[output_index];
+  return 0;
+}
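Two standalone sketches follow for reference; neither is part of the patch above. First, the scale that quantize_filter records through SetFpgaScale is ordinary symmetric int8 quantization: fix_range = 2^(8-1) - 1 = 127 and scale = fix_range / max|w|, where max|w| is what find_max returns; the per-element float-to-int8 conversion itself sits in the elided middle of the hunk. A minimal self-contained model of that arithmetic (plain C++, no paddle-mobile dependencies; the round-to-nearest step is an assumption, not read from the patch):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Toy 32-bit filter weights standing in for filter->data<float>().
  const float weights[] = {0.5f, -1.25f, 2.0f, -0.75f};

  // find_max equivalent: largest absolute value in the filter.
  float max = 0;
  for (float w : weights) max = std::fmax(max, std::fabs(w));

  // Same expression as the patch: 2^(8-1) - 1 = 127.
  const auto fix_range = static_cast<float>(std::pow(2, 8 - 1) - 1);
  const float scale = fix_range / max;  // what SetFpgaScale(scale) records

  for (float w : weights) {
    // Quantize (rounding assumed here) and show the dequantized value.
    auto q = static_cast<int8_t>(std::round(w * scale));
    std::printf("%+.2f -> %+4d (dequantized %+.4f)\n", w, q, q / scale);
  }
  return 0;
}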
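Second, the index arithmetic in step 5 of the new test: element (n, c, h, w) of a contiguous NCHW tensor with shape (N, C, H, W) sits at flat offset ((n*C + c)*H + h)*W + w, and concatenation along axis 1 shifts input3's channel index by the channel counts of input1 and input2, i.e. 10 + 20. The sketch below (plain C++; nchw_offset is an illustrative helper, not a paddle-mobile API) reproduces the test's input_index and output_index values:

#include <cassert>
#include <cstdio>

// Flat offset of element (n, c, h, w) in a contiguous NCHW tensor
// with shape (N, C, H, W); N itself does not enter the formula.
static int nchw_offset(int C, int H, int W, int n, int c, int h, int w) {
  return ((n * C + c) * H + h) * W + w;
}

int main() {
  // Element (1, 2, 0, 1) of input3, shape (4, 30, 2, 2).
  int input_index = nchw_offset(30, 2, 2, 1, 2, 0, 1);  // 129

  // Output shape (4, 100, 2, 2); input3's channel 2 lands at 10 + 20 + 2 = 32.
  int output_index = nchw_offset(100, 2, 2, 1, 32, 0, 1);  // 529

  // Matches the test's stride formulation:
  // stride0 = 30*2*2 = 120, stride1 = 2*2 = 4, stride2 = 2.
  assert(input_index == 1 * 120 + 2 * 4 + 0 * 2 + 1);
  assert(output_index == 1 * 100 * 2 * 2 + 32 * 2 * 2 + 0 * 2 + 1);

  std::printf("input_index=%d output_index=%d\n", input_index, output_index);
  return 0;
}

Both offsets address the same logical element, which is why the two DLOG lines at the end of the test are expected to print the same value.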