diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4ccf73763c08a748b53027d7f4a0f254774a1843..d7f063e4689bfc7022f087d81003d47c342135dc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,10 +1,10 @@
 cmake_minimum_required(VERSION 3.6)
 project(paddle-mobile)
 
-option(DEBUGING "enable debug mode" ON)
+option(DEBUGING "enable debug mode" OFF)
 option(USE_OPENMP "openmp support" OFF)
-option(USE_EXCEPTION "use std exception" ON)
-option(LOG_PROFILE "log profile" ON)
+option(USE_EXCEPTION "use std exception" OFF)
+option(LOG_PROFILE "log profile" OFF)
 # select the platform to build
 option(CPU "armv7 with neon" ON)
 option(MALI_GPU "mali gpu" OFF)
diff --git a/src/ios_io/PaddleMobile.mm b/src/ios_io/PaddleMobile.mm
index e3ed909394a1057302fb0f747b582b944c89cc65..9298e7907709bbbed77b3d4d76528689cae7bd93 100644
--- a/src/ios_io/PaddleMobile.mm
+++ b/src/ios_io/PaddleMobile.mm
@@ -13,6 +13,7 @@
  limitations under the License. */
 
 #import "PaddleMobile.h"
+
 #import "op_symbols.h"
 #import "io/paddle_mobile.h"
 
@@ -23,6 +24,8 @@
 {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32> *pam_;
   BOOL loaded_;
+  std::vector<float> *predict_input_;
+
 }
 @end
 
@@ -55,7 +58,7 @@ static std::mutex shared_mutex;
 - (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath{
   std::string model_path_str = std::string([modelPath UTF8String]);
   std::string weights_path_str = std::string([weighsPath UTF8String]);
-  if (loaded_ = pam_->Load(model_path_str, weights_path_str, false)) {
+  if (loaded_ = pam_->Load(model_path_str, weights_path_str, true)) {
     return YES;
   } else {
     return NO;
@@ -102,7 +105,26 @@ static std::mutex shared_mutex;
 }
 
 - (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale{
-  std::lock_guard<std::mutex> lock(shared_mutex);
+//  printf(" hi i am here");
+  if (predict_input_) {
+//    printf(" reusing cached input -- ");
+//    printf(" %d \n", predict_input_->size());
+    // NOTE: dims are hard-coded to {1, 3, 300, 300}; deriving them (and numel) from `dim` is commented out below
+    std::vector<int64_t > dim_vec = {1, 3, 300, 300};
+//    int numel = 1;
+//    for (int k = 0; k < dim.count; ++k) {
+//      int d = dim[k].intValue;
+//      numel *= d;
+//      dim_vec.push_back(d);
+//    }
+
+
+    std::vector<float> cpp_result = pam_->Predict(*predict_input_, dim_vec);
+    return nil;
+  }
+//  printf(" predict one ");
+
+//  std::lock_guard<std::mutex> lock(shared_mutex);
   if (!loaded_) {
     printf("PaddleMobile doesn't be loaded yet");
     return nil;
@@ -141,13 +163,15 @@ static std::mutex shared_mutex;
   }
 
   // input
-  std::vector<float> predict_input;
+  std::vector<float> *predict_input = new std::vector<float>();
   for (int j = 0; j < numel; ++j) {
-    predict_input.push_back(dataPointer[j]);
+    predict_input->push_back(dataPointer[j]);
   }
 
+  predict_input_ = predict_input;
+
   // predict
-  std::vector<float> cpp_result = pam_->Predict(predict_input, dim_vec);
+  std::vector<float> cpp_result = pam_->Predict(*predict_input, dim_vec);
 
   // result
   long count = 0;
@@ -157,6 +181,7 @@ static std::mutex shared_mutex;
     [result addObject:[NSNumber numberWithFloat:cpp_result[i]]];
   }
 
+
   free(output);
 
   // 待验证
diff --git a/src/ios_io/op_symbols.h b/src/ios_io/op_symbols.h
index b2825b90e67c4e20030509358f468c9c0190f727..0fe1137278d19ab4c9c9aaecf2db108e4a184993 100644
--- a/src/ios_io/op_symbols.h
+++ b/src/ios_io/op_symbols.h
@@ -25,6 +25,8 @@
 #include "operators/fetch_op.h"
 #include "operators/fusion_conv_add.h"
 #include "operators/fusion_conv_add_bn_relu_op.h"
+#include "operators/fusion_conv_bn_relu_op.h"
+#include "operators/fusion_dwconv_bn_relu_op.h"
 #include "operators/fusion_fc_op.h"
 #include "operators/im2sequence_op.h"
 #include "operators/lrn_op.h"
diff --git a/src/operators/math/pool_2x2.cpp b/src/operators/math/pool_2x2.cpp
index 76af743818edacac6dd9e1878e8d8220ccff6d73..9dc3dbafed990de2f4057d98a2accdd8ce2fd7db 100644
--- a/src/operators/math/pool_2x2.cpp
+++ b/src/operators/math/pool_2x2.cpp
@@ -66,6 +66,9 @@ void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
         }
         float *out_ptr = output_data + i * output_batch_stride +
                          c * output_channel_stride + ph / 2 * output_width;
+#if __ARM_NEON
+#if __aarch64__
+#else
         asm volatile(
             "subs       %[w1], %[w1], #1        \n\t"
             "blt        end_w1_%=               \n\t"
@@ -115,6 +118,8 @@ void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
               [in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr)
             : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
               "q9");
+#endif
+#endif
 
         if (_w2 != 0) {
           in_ptr1 += 16 * w1 + 4 * w2;
@@ -183,6 +188,9 @@ void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
         }
         float *out_ptr = output_data + i * output_batch_stride +
                          c * output_channel_stride + ph / 2 * output_width;
+#if __ARM_NEON
+#if __aarch64__
+#else
         asm volatile(
             "subs       %[w1], %[w1], #1        \n\t"
             "blt        end_w1_%=               \n\t"
@@ -238,6 +246,8 @@ void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
               [quarter] "r"(quarter)
             : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
               "q9", "q10");
+#endif
+#endif
 
         if (_w2 != 0) {
           in_ptr1 += 16 * w1 + 4 * w2;