diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index a5ec5e8a333fb6a9ecfc04695a4155213db9e810..7c66f932df3df9793f116c8e62fea704e346b146 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -59,6 +59,7 @@ template void OperatorBase::Run() const { RunImpl(); #ifdef PADDLE_MOBILE_DEBUG + DLOG << "-------------" << type_ << "----------------------------"; vector input_keys = GetInputKeys(); for (const auto key : input_keys) { Tensor *input = GetVarValue(key, inputs_, *scope_); diff --git a/src/io/executor.cpp b/src/io/executor.cpp index 47936921a6984c61cc02c222461346081b5bccdf..562ba92adbfc5862c67b33bf5dab323f33768480 100644 --- a/src/io/executor.cpp +++ b/src/io/executor.cpp @@ -73,6 +73,7 @@ Executor::Executor(const framework::Program p, int batch_size, #ifdef PADDLE_EXECUTOR_MULTITHREAD depManager.resize(blocks.size()); #endif + DLOG << "executor in loddable mode: " << loddable_; for (int i = 0; i < blocks.size(); ++i) { std::shared_ptr block_desc = blocks[i]; std::vector> ops = block_desc->Ops(); @@ -82,7 +83,6 @@ Executor::Executor(const framework::Program p, int batch_size, auto op_base = framework::OpRegistry::CreateOp( op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), program_.scope); - DLOG << "executer in loaddable mode: " << loddable_; // use pre_infershape to pre resize , but if u use an lod mode tensor u // need to resize in runtime if (!loddable_) { diff --git a/src/operators/flatten_op.cpp b/src/operators/flatten_op.cpp index 1c84cb1079564770792881695c54b08233708cda..0282414ca6ed0be743849e9d295a354144fccdb9 100644 --- a/src/operators/flatten_op.cpp +++ b/src/operators/flatten_op.cpp @@ -19,22 +19,6 @@ limitations under the License. 
*/ namespace paddle_mobile { namespace operators { -static std::vector GetOutputShape(const int axis, - const framework::DDim &in_dims) { - int64_t outer = 1, inner = 1; - for (int i = 0; i < in_dims.size(); ++i) { - if (i < axis) { - outer *= in_dims[i]; - } else { - inner *= in_dims[i]; - } - } - std::vector out_shape(2); - out_shape[0] = static_cast(outer); - out_shape[1] = static_cast(inner); - return out_shape; -} - template void FlattenOp::InferShape() const { PADDLE_MOBILE_ENFORCE(this->param_.InputX() != nullptr, diff --git a/src/operators/flatten_op.h b/src/operators/flatten_op.h index 279f2e4aa3ff8efa6617bf9ee3bde7164c8a391a..4c1f6ff8a0f2b3212750f3be4d1a6aa2bad790ee 100644 --- a/src/operators/flatten_op.h +++ b/src/operators/flatten_op.h @@ -24,7 +24,21 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { - +inline std::vector GetOutputShape(const int axis, + const framework::DDim &in_dims) { + int64_t outer = 1, inner = 1; + for (int i = 0; i < in_dims.size(); ++i) { + if (i < axis) { + outer *= in_dims[i]; + } else { + inner *= in_dims[i]; + } + } + std::vector out_shape(2); + out_shape[0] = static_cast(outer); + out_shape[1] = static_cast(inner); + return out_shape; +} using paddle_mobile::framework::Tensor; template diff --git a/src/operators/kernel/central-arm-func/bilinear_interp_arm_func.h b/src/operators/kernel/central-arm-func/bilinear_interp_arm_func.h index 3d98473ffee4bf526eacb95ef4477a1db6e5b186..3840985ab8a963eae7d9a4cf96d9a55acf38f68c 100644 --- a/src/operators/kernel/central-arm-func/bilinear_interp_arm_func.h +++ b/src/operators/kernel/central-arm-func/bilinear_interp_arm_func.h @@ -23,69 +23,66 @@ namespace operators { template void BilinearInterpCompute(const BilinearInterpParam& param) { - auto out_dims = param.Out()->dims(); - auto* input = param.InputX()->data(); - auto out_size_t = param.InputOutPutSize(); + auto out_dims = param.Out()->dims(); + auto* input = param.InputX()->data(); + auto out_size_t 
= param.InputOutPutSize(); - int out_h = param.OutH(); - int out_w = param.OutW(); - if (out_size_t != nullptr) { - auto out_size_data = out_size_t->data(); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - auto* output = param.Out()->mutable_data( - {out_dims[0], out_dims[1], out_h, out_w}); - auto batch_size = param.InputX()->dims()[0]; - auto channels = param.InputX()->dims()[1]; - auto in_h = param.InputX()->dims()[2]; - auto in_w = param.InputX()->dims()[3]; + int out_h = param.OutH(); + int out_w = param.OutW(); + if (out_size_t != nullptr) { + auto out_size_data = out_size_t->data(); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + auto* output = param.Out()->mutable_data( + {out_dims[0], out_dims[1], out_h, out_w}); + auto batch_size = param.InputX()->dims()[0]; + auto channels = param.InputX()->dims()[1]; + auto in_h = param.InputX()->dims()[2]; + auto in_w = param.InputX()->dims()[3]; - auto in_hw = in_h * in_w; - auto out_hw = out_h * out_w; - auto in_chw = channels * in_hw; - auto out_chw = channels * out_hw; + auto in_hw = in_h * in_w; + auto out_hw = out_h * out_w; + auto in_chw = channels * in_hw; + auto out_chw = channels * out_hw; - float ratio_h = - (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - float ratio_w = - (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; - if (in_h == out_h && in_w == out_w) { - memcpy(output, input, param.InputX()->numel() * sizeof(float)); - } else { - for (int k = 0; k < batch_size; ++k) { // loop for batches - for (int i = 0; i < out_h; ++i) { // loop for images - int h = ratio_h * i; - int hid = (h < in_h - 1) ? 
1 : 0; - float h1lambda = ratio_h * i - h; - float h2lambda = 1.f - h1lambda; + if (in_h == out_h && in_w == out_w) { + memcpy(output, input, param.InputX()->numel() * sizeof(float)); + } else { + for (int k = 0; k < batch_size; ++k) { // loop for batches + for (int i = 0; i < out_h; ++i) { // loop for images + int h = ratio_h * i; + int hid = (h < in_h - 1) ? 1 : 0; + float h1lambda = ratio_h * i - h; + float h2lambda = 1.f - h1lambda; - for (int j = 0; j < out_w; ++j) { - int w = ratio_w * j; - int wid = (w < in_w - 1) ? 1 : 0; - float w1lambda = ratio_w * j - w; - float w2lambda = 1.f - w1lambda; - // calculate four position for bilinear interpolation - const float* in_pos = &input[k * in_chw + h * in_w + w]; - float* out_pos = &output[k * out_chw + i * out_w + j]; + for (int j = 0; j < out_w; ++j) { + int w = ratio_w * j; + int wid = (w < in_w - 1) ? 1 : 0; + float w1lambda = ratio_w * j - w; + float w2lambda = 1.f - w1lambda; + // calculate four position for bilinear interpolation + const float* in_pos = &input[k * in_chw + h * in_w + w]; + float* out_pos = &output[k * out_chw + i * out_w + j]; - for (int c = 0; c < channels; ++c) { // loop for channels - // bilinear interpolation - out_pos[0] = static_cast( - h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) + - h1lambda * (w2lambda * in_pos[hid * in_w] + - w1lambda * in_pos[hid * in_w + wid])); - in_pos += in_hw; - out_pos += out_hw; - } - } - } + for (int c = 0; c < channels; ++c) { // loop for channels + // bilinear interpolation + out_pos[0] = static_cast( + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) + + h1lambda * (w2lambda * in_pos[hid * in_w] + + w1lambda * in_pos[hid * in_w + wid])); + in_pos += in_hw; + out_pos += out_hw; + } } + } } - - - + } } } // namespace operators diff --git a/src/operators/kernel/central-arm-func/flatten_arm_func.h b/src/operators/kernel/central-arm-func/flatten_arm_func.h index 
158844e44ed108c2d6e98d1992b90824a91bc297..8c803a20df10431dc54c00fc31fc17fcc8659d63 100644 --- a/src/operators/kernel/central-arm-func/flatten_arm_func.h +++ b/src/operators/kernel/central-arm-func/flatten_arm_func.h @@ -15,7 +15,9 @@ limitations under the License. */ #ifdef FLATTEN_OP #pragma once +#include #include +#include "operators/flatten_op.h" #include "operators/op_param.h" namespace paddle_mobile { @@ -23,9 +25,18 @@ namespace operators { template void FlattenCompute(const FlattenParam ¶m) { - param.Out()->mutable_data(); - framework::TensorCopy(*param.InputX(), param.Out()); - param.Out()->Resize(param.Out()->dims()); + const auto *input_x = param.InputX(); + const auto axis = param.Axis(); + const auto &input_x_dims = input_x->dims(); + auto *out = param.Out(); + + const auto &out_shape_v = GetOutputShape(axis, input_x_dims); + const framework::DDim &out_dim = ValidateShape(out_shape_v, input_x_dims); + + out->Resize(out_dim); + out->mutable_data(); + framework::TensorCopy(*input_x, out); + out->Resize(out_dim); } } // namespace operators diff --git a/src/operators/kernel/central-arm-func/shape_arm_func.h b/src/operators/kernel/central-arm-func/shape_arm_func.h index 895877efd27cfe3e00ad62ebd3e0584eacc47ed7..fa9154211fe24ff8e1cc4966f9684f1fbf5a3111 100644 --- a/src/operators/kernel/central-arm-func/shape_arm_func.h +++ b/src/operators/kernel/central-arm-func/shape_arm_func.h @@ -23,7 +23,7 @@ namespace operators { template void ShapeCompute(const ShapeParam& param) { - auto* in_t = param.InputX(); + auto* in_t = param.Input(); auto* out_t = param.Out(); auto out_data = out_t->mutable_data(); auto in_dims = in_t->dims(); diff --git a/src/operators/op_param.h b/src/operators/op_param.h index d6eaf4130540a4a519e7574776f12590b9d0a1df..91a3941880a8ecb4d11d4589ae860c185d5ed37a 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -2366,7 +2366,7 @@ class ShapeParam : public OpParam { input_ = InputFrom(inputs, scope); out_ = OutFrom(outputs, 
scope); } - const RType *InputX() const { return input_; } + const RType *Input() const { return input_; } RType *Out() const { return out_; } private: diff --git a/src/operators/shape_op.cpp b/src/operators/shape_op.cpp index 55fbc80f5795e303605f645d8caaa6edc577c25c..b50a9c4507bff31ee753980c93917b93a4e1f42f 100644 --- a/src/operators/shape_op.cpp +++ b/src/operators/shape_op.cpp @@ -20,11 +20,11 @@ namespace paddle_mobile { namespace operators { template void ShapeOp::InferShape() const { - PADDLE_MOBILE_ENFORCE(this->param_.InputX() != nullptr, + PADDLE_MOBILE_ENFORCE(this->param_.Input() != nullptr, "Input (Input) of get_shape op should not be null."); PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr, "Output (Out) of get_shape op should not be null."); - this->param_.Out()->Resize(this->param_.InputX()->dims()); + this->param_.Out()->Resize({this->param_.Input()->dims().size()}); } } // namespace operators diff --git a/src/operators/split_op.cpp b/src/operators/split_op.cpp index 4f33122976beb214f588f8647637166a6c4e84cd..8b7fadc1a64d1a6f7549e5875b543c871b385e6d 100644 --- a/src/operators/split_op.cpp +++ b/src/operators/split_op.cpp @@ -64,7 +64,7 @@ void SplitOp::InferShape() const { PADDLE_MOBILE_ENFORCE(outs_dims.size() == outs.size(), "length==dims.size() must be true!"); for (int j = 0; j < outs_dims.size(); ++j) { - outs[j]->Resize(outs_dims[j]); + outs[j]->Resize(outs_dims[j]); } // todo lod impl diff --git a/test/net/test_mobilenet_025_fssd.cpp b/test/net/test_mobilenet_025_fssd.cpp index c1236d5bd6e91f17d7ded9b1709d52b9c3784aa6..ed27435a51a4aefee627149eff802121045d7c8c 100644 --- a/test/net/test_mobilenet_025_fssd.cpp +++ b/test/net/test_mobilenet_025_fssd.cpp @@ -23,7 +23,7 @@ int main() { // ../../../test/models/mobilenet auto time1 = time(); if (paddle_mobile.Load(std::string(g_fluid_fssd_new) + "/model", - std::string(g_fluid_fssd_new) + "/params", false)) { + std::string(g_fluid_fssd_new) + "/params", true)) { auto time2 = time(); std::cout << 
"load cost :" << time_diff(time1, time1) << "ms" << std::endl;