diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake
index 11a803ff031706a10f282f21024915be68444546..45be0e4d3a7e1b7daff3fd226b53b06ad96fb73d 100644
--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
@@ -18,6 +18,7 @@ endif()
 
 set(ANDROID TRUE)
 add_definitions(-DLITE_WITH_LINUX)
+add_definitions(-DLITE_WITH_ANDROID)
 
 if(NOT DEFINED ANDROID_NDK)
     set(ANDROID_NDK $ENV{NDK_ROOT})
diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc
index 502b28d7b4c4e27276d9ac8880c9d46ee25191b1..1060602e12f5821a1c2f110d01a87d5fc6902704 100644
--- a/lite/api/cxx_api.cc
+++ b/lite/api/cxx_api.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "lite/api/cxx_api.h"
+#include <algorithm>
 #include <memory>
 #include <string>
 #include <utility>
@@ -52,35 +53,36 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
 }
 
 // get inputs names
-std::vector<std::string> Predictor::GetInputNames() {
-  std::vector<std::string> input_names;
-  for (auto &item : input_names_) {
-    input_names.push_back(item.second);
-  }
-  return input_names;
+const std::vector<std::string> &Predictor::GetInputNames() {
+  return input_names_;
 }
 
 // get outputnames
-std::vector<std::string> Predictor::GetOutputNames() {
-  std::vector<std::string> output_names;
-  for (auto &item : output_names_) {
-    output_names.push_back(item.second);
-  }
-  return output_names;
+const std::vector<std::string> &Predictor::GetOutputNames() {
+  return output_names_;
 }
 
 // append the names of inputs and outputs into input_names_ and output_names_
 void Predictor::PrepareFeedFetch() {
   auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
+  std::vector<cpp::OpDesc *> feeds;
+  std::vector<cpp::OpDesc *> fetchs;
   for (int i = 0; i < current_block->OpsSize(); i++) {
     auto op = current_block->GetOp<cpp::OpDesc>(i);
     if (op->Type() == "feed") {
-      int idx = op->GetAttr<int>("col");
-      input_names_[idx] = op->Output("Out").front();
-      idx2feeds_[op->Output("Out").front()] = idx;
+      feeds.push_back(op);
     } else if (op->Type() == "fetch") {
-      int idx = op->GetAttr<int>("col");
-      output_names_[idx] = op->Input("X").front();
+      fetchs.push_back(op);
     }
   }
+  input_names_.resize(feeds.size());
+  output_names_.resize(fetchs.size());
+  for (int i = 0; i < feeds.size(); i++) {
+    input_names_[feeds[i]->GetAttr<int>("col")] =
+        feeds[i]->Output("Out").front();
+  }
+  for (int i = 0; i < fetchs.size(); i++) {
+    output_names_[fetchs[i]->GetAttr<int>("col")] =
+        fetchs[i]->Input("X").front();
+  }
 }
 
 const lite::Tensor *Predictor::GetOutput(size_t offset) const {
@@ -189,16 +191,17 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
 }
 
 // get input by name
 lite::Tensor *Predictor::GetInputByName(const std::string &name) {
-  if (idx2feeds_.find(name) == idx2feeds_.end()) {
+  auto element = std::find(input_names_.begin(), input_names_.end(), name);
+  if (element == input_names_.end()) {
     LOG(ERROR) << "Model do not have input named with: [" << name
                << "], model's inputs include:";
     for (int i = 0; i < input_names_.size(); i++) {
       LOG(ERROR) << "[" << input_names_[i] << "]";
     }
-    return NULL;
+    return nullptr;
   } else {
-    int idx = idx2feeds_[name];
-    return GetInput(idx);
+    int position = std::distance(input_names_.begin(), element);
+    return GetInput(position);
   }
 }
diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h
index 3d8dc2f06aca24e23a77a0b32dc85a0959290758..7226f4767ddf91c2e8d9864e4bc7a7665845179a 100644
--- a/lite/api/cxx_api.h
+++ b/lite/api/cxx_api.h
@@ -74,8 +74,8 @@ class LITE_API Predictor {
   // get input by name.
   lite::Tensor* GetInputByName(const std::string& name);
   // get inputnames and get outputnames.
-  std::vector<std::string> GetInputNames();
-  std::vector<std::string> GetOutputNames();
+  const std::vector<std::string>& GetInputNames();
+  const std::vector<std::string>& GetOutputNames();
   void PrepareFeedFetch();
 
   // Get offset-th col of fetch results.
@@ -107,9 +107,8 @@ class LITE_API Predictor {
   const Scope* exec_scope_;
   std::unique_ptr<RuntimeProgram> program_;
   bool program_generated_{false};
-  std::map<size_t, std::string> input_names_;
-  std::map<std::string, size_t> idx2feeds_;
-  std::map<size_t, std::string> output_names_;
+  std::vector<std::string> input_names_;
+  std::vector<std::string> output_names_;
 };
 
 /*
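The rewritten PrepareFeedFetch sizes input_names_/output_names_ first and then places each name at its op's "col" attribute, so the vectors come out ordered by feed/fetch column even when the ops appear out of order in the block. A minimal standalone sketch of that two-pass idea (FeedOp and CollectInputNames are hypothetical stand-ins, not part of the patch):

    #include <string>
    #include <vector>

    // Hypothetical stand-in for the bits of cpp::OpDesc that matter here:
    // a feed op's "col" attribute and the name of its "Out" argument.
    struct FeedOp {
      int col;
      std::string out_name;
    };

    // Size the vector first, then place each name at its "col" index, so the
    // result is ordered by feed column rather than by block traversal order.
    std::vector<std::string> CollectInputNames(const std::vector<FeedOp>& feeds) {
      std::vector<std::string> names(feeds.size());
      for (const FeedOp& op : feeds) {
        names[op.col] = op.out_name;
      }
      return names;
    }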
diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc
index b4fb3828f3b9b38aa3bcefc1df05d6453d55e771..62984ea476a901828367d74874291080667df3d8 100644
--- a/lite/api/cxx_api_impl.cc
+++ b/lite/api/cxx_api_impl.cc
@@ -37,8 +37,8 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
   std::string GetVersion() const override;
 
   // get inputs names and get outputs names
-  std::vector<std::string> GetInputNames() override;
-  std::vector<std::string> GetOutputNames() override;
+  const std::vector<std::string> &GetInputNames() override;
+  const std::vector<std::string> &GetOutputNames() override;
 
   std::unique_ptr<lite_api::Tensor> GetTensor(
       const std::string &name) const override;
@@ -76,11 +76,11 @@ std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetOutput(
   return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x));
 }
 
-std::vector<std::string> CxxPaddleApiImpl::GetInputNames() {
+const std::vector<std::string> &CxxPaddleApiImpl::GetInputNames() {
   return raw_predictor_.GetInputNames();
 }
 
-std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
+const std::vector<std::string> &CxxPaddleApiImpl::GetOutputNames() {
   return raw_predictor_.GetOutputNames();
 }
diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc
index 12963285e482b2ea6c6e761f430699507d45c0c5..d28081c5152024606eb2e453aae1c7ca9eb7cd07 100644
--- a/lite/api/light_api.cc
+++ b/lite/api/light_api.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "lite/api/light_api.h"
+#include <algorithm>
 
 namespace paddle {
 namespace lite {
@@ -56,16 +57,17 @@ Tensor* LightPredictor::GetInput(size_t offset) {
 }
 
 // get input by name
 Tensor* LightPredictor::GetInputByName(const std::string& name) {
-  if (idx2feeds_.find(name) == idx2feeds_.end()) {
+  auto element = std::find(input_names_.begin(), input_names_.end(), name);
+  if (element == input_names_.end()) {
     LOG(ERROR) << "Model do not have input named with: [" << name
                << "], model's inputs include:";
     for (int i = 0; i < input_names_.size(); i++) {
       LOG(ERROR) << "[" << input_names_[i] << "]";
     }
-    return NULL;
+    return nullptr;
   } else {
-    int idx = idx2feeds_[name];
-    return GetInput(idx);
+    int position = std::distance(input_names_.begin(), element);
+    return GetInput(position);
   }
 }
 
@@ -79,35 +81,36 @@ const Tensor* LightPredictor::GetOutput(size_t offset) {
   return out_var->GetMutable<lite::Tensor>();
 }
 // get inputs names
-std::vector<std::string> LightPredictor::GetInputNames() {
-  std::vector<std::string> input_names;
-  for (auto& item : input_names_) {
-    input_names.push_back(item.second);
-  }
-  return input_names;
+const std::vector<std::string>& LightPredictor::GetInputNames() {
+  return input_names_;
 }
 // get outputnames
-std::vector<std::string> LightPredictor::GetOutputNames() {
-  std::vector<std::string> output_names;
-  for (auto& item : output_names_) {
-    output_names.push_back(item.second);
-  }
-  return output_names;
+const std::vector<std::string>& LightPredictor::GetOutputNames() {
+  return output_names_;
 }
 // append the names of inputs and outputs into input_names_ and output_names_
 void LightPredictor::PrepareFeedFetch() {
   auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
+  std::vector<cpp::OpDesc*> feeds;
+  std::vector<cpp::OpDesc*> fetchs;
   for (int i = 0; i < current_block->OpsSize(); i++) {
     auto op = current_block->GetOp<cpp::OpDesc>(i);
     if (op->Type() == "feed") {
-      int idx = op->GetAttr<int>("col");
-      input_names_[idx] = op->Output("Out").front();
-      idx2feeds_[op->Output("Out").front()] = idx;
+      feeds.push_back(op);
     } else if (op->Type() == "fetch") {
-      int idx = op->GetAttr<int>("col");
-      output_names_[idx] = op->Input("X").front();
+      fetchs.push_back(op);
     }
   }
+  input_names_.resize(feeds.size());
+  output_names_.resize(fetchs.size());
+  for (int i = 0; i < feeds.size(); i++) {
+    input_names_[feeds[i]->GetAttr<int>("col")] =
+        feeds[i]->Output("Out").front();
+  }
+  for (int i = 0; i < fetchs.size(); i++) {
+    output_names_[fetchs[i]->GetAttr<int>("col")] =
+        fetchs[i]->Input("X").front();
+  }
 }
 
 void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
diff --git a/lite/api/light_api.h b/lite/api/light_api.h
index 0705e0aba42373dec9f1387573024c5b3bb98bbc..9d69cce441f86e563ad3ed0501514ab1fe79d98e 100644
--- a/lite/api/light_api.h
+++ b/lite/api/light_api.h
@@ -64,8 +64,8 @@ class LITE_API LightPredictor {
   }
 
   // get inputnames and get outputnames.
-  std::vector<std::string> GetInputNames();
-  std::vector<std::string> GetOutputNames();
+  const std::vector<std::string>& GetInputNames();
+  const std::vector<std::string>& GetOutputNames();
   void PrepareFeedFetch();
 
  private:
@@ -82,9 +82,8 @@ class LITE_API LightPredictor {
   std::shared_ptr<Scope> scope_;
   std::unique_ptr<RuntimeProgram> program_;
   cpp::ProgramDesc cpp_program_desc_;
-  std::map<size_t, std::string> input_names_;
-  std::map<std::string, size_t> idx2feeds_;
-  std::map<size_t, std::string> output_names_;
+  std::vector<std::string> input_names_;
+  std::vector<std::string> output_names_;
 };
 
 }  // namespace lite
diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc
index 90e1397d8338adb1ba732fc322ae03520bcce27f..70ab8ac0c03b8dea84da5ef1d6ca9c64c4c9d102 100644
--- a/lite/api/light_api_impl.cc
+++ b/lite/api/light_api_impl.cc
@@ -32,8 +32,8 @@ class LightPredictorImpl : public PaddlePredictor {
   void Run() override;
   std::string GetVersion() const override;
 
-  std::vector<std::string> GetInputNames() override;
-  std::vector<std::string> GetOutputNames() override;
+  const std::vector<std::string>& GetInputNames() override;
+  const std::vector<std::string>& GetOutputNames() override;
 
   std::unique_ptr<Tensor> GetTensor(
       const std::string& name) const override;
@@ -78,11 +78,11 @@ std::unique_ptr<Tensor> LightPredictorImpl::GetInputByName(
       new Tensor(raw_predictor_->GetInputByName(name)));
 }
 
-std::vector<std::string> LightPredictorImpl::GetInputNames() {
+const std::vector<std::string>& LightPredictorImpl::GetInputNames() {
   return raw_predictor_->GetInputNames();
 }
 
-std::vector<std::string> LightPredictorImpl::GetOutputNames() {
+const std::vector<std::string>& LightPredictorImpl::GetOutputNames() {
   return raw_predictor_->GetOutputNames();
 }
diff --git a/lite/api/light_api_test.cc b/lite/api/light_api_test.cc
index 418d97e9e8814b5e6e90a76cbdb6e92677c9c726..d2bbc295ad4b68e7849d5d25f34e0b5117fc846d 100644
--- a/lite/api/light_api_test.cc
+++ b/lite/api/light_api_test.cc
@@ -36,12 +36,14 @@ TEST(LightAPI, load) {
     data[i] = i;
   }
 
-  std::vector<std::string> inputs = predictor.GetInputNames();
+  predictor.PrepareFeedFetch();
+  const std::vector<std::string>& inputs = predictor.GetInputNames();
+  LOG(INFO) << "input size: " << inputs.size();
   for (int i = 0; i < inputs.size(); i++) {
     LOG(INFO) << "inputnames: " << inputs[i];
   }
-  std::vector<std::string> outputs = predictor.GetOutputNames();
+  const std::vector<std::string>& outputs = predictor.GetOutputNames();
   for (int i = 0; i < outputs.size(); i++) {
     LOG(INFO) << "outputnames: " << outputs[i];
   }
diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h
index 545ae03f6725de7649b3278835bda973ade2755e..d7e3c014b0fe37a5f1da4210972349ac4124ed6b 100644
--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
@@ -75,9 +75,9 @@ class LITE_API PaddlePredictor {
   virtual std::string GetVersion() const = 0;
 
   // Get input names
-  virtual std::vector<std::string> GetInputNames() = 0;
+  virtual const std::vector<std::string>& GetInputNames() = 0;
   // Get output names
-  virtual std::vector<std::string> GetOutputNames() = 0;
+  virtual const std::vector<std::string>& GetOutputNames() = 0;
   // Get Input by name
   virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0;
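Since GetInputNames()/GetOutputNames() now return const references owned by the predictor, call sites should bind the result by reference rather than by value. A sketch of the intended pattern (DumpInputNames is a hypothetical helper, not part of the patch):

    #include <cstdio>
    #include <string>
    #include <vector>
    #include "lite/api/paddle_api.h"

    // Bind the name list by const reference: the vector is owned by the
    // predictor and stays valid for the predictor's lifetime. Writing
    // `std::vector<std::string> names = ...` would still compile, but
    // silently reintroduces the copy this patch removes.
    void DumpInputNames(paddle::lite_api::PaddlePredictor* predictor) {
      const std::vector<std::string>& names = predictor->GetInputNames();
      for (size_t i = 0; i < names.size(); ++i) {
        std::printf("input #%zu: %s\n", i, names[i].c_str());
      }
    }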
diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc
index 63142d49814473e6dc9ee6e553d95fa86b4058c5..443a05d9927cfa461a306ce6c3c32ff6e5024631 100644
--- a/lite/api/paddle_api_test.cc
+++ b/lite/api/paddle_api_test.cc
@@ -37,12 +37,12 @@ TEST(CxxApi, run) {
 
   LOG(INFO) << "Version: " << predictor->GetVersion();
 
-  std::vector<std::string> inputs = predictor->GetInputNames();
+  auto& inputs = predictor->GetInputNames();
   LOG(INFO) << "input size: " << inputs.size();
   for (int i = 0; i < inputs.size(); i++) {
     LOG(INFO) << "inputnames: " << inputs[i];
   }
-  std::vector<std::string> outputs = predictor->GetOutputNames();
+  auto& outputs = predictor->GetOutputNames();
   for (int i = 0; i < outputs.size(); i++) {
     LOG(INFO) << "outputnames: " << outputs[i];
   }
@@ -76,14 +76,14 @@ TEST(LightApi, run) {
 
   auto predictor = lite_api::CreatePaddlePredictor(config);
 
-  std::vector<std::string> inputs = predictor->GetInputNames();
+  auto& inputs = predictor->GetInputNames();
   LOG(INFO) << "input size: " << inputs.size();
   for (int i = 0; i < inputs.size(); i++) {
-    LOG(INFO) << "inputnames: " << inputs[i];
+    LOG(INFO) << "inputnames: " << inputs.at(i);
   }
-  std::vector<std::string> outputs = predictor->GetOutputNames();
+  auto& outputs = predictor->GetOutputNames();
   for (int i = 0; i < outputs.size(); i++) {
-    LOG(INFO) << "outputnames: " << outputs[i];
+    LOG(INFO) << "outputnames: " << outputs.at(i);
   }
 
   LOG(INFO) << "Version: " << predictor->GetVersion();
diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc
index c150b2b1776a7978821286c3ca4e311e75c251a3..896f6c8d33a8665c4c94786dd08af1a097942608 100644
--- a/lite/core/device_info.cc
+++ b/lite/core/device_info.cc
@@ -35,6 +35,9 @@
 #include <sys/syscall.h>
 #include <unistd.h>
 #endif
+#ifdef LITE_WITH_ANDROID
+#include <sys/system_properties.h>
+#endif
 #if __APPLE__
 #include "TargetConditionals.h"
 #if LITE_WITH_IPHONE
@@ -218,6 +221,7 @@ void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
 
 #ifdef LITE_WITH_LINUX
 std::string get_cpu_name() {
+  std::string cpu_name;
   FILE* fp = fopen("/proc/cpuinfo", "rb");
   if (!fp) {
     return "";
   }
@@ -229,12 +233,23 @@ std::string get_cpu_name() {
       break;
     }
     if (strstr(line, "Hardware") != NULL) {
-      fclose(fp);
-      return std::string(line);
+      cpu_name = std::string(line);
     }
   }
+#ifdef LITE_WITH_ANDROID
+  // concatenate the board, platform and chip names onto the cpu name
+  char board_name[128];
+  char platform_name[128];
+  char chip_name[128];
+  __system_property_get("ro.product.board", board_name);
+  __system_property_get("ro.board.platform", platform_name);
+  __system_property_get("ro.chipname", chip_name);
+  cpu_name =
+      cpu_name + "_" + board_name + "_" + platform_name + "_" + chip_name;
+#endif
+  std::transform(cpu_name.begin(), cpu_name.end(), cpu_name.begin(), ::toupper);
   fclose(fp);
-  return "";
+  return cpu_name;
 }
 
 int get_min_freq_khz(int cpuid) {
@@ -780,7 +795,9 @@ bool DeviceInfo::SetCPUInfoByName() {
     cluster_ids_ = {0, 0, 0, 0};
     SetArchInfo(1, kA53);
     return true;
-  } else if (dev_name_.find("KIRIN980") != std::string::npos) {  // Kirin 980
+  } else if (dev_name_.find("KIRIN980") != std::string::npos ||
+             dev_name_.find("KIRIN990") !=
+                 std::string::npos) {  // Kirin 980, Kirin 990
     core_num_ = 8;
     core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
     big_core_ids_ = {4, 5, 6, 7};
@@ -1109,7 +1126,8 @@ void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
 }
 
 bool DeviceInfo::ExtendWorkspace(size_t size) {
-  workspace_.Resize({size + llc_size()});
+  workspace_.Resize(
+      {static_cast<int64_t>(size + static_cast<size_t>(llc_size()))});
   return workspace_.mutable_data<int8_t>() != nullptr;
 }
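For context on the new Android branch in get_cpu_name(): __system_property_get() from <sys/system_properties.h> writes at most PROP_VALUE_MAX bytes (92, terminator included) and returns the value's length, so the 128-byte buffers above are comfortably large. A minimal standalone sketch (GetProp is a hypothetical helper):

    #include <sys/system_properties.h>  // Android NDK only
    #include <string>

    // Read one Android system property, as get_cpu_name() now does for
    // ro.product.board / ro.board.platform / ro.chipname; returns "" when
    // the key is unset.
    std::string GetProp(const char* key) {
      char value[PROP_VALUE_MAX];  // 92 bytes is the documented upper bound
      int len = __system_property_get(key, value);
      return len > 0 ? std::string(value, len) : std::string();
    }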
diff --git a/lite/core/types.cc b/lite/core/types.cc
index ec89e83e5808fb85803adea0555c76b7e424424c..4ea383333d519ac2c481dce459ca49124a64df32 100644
--- a/lite/core/types.cc
+++ b/lite/core/types.cc
@@ -82,6 +82,10 @@ Type StdTypeToRepr<double>() {
   return Type::_float64;
 }
 template <>
+Type StdTypeToRepr<std::vector<char>>() {
+  return Type::_char_list;
+}
+template <>
 Type StdTypeToRepr<std::string>() {
   return Type::_string;
 }
diff --git a/lite/core/types.h b/lite/core/types.h
index efb8a096e569e296f71737ca3d26cb77eb71e62c..8f154f9dd509d3627750ecbf301923a2296252d1 100644
--- a/lite/core/types.h
+++ b/lite/core/types.h
@@ -16,6 +16,7 @@
 
 #include <...>
 #include <string>
+#include <vector>
 #include "lite/api/paddle_place.h"
 #include "lite/utils/all.h"
@@ -36,7 +37,9 @@ enum class Type {
   _float64,
   _bool,
   _string,
-  // primary list types
+  // primary list type
+  _char_list,
+  // list types
   _list,
   // enum type
   _enum,
@@ -89,6 +92,8 @@ Type StdTypeToRepr<float>();
 template <>
 Type StdTypeToRepr<double>();
 template <>
+Type StdTypeToRepr<std::vector<char>>();
+template <>
 Type StdTypeToRepr<std::string>();
 
 // Factors that impact the kernel picking strategy. Multiple factors can be
diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc
index 98007db0d188b8a77477a5148224be71f5b00dd5..ebb96e21d5e856325b7abdb8342df2aea3d5b5c3 100644
--- a/lite/kernels/arm/conv_compute.cc
+++ b/lite/kernels/arm/conv_compute.cc
@@ -39,6 +39,13 @@ void ConvCompute::PrepareForRun() {
   int pad = param.paddings[0];
   int stride = param.strides[0];
 
+  int chin = param.x->dims()[1];
+  int hin = param.x->dims()[2];
+  int win = param.x->dims()[3];
+  int chout = param.output->dims()[1];
+  int hout = param.output->dims()[2];
+  int wout = param.output->dims()[3];
+
   bool kps_equal = (param.paddings[0] == param.paddings[1]) &&
                    (param.strides[0] == param.strides[1]) && (kw == kh);
   bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1);
@@ -54,7 +61,7 @@ void ConvCompute::PrepareForRun() {
     VLOG(3) << "invoking dw conv";
   } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal &&
              no_dilation) {
-    if (ic >= 32 && oc >= 32) {
+    if (ic >= 32 && oc >= 32 && hout > 16 && wout > 16) {
       /// winograd conv impl
       impl_ = new WinogradConv;
       VLOG(3) << "invoking winograd conv";
@@ -63,8 +70,8 @@ void ConvCompute::PrepareForRun() {
       impl_ = new DirectConv;
       VLOG(3) << "invoking direct conv";
     }
-  } else if (param.groups == 1 && kw == 3 && stride == 2 && kps_equal &&
-             no_dilation) {
+  } else if (param.groups == 1 && kw == 3 && stride == 2 &&
+             chin * chout < 4 * hin * win && kps_equal && no_dilation) {
     /// direct conv impl
     impl_ = new DirectConv;
     VLOG(3) << "invoking direct conv";
diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt
index d855ee8e36b8babc40e4820ccd2b19d0b1008d34..67f55881ce4010d1179d9b6013aa560c56dd949e 100644
--- a/lite/kernels/cuda/CMakeLists.txt
+++ b/lite/kernels/cuda/CMakeLists.txt
@@ -32,6 +32,8 @@ nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_c
 nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda)
 nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute_cuda)
 nv_test(elementwise_add_compute_cuda_test SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_cuda)
+nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda)
+nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda)
 #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda)
 nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda)
 nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda )
diff --git a/lite/kernels/cuda/pool_compute_test.cc b/lite/kernels/cuda/pool_compute_test.cc
index fafd1ef0c8d449c84c417023fbb81e8d7c3bb43f..fe6ff92c0ce943cad36fbdd4f1408e344d9fd5fd 100644
--- a/lite/kernels/cuda/pool_compute_test.cc
+++ b/lite/kernels/cuda/pool_compute_test.cc
@@ -194,9 +194,9 @@ TEST(pool_cuda, compute) {
     for (auto stride : {1, 2}) {
       for (auto pad : {0, 1}) {
         for (auto n : {1, 2}) {
-          for (auto c : {1, 3, 256}) {
-            for (auto h : {2, 3, 4, 6, 13}) {
-              for (auto w : {2, 3, 4, 6, 13}) {
+          for (auto c : {1, 3}) {
+            for (auto h : {2, 3, 4, 11}) {
+              for (auto w : {2, 3, 4, 11}) {
                 VLOG(3) << "n:" << n << " c:" << c << " h:" << h
                         << " w:" << w << " ksize:" << ksize
                         << " stride:" << stride << " pad:" << pad
diff --git a/lite/kernels/cuda/softmax_compute_test.cc b/lite/kernels/cuda/softmax_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b4d53520911a4868c73d7806fcc1bb5bf8bf33df
--- /dev/null
+++ b/lite/kernels/cuda/softmax_compute_test.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/cuda/softmax_compute.h"
+#include <gtest/gtest.h>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace cuda {
+
+using Tensor = lite::Tensor;
+using DDim = lite::DDim;
+
+template <typename dtype>
+static void softmax_compute_ref(const operators::SoftmaxParam& param) {
+  const dtype* x_data = param.x->mutable_data<dtype>();
+  dtype* output_data = param.output->mutable_data<dtype>();
+  DDim x_dims = param.x->dims();
+  ASSERT_EQ(x_dims.data(), param.output->dims().data());
+  auto x_rank = x_dims.size();
+  int axis = param.axis;
+  if (axis < 0) {
+    axis += x_rank;
+  }
+  int axis_size = x_dims[axis];
+  int outer_num = x_dims.Slice(0, axis).production();
+  int inner_num = x_dims.Slice(axis + 1, x_rank).production();
+  int compute_size = outer_num * inner_num;
+  for (int i = 0; i < compute_size; i++) {
+    int idx_inner = i % inner_num;
+    int idx_outer = (i / inner_num) * axis_size;
+    int start = idx_outer * inner_num + idx_inner;
+    int offset;
+
+    offset = start;
+    dtype max_data = std::numeric_limits<dtype>::lowest();
+    for (int j = 0; j < axis_size; j++) {
+      max_data = x_data[offset] > max_data ? x_data[offset] : max_data;
+      offset += inner_num;
+    }
+
+    offset = start;
+    dtype sum_data = (dtype)0;
+    for (int j = 0; j < axis_size; j++) {
+      output_data[offset] = exp(x_data[offset] - max_data);
+      sum_data += output_data[offset];
+      offset += inner_num;
+    }
+
+    offset = start;
+    for (int j = 0; j < axis_size; j++) {
+      output_data[offset] /= sum_data;
+      offset += inner_num;
+    }
+  }
+}
+
+TEST(softmax_cuda, compute) {
+  std::unique_ptr<KernelContext> ctx(new KernelContext);
+  auto& context = ctx->As<CUDAContext>();
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+  context.SetExecStream(stream);
+
+  SoftmaxCompute softmax;
+  operators::SoftmaxParam param;
+  softmax.SetContext(std::move(ctx));
+  lite::Tensor x;
+  lite::Tensor x_cpu;
+  lite::Tensor output;
+  lite::Tensor output_cpu;
+  lite::Tensor output_ref;
+  for (auto n : {1, 3}) {
+    for (auto c : {1, 4}) {
+      for (auto h : {5, 1, 112}) {
+        for (auto w : {1, 6, 112}) {
+          for (auto axis : {-2, -1, 0, 1, 2}) {
+            x.Resize({n, c, h, w});
+            x_cpu.Resize({n, c, h, w});
+            output.Resize({n, c, h, w});
+            output_cpu.Resize({n, c, h, w});
+            output_ref.Resize({n, c, h, w});
+            auto* x_cpu_data = x_cpu.mutable_data<float>();
+            auto* output_data = output.mutable_data<float>(TARGET(kCUDA));
+            auto* output_cpu_data = output_cpu.mutable_data<float>();
+            auto* output_ref_data = output_ref.mutable_data<float>();
+            for (int i = 0; i < x.dims().production(); i++) {
+              x_cpu_data[i] = i;
+            }
+            x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data,
+                                                       x_cpu.dims());
+            param.x = &x;
+            param.axis = axis;
+            param.output = &output;
+            softmax.SetParam(param);
+            softmax.Launch();
+            param.x = &x_cpu;
+            param.output = &output_ref;
+            softmax_compute_ref<float>(param);
+            cudaDeviceSynchronize();
+            CopySync<TARGET(kCUDA)>(output_cpu_data,
+                                    output_data,
+                                    sizeof(float) * output.numel(),
+                                    IoDirection::DtoH);
+            for (int i = 0; i < output.dims().production(); i++) {
+              EXPECT_NEAR(output_cpu_data[i], output_ref_data[i], 1e-5);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/cuda/yolo_box_compute_test.cc b/lite/kernels/cuda/yolo_box_compute_test.cc
index 26b890c9f127a49fff42f102e7bbdab3fffb042a..994251b249e7dc6d8ae8870937c34cfa0323fd22 100644
--- a/lite/kernels/cuda/yolo_box_compute_test.cc
+++ b/lite/kernels/cuda/yolo_box_compute_test.cc
@@ -89,7 +89,7 @@ inline static void calc_label_score(float* scores,
 
 template <typename T>
 static void YoloBoxRef(const T* input,
-                       const T* imgsize,
+                       const int* imgsize,
                        T* boxes,
                        T* scores,
                        const float conf_thresh,
@@ -106,8 +106,8 @@ static void YoloBoxRef(const T* input,
   float box[4];
 
   for (int i = 0; i < n; i++) {
-    int img_height = static_cast<int>(imgsize[2 * i]);
-    int img_width = static_cast<int>(imgsize[2 * i + 1]);
+    int img_height = imgsize[2 * i];
+    int img_width = imgsize[2 * i + 1];
 
     for (int j = 0; j < an_num; j++) {
       for (int k = 0; k < h; k++) {
@@ -184,12 +184,12 @@ TEST(yolo_box, normal) {
   auto* scores_data = scores.mutable_data<float>(TARGET(kCUDA));
 
   float* x_cpu_data = x_cpu.mutable_data<float>();
-  float* sz_cpu_data = sz_cpu.mutable_data<float>();
+  int* sz_cpu_data = sz_cpu.mutable_data<int>();
   float* boxes_cpu_data = boxes_cpu.mutable_data<float>();
   float* scores_cpu_data = scores_cpu.mutable_data<float>();
 
   float* x_ref_data = x_ref.mutable_data<float>();
-  float* sz_ref_data = sz_ref.mutable_data<float>();
+  int* sz_ref_data = sz_ref.mutable_data<int>();
   float* boxes_ref_data = boxes_ref.mutable_data<float>();
   float* scores_ref_data = scores_ref.mutable_data<float>();
 
@@ -203,7 +203,7 @@ TEST(yolo_box, normal) {
   sz_ref_data[1] = 32;
 
   x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
-  sz.Assign<float, lite::DDim, TARGET(kCUDA)>(sz_cpu_data, sz_cpu.dims());
+  sz.Assign<int, lite::DDim, TARGET(kCUDA)>(sz_cpu_data, sz_cpu.dims());
 
   param.X = &x;
   param.ImgSize = &sz;
diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc
index fd82350ddd27e9ae4386a02428c6cc097ab2fe1d..13b6cb5b77d00a2a5f733a0015dec4dbebc088b7 100644
--- a/lite/model_parser/model_parser.cc
+++ b/lite/model_parser/model_parser.cc
@@ -727,10 +727,8 @@ void LoadModelNaiveFromMemory(const std::string &model_buffer,
 
   // Load model
-  std::string prog_path = model_buffer;
-
   naive_buffer::BinaryTable table;
-  table.LoadFromMemory(prog_path.c_str(), prog_path.length());
+  table.LoadFromMemory(model_buffer.c_str(), model_buffer.length());
 
   naive_buffer::proto::ProgramDesc nb_proto_prog(&table);
   nb_proto_prog.Load();
@@ -742,8 +740,7 @@ void LoadModelNaiveFromMemory(const std::string &model_buffer,
   // Load Params
   // NOTE: Only main block be used now.
   // only combined Params are supported in Loading Model from memory
-  std::string combined_params_path = param_buffer;
-  LoadCombinedParamsNaive(combined_params_path, scope, *cpp_prog, true);
+  LoadCombinedParamsNaive(param_buffer, scope, *cpp_prog, true);
 
   VLOG(4) << "Load model from naive buffer memory successfully";
 }
diff --git a/lite/model_parser/naive_buffer/naive_buffer.h b/lite/model_parser/naive_buffer/naive_buffer.h
index e2e2f7fb1ea3cb5b226bf09bd16074f51e171c75..717dd3c5a6b0c48d6a1f2ae0d7dba9f08a6d99f3 100644
--- a/lite/model_parser/naive_buffer/naive_buffer.h
+++ b/lite/model_parser/naive_buffer/naive_buffer.h
@@ -126,6 +126,41 @@ using UInt64Builder = PrimaryBuilder<uint64_t>;
 using Float32Builder = PrimaryBuilder<float>;
 using Float64Builder = PrimaryBuilder<double>;
 
+template <typename Primary>
+class PrimaryListBuilder : public FieldBuilder {
+  std::vector<Primary> data_;
+
+ public:
+  using value_type = Primary;
+
+  explicit PrimaryListBuilder(BinaryTable* table) : FieldBuilder(table) {}
+  PrimaryListBuilder(BinaryTable* table, const std::vector<Primary>& val)
+      : FieldBuilder(table), data_(val) {}
+
+  /// Set data.
+  void set(const std::vector<Primary>& x) { data_ = x; }
+
+  const std::vector<Primary>& data() const { return data_; }
+
+  /// Save information to the corresponding BinaryTable.
+  void Save() override;
+
+  /// Load information from the corresponding BinaryTable.
+  void Load() override;
+
+  /// Number of elements.
+  size_t size() const { return data_.size(); }
+
+  Type type() const override {
+    return core::StdTypeToRepr<std::vector<Primary>>();
+  }
+
+  /// clear builder
+  void Clear() { data_.clear(); }
+
+  ~PrimaryListBuilder() = default;
+};
+
 /*
  * Builder for all the primary types. int32, float, bool and so on.
  */
@@ -344,6 +379,36 @@ void PrimaryBuilder<Primary>::Load() {
   table()->Consume(sizeof(value_type));
 }
 
+template <typename Primary>
+void PrimaryListBuilder<Primary>::Load() {
+  CHECK(data_.empty()) << "Duplicate load";
+  // Load number of elements first.
+  uint64_t num_elems{};
+  memcpy(&num_elems, table()->cursor(), sizeof(uint64_t));
+  table()->Consume(sizeof(uint64_t));
+
+  data_.resize(num_elems);
+  for (uint64_t i = 0; i < num_elems; i++) {
+    memcpy(&data_[i], table()->cursor(), sizeof(value_type));
+    table()->Consume(sizeof(value_type));
+  }
+}
+
+template <typename Primary>
+void PrimaryListBuilder<Primary>::Save() {
+  // store number of elements in the head.
+  uint64_t num_elems = size();
+  table()->Require(sizeof(uint64_t));
+  memcpy(table()->cursor(), &num_elems, sizeof(uint64_t));
+  table()->Consume(sizeof(uint64_t));
+
+  table()->Require(num_elems * sizeof(value_type));
+  memcpy(table()->cursor(),
+         reinterpret_cast<const char*>(&data_[0]),
+         num_elems * sizeof(value_type));
+  table()->Consume(num_elems * sizeof(value_type));
+}
+
 template <typename EnumType>
 void EnumBuilder<EnumType>::Save() {
   value_type holder = static_cast<value_type>(data_);
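PrimaryListBuilder replaces the old per-element ListBuilder<CharBuilder> encoding with one fixed-size count header followed by the raw elements. A sketch of reading that layout back from a bare byte buffer, assuming the format shown in Save() above (DecodePrimaryList is a hypothetical helper, not part of the patch):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Decode the layout written by PrimaryListBuilder<T>::Save():
    // an 8-byte host-order element count followed by `count` raw T values.
    template <typename T>
    std::vector<T> DecodePrimaryList(const char* buf) {
      uint64_t count = 0;
      std::memcpy(&count, buf, sizeof(uint64_t));  // count header
      std::vector<T> out(count);
      std::memcpy(out.data(), buf + sizeof(uint64_t), count * sizeof(T));
      return out;
    }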
diff --git a/lite/model_parser/naive_buffer/param_desc.cc b/lite/model_parser/naive_buffer/param_desc.cc
index d7e2b4caec062988e8f20486402a05f7b7c27143..4397b3c413e8a09d2e5e5b41b8f9222bcfab4e20 100644
--- a/lite/model_parser/naive_buffer/param_desc.cc
+++ b/lite/model_parser/naive_buffer/param_desc.cc
@@ -149,15 +149,16 @@ void ParamDesc::SetDim(const std::vector<int64_t>& dim) {
     CHECK(GetDataType() == VarDescAPI::VarDataType::type__)                 \
         << "Data Type mismatch";                                            \
     std::vector<T> res;                                                     \
-    auto& data_builder = desc_->GetField<ListBuilder<CharBuilder>>("data"); \
-    auto data = RepeatedToVector<char, CharBuilder>(data_builder);          \
+    auto& data_builder = desc_->GetField<PrimaryListBuilder<char>>("data"); \
+    auto& data = data_builder.data();                                       \
     size_t size = data.size() / sizeof(T);                                  \
-    auto* data_ptr = reinterpret_cast<T*>(&data[0]);                        \
+    auto* data_ptr = reinterpret_cast<const T*>(&data[0]);                  \
    for (size_t i = 0; i < size; ++i) {                                     \
       res.push_back(data_ptr[i]);                                           \
     }                                                                       \
     return res;                                                             \
   }
+
 GET_DATA_IMPL(uint8_t, UINT8);
 GET_DATA_IMPL(int8_t, INT8);
 GET_DATA_IMPL(int16_t, INT16);
@@ -172,14 +173,13 @@ GET_DATA_IMPL(double, FP64);
     CHECK(GetDataType() == VarDescAPI::VarDataType::type__)                 \
         << "Data Type mismatch, call SetDataType first.";                   \
     auto* data_builder =                                                    \
-        desc_->GetMutableField<ListBuilder<CharBuilder>>("data");           \
+        desc_->GetMutableField<PrimaryListBuilder<char>>("data");           \
     CHECK(data_builder);                                                    \
     data_builder->Clear();                                                  \
     size_t size = size__ * sizeof(T);                                       \
     auto* data_ptr = reinterpret_cast<const char*>(data_ptr__);             \
-    for (size_t i = 0; i < size; ++i) {                                     \
-      data_builder->New()->set(data_ptr[i]);                                \
-    }
+    std::vector<char> data_vec(data_ptr, data_ptr + size);                  \
+    data_builder->set(data_vec);
 
 #define SET_DATA_IMPL(T, type__)                                            \
   template <>                                                               \
diff --git a/lite/model_parser/naive_buffer/proto/framework.nb.h b/lite/model_parser/naive_buffer/proto/framework.nb.h
index f495a12b460c57e2464a76409d69778f4e2754a8..2427e49d2690811ded0a19d7a7bd6dec1ef6394a 100644
--- a/lite/model_parser/naive_buffer/proto/framework.nb.h
+++ b/lite/model_parser/naive_buffer/proto/framework.nb.h
@@ -191,7 +191,7 @@ class ParamDesc : public StructBuilder {
     New<...>("lod");
     NewUInt32("tensor_version");
     New<...>("tensor_desc");
-    New<ListBuilder<CharBuilder>>("data");
+    New<PrimaryListBuilder<char>>("data");
   }
 };
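With this change a tensor's payload lives in the naive buffer as a single char list and is reinterpreted on read, as GET_DATA_IMPL above does; e.g. a 6-element float param occupies 24 chars. A hypothetical typed view over such a buffer:

    #include <vector>

    // GET_DATA_IMPL above recovers a typed view from the raw char list:
    // a 6-element float param is stored as 24 chars and cast back on read.
    std::vector<float> AsFloats(const std::vector<char>& data) {
      const float* ptr = reinterpret_cast<const float*>(data.data());
      return std::vector<float>(ptr, ptr + data.size() / sizeof(float));
    }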
diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc
index 10dff5371a0f6840e092287d97eff98722e3b7f7..668419cf7ceae4a2e10cd447d57824f826cabd3a 100644
--- a/lite/operators/conv_op.cc
+++ b/lite/operators/conv_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "lite/operators/conv_op.h"
+#include <algorithm>
 #include <vector>
 #include "lite/core/op_registry.h"
@@ -51,10 +52,41 @@ inline int ConvOutputSize(
   return output_size;
 }
 
+inline void UpdatePaddingAndDilation(std::vector<int>* paddings,
+                                     std::vector<int>* dilations,
+                                     const std::vector<int>& strides,
+                                     const std::string padding_algorithm,
+                                     const lite::DDim data_dims,
+                                     const lite::DDim& ksize) {
+  // when padding_algorithm is "VALID" or "SAME"
+  if (padding_algorithm == "SAME") {
+    for (size_t i = 0; i < strides.size(); ++i) {
+      int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i];
+      int pad_sum =
+          std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2],
+                   (int64_t)0);
+      // pad
+      *(paddings->begin() + i) = pad_sum / 2;
+      // dilation
+      *(dilations->begin() + i) = 1;
+    }
+  } else if (padding_algorithm == "VALID") {
+    for (auto& it : *paddings) {
+      it = 0;
+    }
+  }
+}
+
 bool ConvOpLite::InferShape() const {
   const auto in_dims = param_.x->dims();
   const auto filter_dims = param_.filter->dims();
 
+  UpdatePaddingAndDilation(&param_.paddings,
+                           &param_.dilations,
+                           param_.strides,
+                           padding_algorithm_,
+                           in_dims,
+                           filter_dims);
+
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < param_.strides.size(); ++i) {
     output_shape.push_back(ConvOutputSize(in_dims[i + 2],
diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h
index ac0006c8e6f495d36991cf712c3c80dfcf7a46c9..1d6e1c93490a394723d34de76fc3ff8040d31e81 100644
--- a/lite/operators/conv_op.h
+++ b/lite/operators/conv_op.h
@@ -93,6 +93,10 @@ class ConvOpLite : public OpLite {
             << "The fused conv only supports fuse with relu and leaky relu";
       }
     }
+
+    if (op_desc.HasAttr("padding_algorithm")) {
+      padding_algorithm_ = op_desc.GetAttr<std::string>("padding_algorithm");
+    }
     // For Int8
     if (op_desc.HasAttr("enable_int8")) {
       param_.enable_int8 = op_desc.GetAttr<bool>("enable_int8");
@@ -114,6 +118,7 @@ class ConvOpLite : public OpLite {
 
  private:
   mutable ConvParam param_;
+  std::string padding_algorithm_{""};
 };
 
 }  // namespace operators
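As a worked example of the "SAME" branch in UpdatePaddingAndDilation: for input extent 7, stride 2, kernel 3, the output extent is ceil(7/2) = 4 and pad_sum = max((4-1)*2 + 3 - 7, 0) = 2, i.e. one pixel of padding on each side. A self-contained check of that arithmetic (illustrative values only):

    #include <algorithm>
    #include <cassert>

    int main() {
      // One spatial dimension with input extent 7, stride 2, kernel 3.
      int dim = 7, stride = 2, k = 3;
      int out = (dim + stride - 1) / stride;                    // ceil(7/2) == 4
      int pad_sum = std::max((out - 1) * stride + k - dim, 0);  // 6 + 3 - 7 == 2
      assert(out == 4 && pad_sum == 2 && pad_sum / 2 == 1);     // pad 1 per side
      return 0;
    }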