PaddlePaddle / Paddle-Lite — commit c74a68b7

Merge branch 'develop' into gpu

Authored Oct 21, 2019 by myq406450149; committed via GitHub on Oct 21, 2019.
Parents: 317bd91c, 305130fc

Showing 24 changed files with 377 additions and 101 deletions (+377 -101).
cmake/cross_compiling/android.cmake                    +1    -0
lite/api/cxx_api.cc                                    +24   -21
lite/api/cxx_api.h                                     +4    -5
lite/api/cxx_api_impl.cc                               +4    -4
lite/api/light_api.cc                                  +24   -21
lite/api/light_api.h                                   +4    -5
lite/api/light_api_impl.cc                             +4    -4
lite/api/light_api_test.cc                             +4    -2
lite/api/paddle_api.h                                  +2    -2
lite/api/paddle_api_test.cc                            +6    -6
lite/core/device_info.cc                               +23   -5
lite/core/types.cc                                     +4    -0
lite/core/types.h                                      +6    -1
lite/kernels/arm/conv_compute.cc                       +10   -3
lite/kernels/cuda/CMakeLists.txt                       +2    -0
lite/kernels/cuda/pool_compute_test.cc                 +3    -3
lite/kernels/cuda/softmax_compute_test.cc (new)        +134  -0
lite/kernels/cuda/yolo_box_compute_test.cc             +6    -6
lite/model_parser/model_parser.cc                      +2    -5
lite/model_parser/naive_buffer/naive_buffer.h          +65   -0
lite/model_parser/naive_buffer/param_desc.cc           +7    -7
lite/model_parser/naive_buffer/proto/framework.nb.h    +1    -1
lite/operators/conv_op.cc                              +32   -0
lite/operators/conv_op.h                               +5    -0
cmake/cross_compiling/android.cmake

@@ -18,6 +18,7 @@ endif()
 set(ANDROID TRUE)
 add_definitions(-DLITE_WITH_LINUX)
+add_definitions(-DLITE_WITH_ANDROID)
 if(NOT DEFINED ANDROID_NDK)
   set(ANDROID_NDK $ENV{NDK_ROOT})
lite/api/cxx_api.cc

@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "lite/api/cxx_api.h"
+#include <algorithm>
 #include <memory>
 #include <string>
 #include <utility>

@@ -52,35 +53,36 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
 }

 // get inputs names
-std::vector<std::string> Predictor::GetInputNames() {
-  std::vector<std::string> input_names;
-  for (auto &item : input_names_) {
-    input_names.push_back(item.second);
-  }
-  return input_names;
-}
+const std::vector<std::string> &Predictor::GetInputNames() {
+  return input_names_;
+}
+
 // get outputnames
-std::vector<std::string> Predictor::GetOutputNames() {
-  std::vector<std::string> output_names;
-  for (auto &item : output_names_) {
-    output_names.push_back(item.second);
-  }
-  return output_names;
-}
+const std::vector<std::string> &Predictor::GetOutputNames() {
+  return output_names_;
+}
+
 // append the names of inputs and outputs into input_names_ and output_names_
 void Predictor::PrepareFeedFetch() {
   auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
+  std::vector<cpp::OpDesc *> feeds;
+  std::vector<cpp::OpDesc *> fetchs;
   for (int i = 0; i < current_block->OpsSize(); i++) {
     auto op = current_block->GetOp<cpp::OpDesc>(i);
     if (op->Type() == "feed") {
-      int idx = op->GetAttr<int>("col");
-      input_names_[idx] = op->Output("Out").front();
-      idx2feeds_[op->Output("Out").front()] = idx;
+      feeds.push_back(op);
     } else if (op->Type() == "fetch") {
-      int idx = op->GetAttr<int>("col");
-      output_names_[idx] = op->Input("X").front();
+      fetchs.push_back(op);
     }
   }
+  input_names_.resize(feeds.size());
+  output_names_.resize(fetchs.size());
+  for (int i = 0; i < feeds.size(); i++) {
+    input_names_[feeds[i]->GetAttr<int>("col")] = feeds[i]->Output("Out").front();
+  }
+  for (int i = 0; i < fetchs.size(); i++) {
+    output_names_[fetchs[i]->GetAttr<int>("col")] = fetchs[i]->Input("X").front();
+  }
 }

 const lite::Tensor *Predictor::GetOutput(size_t offset) const {

@@ -189,16 +191,17 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
 }

 // get input by name
 lite::Tensor *Predictor::GetInputByName(const std::string &name) {
-  if (idx2feeds_.find(name) == idx2feeds_.end()) {
+  auto element = std::find(input_names_.begin(), input_names_.end(), name);
+  if (element == input_names_.end()) {
     LOG(ERROR) << "Model do not have input named with: [" << name
                << "], model's inputs include:";
     for (int i = 0; i < input_names_.size(); i++) {
       LOG(ERROR) << "[" << input_names_[i] << "]";
     }
-    return NULL;
+    return nullptr;
   } else {
-    int idx = idx2feeds_[name];
-    return GetInput(idx);
+    int position = std::distance(input_names_.begin(), element);
+    return GetInput(position);
   }
 }
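With idx2feeds_ gone, the lookup above is a linear scan of input_names_, and the feed column is simply the matching element's index. A minimal standalone sketch of the same std::find/std::distance pattern (the names vector and lookup key here are illustrative, not Paddle-Lite's API):

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Stand-in for Predictor::input_names_ after PrepareFeedFetch().
  std::vector<std::string> input_names = {"image", "im_info"};
  const std::string name = "im_info";

  auto element = std::find(input_names.begin(), input_names.end(), name);
  if (element == input_names.end()) {
    std::cerr << "no input named [" << name << "]\n";
    return 1;
  }
  // The distance from begin() is the feed column, i.e. the GetInput() offset.
  int position = static_cast<int>(std::distance(input_names.begin(), element));
  std::cout << "feed slot: " << position << "\n";  // prints: feed slot: 1
  return 0;
}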
lite/api/cxx_api.h

@@ -74,8 +74,8 @@ class LITE_API Predictor {
   // get input by name.
   lite::Tensor* GetInputByName(const std::string& name);
   // get inputnames and get outputnames.
-  std::vector<std::string> GetInputNames();
-  std::vector<std::string> GetOutputNames();
+  const std::vector<std::string>& GetInputNames();
+  const std::vector<std::string>& GetOutputNames();
   void PrepareFeedFetch();
   // Get offset-th col of fetch results.

@@ -107,9 +107,8 @@ class LITE_API Predictor {
   const Scope* exec_scope_;
   std::unique_ptr<RuntimeProgram> program_;
   bool program_generated_{false};
-  std::map<size_t, std::string> input_names_;
-  std::map<std::string, size_t> idx2feeds_;
-  std::map<size_t, std::string> output_names_;
+  std::vector<std::string> input_names_;
+  std::vector<std::string> output_names_;
 };

 /*
lite/api/cxx_api_impl.cc

@@ -37,8 +37,8 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
   std::string GetVersion() const override;

   // get inputs names and get outputs names
-  std::vector<std::string> GetInputNames() override;
-  std::vector<std::string> GetOutputNames() override;
+  const std::vector<std::string> &GetInputNames() override;
+  const std::vector<std::string> &GetOutputNames() override;

   std::unique_ptr<const lite_api::Tensor> GetTensor(
       const std::string &name) const override;

@@ -76,11 +76,11 @@ std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetOutput(
   return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x));
 }

-std::vector<std::string> CxxPaddleApiImpl::GetInputNames() {
+const std::vector<std::string> &CxxPaddleApiImpl::GetInputNames() {
   return raw_predictor_.GetInputNames();
 }

-std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
+const std::vector<std::string> &CxxPaddleApiImpl::GetOutputNames() {
   return raw_predictor_.GetOutputNames();
 }
lite/api/light_api.cc

@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "lite/api/light_api.h"
+#include <algorithm>

 namespace paddle {
 namespace lite {

@@ -56,16 +57,17 @@ Tensor* LightPredictor::GetInput(size_t offset) {
 // get input by name
 Tensor* LightPredictor::GetInputByName(const std::string& name) {
-  if (idx2feeds_.find(name) == idx2feeds_.end()) {
+  auto element = std::find(input_names_.begin(), input_names_.end(), name);
+  if (element == input_names_.end()) {
     LOG(ERROR) << "Model do not have input named with: [" << name
                << "], model's inputs include:";
     for (int i = 0; i < input_names_.size(); i++) {
       LOG(ERROR) << "[" << input_names_[i] << "]";
     }
-    return NULL;
+    return nullptr;
   } else {
-    int idx = idx2feeds_[name];
-    return GetInput(idx);
+    int position = std::distance(input_names_.begin(), element);
+    return GetInput(position);
   }
 }

@@ -79,35 +81,36 @@ const Tensor* LightPredictor::GetOutput(size_t offset) {
   return out_var->GetMutable<lite::Tensor>();
 }

 // get inputs names
-std::vector<std::string> LightPredictor::GetInputNames() {
-  std::vector<std::string> input_names;
-  for (auto& item : input_names_) {
-    input_names.push_back(item.second);
-  }
-  return input_names;
-}
+const std::vector<std::string>& LightPredictor::GetInputNames() {
+  return input_names_;
+}
+
 // get outputnames
-std::vector<std::string> LightPredictor::GetOutputNames() {
-  std::vector<std::string> output_names;
-  for (auto& item : output_names_) {
-    output_names.push_back(item.second);
-  }
-  return output_names;
-}
+const std::vector<std::string>& LightPredictor::GetOutputNames() {
+  return output_names_;
+}
+
 // append the names of inputs and outputs into input_names_ and output_names_
 void LightPredictor::PrepareFeedFetch() {
   auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
+  std::vector<cpp::OpDesc*> feeds;
+  std::vector<cpp::OpDesc*> fetchs;
   for (int i = 0; i < current_block->OpsSize(); i++) {
     auto op = current_block->GetOp<cpp::OpDesc>(i);
     if (op->Type() == "feed") {
-      int idx = op->GetAttr<int>("col");
-      input_names_[idx] = op->Output("Out").front();
-      idx2feeds_[op->Output("Out").front()] = idx;
+      feeds.push_back(op);
     } else if (op->Type() == "fetch") {
-      int idx = op->GetAttr<int>("col");
-      output_names_[idx] = op->Input("X").front();
+      fetchs.push_back(op);
     }
   }
+  input_names_.resize(feeds.size());
+  output_names_.resize(fetchs.size());
+  for (int i = 0; i < feeds.size(); i++) {
+    input_names_[feeds[i]->GetAttr<int>("col")] = feeds[i]->Output("Out").front();
+  }
+  for (int i = 0; i < fetchs.size(); i++) {
+    output_names_[fetchs[i]->GetAttr<int>("col")] = fetchs[i]->Input("X").front();
+  }
 }

 void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
lite/api/light_api.h

@@ -64,8 +64,8 @@ class LITE_API LightPredictor {
   }

   // get inputnames and get outputnames.
-  std::vector<std::string> GetInputNames();
-  std::vector<std::string> GetOutputNames();
+  const std::vector<std::string>& GetInputNames();
+  const std::vector<std::string>& GetOutputNames();
   void PrepareFeedFetch();

 private:

@@ -82,9 +82,8 @@ class LITE_API LightPredictor {
   std::shared_ptr<Scope> scope_;
   std::unique_ptr<RuntimeProgram> program_;
   cpp::ProgramDesc cpp_program_desc_;
-  std::map<size_t, std::string> input_names_;
-  std::map<std::string, size_t> idx2feeds_;
-  std::map<size_t, std::string> output_names_;
+  std::vector<std::string> input_names_;
+  std::vector<std::string> output_names_;
 };

 }  // namespace lite
lite/api/light_api_impl.cc

@@ -32,8 +32,8 @@ class LightPredictorImpl : public PaddlePredictor {
   void Run() override;

   std::string GetVersion() const override;
-  std::vector<std::string> GetInputNames() override;
-  std::vector<std::string> GetOutputNames() override;
+  const std::vector<std::string>& GetInputNames() override;
+  const std::vector<std::string>& GetOutputNames() override;

   std::unique_ptr<const Tensor> GetTensor(
       const std::string& name) const override;

@@ -78,11 +78,11 @@ std::unique_ptr<Tensor> LightPredictorImpl::GetInputByName(
       new Tensor(raw_predictor_->GetInputByName(name)));
 }

-std::vector<std::string> LightPredictorImpl::GetInputNames() {
+const std::vector<std::string>& LightPredictorImpl::GetInputNames() {
   return raw_predictor_->GetInputNames();
 }

-std::vector<std::string> LightPredictorImpl::GetOutputNames() {
+const std::vector<std::string>& LightPredictorImpl::GetOutputNames() {
   return raw_predictor_->GetOutputNames();
 }
lite/api/light_api_test.cc

@@ -36,12 +36,14 @@ TEST(LightAPI, load) {
     data[i] = i;
   }

-  std::vector<std::string> inputs = predictor.GetInputNames();
+  predictor.PrepareFeedFetch();
+  const std::vector<std::string>& inputs = predictor.GetInputNames();
   LOG(INFO) << "input size: " << inputs.size();
   for (int i = 0; i < inputs.size(); i++) {
     LOG(INFO) << "inputnames: " << inputs[i];
   }
-  std::vector<std::string> outputs = predictor.GetOutputNames();
+  const std::vector<std::string>& outputs = predictor.GetOutputNames();
   for (int i = 0; i < outputs.size(); i++) {
     LOG(INFO) << "outputnames: " << outputs[i];
   }
lite/api/paddle_api.h

@@ -75,9 +75,9 @@ class LITE_API PaddlePredictor {
   virtual std::string GetVersion() const = 0;

   // Get input names
-  virtual std::vector<std::string> GetInputNames() = 0;
+  virtual const std::vector<std::string>& GetInputNames() = 0;
   // Get output names
-  virtual std::vector<std::string> GetOutputNames() = 0;
+  virtual const std::vector<std::string>& GetOutputNames() = 0;

   // Get Input by name
   virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0;
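GetInputNames()/GetOutputNames() now return a reference to the predictor's internal name vector instead of a copy. Callers that bind with auto& (as the updated tests below do) avoid the copy; a plain auto still copies. A small self-contained sketch of the difference, using a hypothetical stand-in class rather than the real PaddlePredictor:

#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in with the new-style accessor.
class FakePredictor {
 public:
  const std::vector<std::string>& GetInputNames() const { return input_names_; }

 private:
  std::vector<std::string> input_names_{"image", "im_info"};
};

int main() {
  FakePredictor predictor;
  auto& by_ref = predictor.GetInputNames();   // no copy; tracks the predictor
  auto by_value = predictor.GetInputNames();  // still legal: explicit copy
  std::cout << by_ref.size() << " " << by_value.size() << "\n";  // 2 2
  return 0;
}

One caveat of the new contract: the returned reference must not outlive the predictor that produced it.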
lite/api/paddle_api_test.cc

@@ -37,12 +37,12 @@ TEST(CxxApi, run) {
   LOG(INFO) << "Version: " << predictor->GetVersion();

-  std::vector<std::string> inputs = predictor->GetInputNames();
+  auto& inputs = predictor->GetInputNames();
   LOG(INFO) << "input size: " << inputs.size();
   for (int i = 0; i < inputs.size(); i++) {
     LOG(INFO) << "inputnames: " << inputs[i];
   }
-  std::vector<std::string> outputs = predictor->GetOutputNames();
+  auto& outputs = predictor->GetOutputNames();
   for (int i = 0; i < outputs.size(); i++) {
     LOG(INFO) << "outputnames: " << outputs[i];
   }

@@ -76,14 +76,14 @@ TEST(LightApi, run) {
   auto predictor = lite_api::CreatePaddlePredictor(config);

-  std::vector<std::string> inputs = predictor->GetInputNames();
+  auto& inputs = predictor->GetInputNames();
   LOG(INFO) << "input size: " << inputs.size();
   for (int i = 0; i < inputs.size(); i++) {
-    LOG(INFO) << "inputnames: " << inputs[i];
+    LOG(INFO) << "inputnames: " << inputs.at(i);
   }
-  std::vector<std::string> outputs = predictor->GetOutputNames();
+  auto& outputs = predictor->GetOutputNames();
   for (int i = 0; i < outputs.size(); i++) {
-    LOG(INFO) << "outputnames: " << outputs[i];
+    LOG(INFO) << "outputnames: " << outputs.at(i);
   }

   LOG(INFO) << "Version: " << predictor->GetVersion();
lite/core/device_info.cc

@@ -35,6 +35,9 @@
 #include <sys/syscall.h>
 #include <unistd.h>
 #endif
+#ifdef LITE_WITH_ANDROID
+#include <sys/system_properties.h>
+#endif
 #if __APPLE__
 #include "TargetConditionals.h"
 #if LITE_WITH_IPHONE

@@ -218,6 +221,7 @@ void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
 #ifdef LITE_WITH_LINUX

 std::string get_cpu_name() {
+  std::string cpu_name;
   FILE* fp = fopen("/proc/cpuinfo", "rb");
   if (!fp) {
     return "";

@@ -229,12 +233,23 @@ std::string get_cpu_name() {
       break;
     }
     if (strstr(line, "Hardware") != NULL) {
-      fclose(fp);
-      return std::string(line);
+      cpu_name = std::string(line);
     }
   }
+#ifdef LITE_WITH_ANDROID
+  // cpu name concat board name, platform name and chip name
+  char board_name[128];
+  char platform_name[128];
+  char chip_name[128];
+  __system_property_get("ro.product.board", board_name);
+  __system_property_get("ro.board.platform", platform_name);
+  __system_property_get("ro.chipname", chip_name);
+  cpu_name =
+      cpu_name + "_" + board_name + "_" + platform_name + "_" + chip_name;
+#endif
+  std::transform(cpu_name.begin(), cpu_name.end(), cpu_name.begin(), ::toupper);
   fclose(fp);
-  return "";
+  return cpu_name;
 }

 int get_min_freq_khz(int cpuid) {

@@ -780,7 +795,9 @@ bool DeviceInfo::SetCPUInfoByName() {
     cluster_ids_ = {0, 0, 0, 0};
     SetArchInfo(1, kA53);
     return true;
-  } else if (dev_name_.find("KIRIN980") != std::string::npos) {
-    // Kirin 980
+  } else if (dev_name_.find("KIRIN980") != std::string::npos ||
+             dev_name_.find("KIRIN990") != std::string::npos) {
+    // Kirin 980, Kirin 990
     core_num_ = 8;
     core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
     big_core_ids_ = {4, 5, 6, 7};

@@ -1109,7 +1126,8 @@ void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
 }

 bool DeviceInfo::ExtendWorkspace(size_t size) {
-  workspace_.Resize({size + llc_size()});
+  workspace_.Resize(
+      {static_cast<int64_t>(size + static_cast<size_t>(llc_size()))});
   return workspace_.mutable_data<int8_t>() != nullptr;
 }
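On Android, get_cpu_name() now returns "<Hardware line>_<ro.product.board>_<ro.board.platform>_<ro.chipname>", uppercased, so SetCPUInfoByName() can match a substring such as "KIRIN990" anywhere in the combined name. A host-side sketch of just the normalization step (the property values are hard-coded assumptions; on a device they would come from __system_property_get):

#include <algorithm>
#include <iostream>
#include <string>

int main() {
  // Stand-ins for the /proc/cpuinfo Hardware line and the three properties.
  std::string cpu_name = "Hardware\t: Kirin990";
  std::string board = "kirin990", platform = "kirin990", chip = "kirin990";

  cpu_name = cpu_name + "_" + board + "_" + platform + "_" + chip;
  std::transform(cpu_name.begin(), cpu_name.end(), cpu_name.begin(), ::toupper);

  // SetCPUInfoByName() then checks dev_name_.find("KIRIN990") != npos.
  std::cout << (cpu_name.find("KIRIN990") != std::string::npos) << "\n";  // 1
  return 0;
}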
lite/core/types.cc

@@ -82,6 +82,10 @@ Type StdTypeToRepr<double>() {
   return Type::_float64;
 }
 template <>
+Type StdTypeToRepr<std::vector<char>>() {
+  return Type::_char_list;
+}
+template <>
 Type StdTypeToRepr<std::string>() {
   return Type::_string;
 }
lite/core/types.h

@@ -16,6 +16,7 @@
 #include <stack>
 #include <string>
+#include <vector>
 #include "lite/api/paddle_place.h"
 #include "lite/utils/all.h"

@@ -36,7 +37,9 @@ enum class Type {
   _float64,
   _bool,
   _string,
-  // primary list types
+  // primary list type
+  _char_list,
   // list types
   _list,
   // enum type
   _enum,

@@ -89,6 +92,8 @@ Type StdTypeToRepr<float>();
 template <>
 Type StdTypeToRepr<bool>();
 template <>
+Type StdTypeToRepr<std::vector<char>>();
+template <>
 Type StdTypeToRepr<std::string>();

 // Factors that impact the kernel picking strategy. Multiple factors can be
lite/kernels/arm/conv_compute.cc

@@ -39,6 +39,13 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
   int pad = param.paddings[0];
   int stride = param.strides[0];
+  int chin = param.x->dims()[1];
+  int hin = param.x->dims()[2];
+  int win = param.x->dims()[3];
+  int chout = param.output->dims()[1];
+  int hout = param.output->dims()[2];
+  int wout = param.output->dims()[3];
+
   bool kps_equal = (param.paddings[0] == param.paddings[1]) &&
                    (param.strides[0] == param.strides[1]) && (kw == kh);
   bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1);

@@ -54,7 +61,7 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
     VLOG(3) << "invoking dw conv";
   } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal &&
              no_dilation) {
-    if (ic >= 32 && oc >= 32) {
+    if (ic >= 32 && oc >= 32 && hout > 16 && wout > 16) {
       /// winograd conv impl
       impl_ = new WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>;
       VLOG(3) << "invoking winograd conv";

@@ -63,8 +70,8 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
       impl_ = new DirectConv<PRECISION(kFloat), PRECISION(kFloat)>;
       VLOG(3) << "invoking direct conv";
     }
-  } else if (param.groups == 1 && kw == 3 && stride == 2 && kps_equal &&
-             no_dilation) {
+  } else if (param.groups == 1 && kw == 3 && stride == 2 &&
+             chin * chout < 4 * hin * win && kps_equal && no_dilation) {
     /// direct conv impl
     impl_ = new DirectConv<PRECISION(kFloat), PRECISION(kFloat)>;
     VLOG(3) << "invoking direct conv";
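The winograd path is now gated on the output plane as well: 3x3/stride-1 convolutions with enough channels but a small output (hout <= 16 or wout <= 16) fall back to direct conv. A worked check of the old vs. new condition under assumed dimensions:

#include <iostream>

int main() {
  // Hypothetical 3x3, stride-1 conv; numbers chosen only for illustration.
  int ic = 64, oc = 64;      // enough channels for winograd either way
  int hout = 14, wout = 14;  // small output plane, e.g. a late network stage

  bool old_rule = (ic >= 32 && oc >= 32);
  bool new_rule = (ic >= 32 && oc >= 32 && hout > 16 && wout > 16);
  std::cout << "old picks winograd: " << old_rule << "\n";  // 1
  std::cout << "new picks winograd: " << new_rule << "\n";  // 0 -> direct conv
  return 0;
}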
lite/kernels/cuda/CMakeLists.txt

@@ -32,6 +32,8 @@ nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_c
 nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda)
 nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute_cuda)
 nv_test(elementwise_add_compute_cuda_test SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_cuda)
+nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda)
+nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda)
 #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda)
 nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda)
 nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda)
lite/kernels/cuda/pool_compute_test.cc

@@ -194,9 +194,9 @@ TEST(pool_cuda, compute) {
   for (auto stride : {1, 2}) {
     for (auto pad : {0, 1}) {
       for (auto n : {1, 2}) {
-        for (auto c : {1, 3, 256}) {
-          for (auto h : {2, 3, 4, 6, 13}) {
-            for (auto w : {2, 3, 4, 6, 13}) {
+        for (auto c : {1, 3}) {
+          for (auto h : {2, 3, 4, 11}) {
+            for (auto w : {2, 3, 4, 11}) {
               VLOG(3) << "n:" << n << " c:" << c << " h:" << h << " w:" << w
                       << " ksize:" << ksize << " stride:" << stride
                       << " pad:" << pad
lite/kernels/cuda/softmax_compute_test.cc (new file, mode 100644)

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/cuda/softmax_compute.h"
#include <gtest/gtest.h>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>

namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {

using Tensor = lite::Tensor;
using DDim = lite::DDim;

template <typename dtype>
static void softmax_compute_ref(const operators::SoftmaxParam& param) {
  const dtype* x_data = param.x->mutable_data<const dtype>();
  dtype* output_data = param.output->mutable_data<dtype>();
  DDim x_dims = param.x->dims();
  ASSERT_EQ(x_dims.data(), param.output->dims().data());
  auto x_rank = x_dims.size();
  int axis = param.axis;
  if (axis < 0) {
    axis += x_rank;
  }
  int axis_size = x_dims[axis];
  int outer_num = x_dims.Slice(0, axis).production();
  int inner_num = x_dims.Slice(axis + 1, x_rank).production();
  int compute_size = outer_num * inner_num;
  for (int i = 0; i < compute_size; i++) {
    int idx_inner = i % inner_num;
    int idx_outer = (i / inner_num) * axis_size;
    int start = idx_outer * inner_num + idx_inner;
    int offset;

    offset = start;
    dtype max_data = std::numeric_limits<dtype>::lowest();
    for (int j = 0; j < axis_size; j++) {
      max_data = x_data[offset] > max_data ? x_data[offset] : max_data;
      offset += inner_num;
    }

    offset = start;
    dtype sum_data = (dtype)0;
    for (int j = 0; j < axis_size; j++) {
      output_data[offset] = exp(x_data[offset] - max_data);
      sum_data += output_data[offset];
      offset += inner_num;
    }

    offset = start;
    for (int j = 0; j < axis_size; j++) {
      output_data[offset] /= sum_data;
      offset += inner_num;
    }
  }
}

TEST(softmax_cuda, compute) {
  std::unique_ptr<KernelContext> ctx(new KernelContext);
  auto& context = ctx->As<CUDAContext>();
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  context.SetExecStream(stream);

  SoftmaxCompute softmax;
  operators::SoftmaxParam param;
  softmax.SetContext(std::move(ctx));
  lite::Tensor x;
  lite::Tensor x_cpu;
  lite::Tensor output;
  lite::Tensor output_cpu;
  lite::Tensor output_ref;
  for (auto n : {1, 3}) {
    for (auto c : {1, 4}) {
      for (auto h : {5, 1, 112}) {
        for (auto w : {1, 6, 112}) {
          for (auto axis : {-2, -1, 0, 1, 2}) {
            x.Resize({n, c, h, w});
            x_cpu.Resize({n, c, h, w});
            output.Resize({n, c, h, w});
            output_cpu.Resize({n, c, h, w});
            output_ref.Resize({n, c, h, w});
            auto* x_cpu_data = x_cpu.mutable_data<float>();
            auto* output_data = output.mutable_data<float>(TARGET(kCUDA));
            auto* output_cpu_data = output_ref.mutable_data<float>();
            auto* output_ref_data = output_ref.mutable_data<float>();
            for (int i = 0; i < x.dims().production(); i++) {
              x_cpu_data[i] = i;
            }
            x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data,
                                                       x_cpu.dims());
            param.x = &x;
            param.axis = axis;
            param.output = &output;
            softmax.SetParam(param);
            softmax.Launch();
            param.x = &x_cpu;
            param.output = &output_ref;
            softmax_compute_ref<float>(param);
            cudaDeviceSynchronize();
            CopySync<TARGET(kCUDA)>(output_cpu_data,
                                    output_data,
                                    sizeof(float) * output.numel(),
                                    IoDirection::DtoH);
            for (int i = 0; i < output.dims().production(); i++) {
              EXPECT_NEAR(output_cpu_data[i], output_ref_data[i], 1e-5);
            }
          }
        }
      }
    }
  }
}

}  // namespace cuda
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
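The reference implementation visits each of the outer_num * inner_num softmax groups once and strides through the axis with step inner_num. A small check of that index arithmetic for an assumed shape {1, 4, 5, 6} with axis = 1 (so axis_size = 4, outer_num = 1, inner_num = 30):

#include <cassert>

int main() {
  int axis_size = 4;      // dims[1]
  int inner_num = 5 * 6;  // product of dims after the axis

  int i = 7;  // one of the outer_num * inner_num = 30 groups
  int idx_inner = i % inner_num;                // 7
  int idx_outer = (i / inner_num) * axis_size;  // 0
  int start = idx_outer * inner_num + idx_inner;
  assert(start == 7);
  // The group then covers elements 7, 37, 67, 97 (offset += inner_num).
  return 0;
}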
lite/kernels/cuda/yolo_box_compute_test.cc

@@ -89,7 +89,7 @@ inline static void calc_label_score(float* scores,
 template <typename T>
 static void YoloBoxRef(const T* input,
-                       const T* imgsize,
+                       const int* imgsize,
                        T* boxes,
                        T* scores,
                        const float conf_thresh,

@@ -106,8 +106,8 @@ static void YoloBoxRef(const T* input,
   float box[4];
   for (int i = 0; i < n; i++) {
-    int img_height = static_cast<int>(imgsize[2 * i]);
-    int img_width = static_cast<int>(imgsize[2 * i + 1]);
+    int img_height = imgsize[2 * i];
+    int img_width = imgsize[2 * i + 1];
     for (int j = 0; j < an_num; j++) {
       for (int k = 0; k < h; k++) {

@@ -184,12 +184,12 @@ TEST(yolo_box, normal) {
   auto* scores_data = scores.mutable_data<float>(TARGET(kCUDA));
   float* x_cpu_data = x_cpu.mutable_data<float>();
-  float* sz_cpu_data = sz_cpu.mutable_data<float>();
+  int* sz_cpu_data = sz_cpu.mutable_data<int>();
   float* boxes_cpu_data = boxes_cpu.mutable_data<float>();
   float* scores_cpu_data = scores_cpu.mutable_data<float>();
   float* x_ref_data = x_ref.mutable_data<float>();
-  float* sz_ref_data = sz_ref.mutable_data<float>();
+  int* sz_ref_data = sz_ref.mutable_data<int>();
   float* boxes_ref_data = boxes_ref.mutable_data<float>();
   float* scores_ref_data = scores_ref.mutable_data<float>();

@@ -203,7 +203,7 @@ TEST(yolo_box, normal) {
   sz_ref_data[1] = 32;
   x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
-  sz.Assign<float, lite::DDim, TARGET(kCUDA)>(sz_cpu_data, sz_cpu.dims());
+  sz.Assign<int, lite::DDim, TARGET(kCUDA)>(sz_cpu_data, sz_cpu.dims());
   param.X = &x;
   param.ImgSize = &sz;
lite/model_parser/model_parser.cc

@@ -727,10 +727,8 @@ void LoadModelNaiveFromMemory(const std::string &model_buffer,
   // Load model
-  std::string prog_path = model_buffer;
   naive_buffer::BinaryTable table;
-  table.LoadFromMemory(prog_path.c_str(), prog_path.length());
+  table.LoadFromMemory(model_buffer.c_str(), model_buffer.length());
   naive_buffer::proto::ProgramDesc nb_proto_prog(&table);
   nb_proto_prog.Load();

@@ -742,8 +740,7 @@ void LoadModelNaiveFromMemory(const std::string &model_buffer,
   // Load Params
   // NOTE: Only main block be used now.
   // only combined Params are supported in Loading Model from memory
-  std::string combined_params_path = param_buffer;
-  LoadCombinedParamsNaive(combined_params_path, scope, *cpp_prog, true);
+  LoadCombinedParamsNaive(param_buffer, scope, *cpp_prog, true);

   VLOG(4) << "Load model from naive buffer memory successfully";
 }
lite/model_parser/naive_buffer/naive_buffer.h

@@ -126,6 +126,41 @@ using UInt64Builder = PrimaryBuilder<uint64_t>;
 using Float32Builder = PrimaryBuilder<float>;
 using Float64Builder = PrimaryBuilder<double>;

+template <typename Primary>
+class PrimaryListBuilder : public FieldBuilder {
+  std::vector<Primary> data_;
+
+ public:
+  using value_type = Primary;
+
+  explicit PrimaryListBuilder(BinaryTable* table) : FieldBuilder(table) {}
+  PrimaryListBuilder(BinaryTable* table, const std::vector<Primary>& val)
+      : FieldBuilder(table), data_(val) {}
+
+  /// Set data.
+  void set(const std::vector<Primary>& x) { data_ = x; }
+
+  const std::vector<Primary>& data() const { return data_; }
+
+  /// Save information to the corresponding BinaryTable.
+  void Save() override;
+
+  /// Load information from the corresponding BinaryTable.
+  void Load() override;
+
+  /// Number of elements.
+  size_t size() const { return data_.size(); }
+
+  Type type() const override {
+    return core::StdTypeToRepr<std::vector<Primary>>();
+  }
+
+  /// clear builder
+  void Clear() { data_.clear(); }
+
+  ~PrimaryListBuilder() = default;
+};
+
 /*
  * Builder for all the primary types. int32, float, bool and so on.
  */

@@ -344,6 +379,36 @@ void PrimaryBuilder<Primary>::Load() {
   table()->Consume(sizeof(value_type));
 }

+template <typename Primary>
+void PrimaryListBuilder<Primary>::Load() {
+  CHECK(data_.empty()) << "Duplicate load";
+  // Load number of elements first.
+  uint64_t num_elems{};
+  memcpy(&num_elems, table()->cursor(), sizeof(uint64_t));
+  table()->Consume(sizeof(uint64_t));
+
+  data_.resize(num_elems);
+  for (uint64_t i = 0; i < num_elems; i++) {
+    memcpy(&data_[i], table()->cursor(), sizeof(value_type));
+    table()->Consume(sizeof(value_type));
+  }
+}
+
+template <typename Primary>
+void PrimaryListBuilder<Primary>::Save() {
+  // store number of elements in the head.
+  uint64_t num_elems = size();
+  table()->Require(sizeof(uint64_t));
+  memcpy(table()->cursor(), &num_elems, sizeof(uint64_t));
+  table()->Consume(sizeof(uint64_t));
+
+  table()->Require(num_elems * sizeof(value_type));
+  memcpy(table()->cursor(),
+         reinterpret_cast<byte_t*>(&data_[0]),
+         num_elems * sizeof(value_type));
+  table()->Consume(num_elems * sizeof(value_type));
+}
+
 template <typename EnumType>
 void EnumBuilder<EnumType>::Save() {
   value_type holder = static_cast<value_type>(data_);
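PrimaryListBuilder writes a length-prefixed blob: a uint64_t element count followed by the raw elements, and Load() reads it back the same way. A simplified standalone round-trip over a plain byte buffer (the real BinaryTable Require/cursor/Consume bookkeeping is elided):

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Simplified model of the wire format:
// [uint64_t num_elems][num_elems * sizeof(T) raw bytes]
template <typename T>
std::vector<char> SaveList(const std::vector<T>& data) {
  uint64_t n = data.size();
  std::vector<char> buf(sizeof(uint64_t) + n * sizeof(T));
  std::memcpy(buf.data(), &n, sizeof(uint64_t));
  std::memcpy(buf.data() + sizeof(uint64_t), data.data(), n * sizeof(T));
  return buf;
}

template <typename T>
std::vector<T> LoadList(const std::vector<char>& buf) {
  uint64_t n = 0;
  std::memcpy(&n, buf.data(), sizeof(uint64_t));
  std::vector<T> data(n);
  std::memcpy(data.data(), buf.data() + sizeof(uint64_t), n * sizeof(T));
  return data;
}

int main() {
  std::vector<float> weights = {0.5f, -1.25f, 3.0f};
  auto restored = LoadList<float>(SaveList(weights));
  std::cout << restored[1] << "\n";  // -1.25
  return 0;
}

Compared with the old per-character ListBuilder<CharBuilder>, the bulk memcpy avoids creating one builder object per byte of tensor data, which is what the param_desc.cc change below takes advantage of.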
lite/model_parser/naive_buffer/param_desc.cc

@@ -149,15 +149,16 @@ void ParamDesc::SetDim(const std::vector<int64_t>& dim) {
   CHECK(GetDataType() == VarDescAPI::VarDataType::type__)                   \
       << "Data Type mismatch";                                              \
   std::vector<T> res;                                                       \
-  auto& data_builder = desc_->GetField<ListBuilder<CharBuilder>>("data");   \
-  auto data = RepeatedToVector<char, CharBuilder>(data_builder);            \
+  auto& data_builder = desc_->GetField<PrimaryListBuilder<char>>("data");   \
+  auto& data = data_builder.data();                                         \
   size_t size = data.size() / sizeof(T);                                    \
-  auto* data_ptr = reinterpret_cast<T*>(&data[0]);                          \
+  auto* data_ptr = reinterpret_cast<const T*>(&data[0]);                    \
   for (size_t i = 0; i < size; ++i) {                                       \
     res.push_back(data_ptr[i]);                                             \
   }                                                                         \
   return res;                                                               \
 }

 GET_DATA_IMPL(uint8_t, UINT8);
 GET_DATA_IMPL(int8_t, INT8);
 GET_DATA_IMPL(int16_t, INT16);

@@ -172,14 +173,13 @@ GET_DATA_IMPL(double, FP64);
   CHECK(GetDataType() == VarDescAPI::VarDataType::type__)                   \
       << "Data Type mismatch, call SetDataType first.";                     \
   auto* data_builder =                                                      \
-      desc_->GetMutableField<ListBuilder<CharBuilder>>("data");             \
+      desc_->GetMutableField<PrimaryListBuilder<char>>("data");             \
   CHECK(data_builder);                                                      \
   data_builder->Clear();                                                    \
   size_t size = size__ * sizeof(T);                                         \
   auto* data_ptr = reinterpret_cast<const char*>(data_ptr__);               \
-  for (size_t i = 0; i < size; ++i) {                                       \
-    data_builder->New()->set(data_ptr[i]);                                  \
-  }
+  std::vector<char> data_vec(data_ptr, data_ptr + size);                    \
+  data_builder->set(data_vec);

 #define SET_DATA_IMPL(T, type__)                                            \
   template <>                                                               \
lite/model_parser/naive_buffer/proto/framework.nb.h

@@ -191,7 +191,7 @@ class ParamDesc : public StructBuilder {
     New<lod_type>("lod");
     NewUInt32("tensor_version");
     New<TensorDesc>("tensor_desc");
-    New<ListBuilder<CharBuilder>>("data");
+    New<PrimaryListBuilder<char>>("data");
   }
 };
lite/operators/conv_op.cc

@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "lite/operators/conv_op.h"
+#include <algorithm>
 #include <vector>
 #include "lite/core/op_registry.h"

@@ -51,10 +52,41 @@ inline int ConvOutputSize(
   return output_size;
 }

+inline void UpdatePaddingAndDilation(std::vector<int>* paddings,
+                                     std::vector<int>* dilations,
+                                     const std::vector<int>& strides,
+                                     const std::string padding_algorithm,
+                                     const lite::DDim data_dims,
+                                     const lite::DDim& ksize) {
+  // when padding_desc is "VALID" or "SAME"
+  if (padding_algorithm == "SAME") {
+    for (size_t i = 0; i < strides.size(); ++i) {
+      int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i];
+      int pad_sum = std::max(
+          (out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2],
+          (int64_t)0);
+      // pad
+      *(paddings->begin() + i) = pad_sum / 2;
+      // dilation
+      *(dilations->begin() + i) = 1;
+    }
+  } else if (padding_algorithm == "VALID") {
+    for (auto& it : *paddings) {
+      it = 0;
+    }
+  }
+}
+
 bool ConvOpLite::InferShape() const {
   const auto in_dims = param_.x->dims();
   const auto filter_dims = param_.filter->dims();

+  UpdatePaddingAndDilation(&param_.paddings,
+                           &param_.dilations,
+                           param_.strides,
+                           padding_algorithm_,
+                           in_dims,
+                           filter_dims);
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < param_.strides.size(); ++i) {
     output_shape.push_back(ConvOutputSize(in_dims[i + 2],
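For "SAME", the helper sizes the output as ceil(input / stride) and derives the total padding needed to hit that size. Worked numbers for one spatial dimension, assuming data_dims[i + 2] = 7, strides[i] = 2, ksize[i] = 3:

#include <algorithm>
#include <iostream>

int main() {
  int in = 7, stride = 2, k = 3;              // illustrative dimensions only
  int out_size = (in + stride - 1) / stride;  // ceil(7 / 2) = 4
  int pad_sum = std::max((out_size - 1) * stride + k - in, 0);  // 6 + 3 - 7 = 2
  std::cout << "pad per side: " << pad_sum / 2 << "\n";         // 1
  return 0;
}

"VALID" simply zeroes all paddings; dilations are reset to 1 only on the "SAME" path, as in the diff above.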
lite/operators/conv_op.h

@@ -93,6 +93,10 @@ class ConvOpLite : public OpLite {
             << "The fused conv only supports fuse with relu and leaky relu";
       }
     }
+    if (op_desc.HasAttr("padding_algorithm")) {
+      padding_algorithm_ = op_desc.GetAttr<std::string>("padding_algorithm");
+    }
+
     // For Int8
     if (op_desc.HasAttr("enable_int8")) {
       param_.enable_int8 = op_desc.GetAttr<bool>("enable_int8");

@@ -114,6 +118,7 @@ class ConvOpLite : public OpLite {
  private:
   mutable ConvParam param_;
+  std::string padding_algorithm_{""};
 };

 }  // namespace operators